
    <hL                       S r SSKrSSKJr  SSKJrJrJr  SSKrSSK	J
r
  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  \R<                  " \5      r  " S S\
RB                  5      r" " S S\
RB                  5      r# " S S\
RB                  5      r$ " S S\
RB                  5      r% " S S\
RB                  5      r& " S S\
RB                  5      r'   SyS\
RB                  S\RP                  S\RP                  S\RP                  S\\RP                     S \\)   S!\)S"\\RP                     4S# jjr* " S$ S%\
RB                  5      r+ " S& S'\
RB                  5      r, " S( S)\
RB                  5      r- " S* S+\
RB                  5      r. " S, S-\
RB                  5      r/ " S. S/\
RB                  5      r0 " S0 S1\
RB                  5      r1\ " S2 S3\5      5       r2 " S4 S5\
RB                  5      r3   SzS6\RP                  S7\)S8\\4   S9\5S:\64
S; jjr7  S{S6\RP                  S<\\4\64   S8\\4   S:\64S= jjr8 " S> S?\
RB                  5      r9 " S@ SA\
RB                  5      r: " SB SC\
RB                  5      r; " SD SE\
RB                  5      r< " SF SG\
RB                  5      r=\\" SHSI9 " SJ SK\5      5       5       r> " SL SM\25      r?\\" SNSI9 " SO SP\5      5       5       r@\" SQSI9 " SR SS\25      5       rA\\" STSI9 " SU SV\5      5       5       rB\" SWSI9 " SX SY\25      5       rC\\" SZSI9 " S[ S\\5      5       5       rD\\" S]SI9 " S^ S_\5      5       5       rE\\" S]SI9 " S` Sa\5      5       5       rFSb\R                  R                  Sc\RP                  Sd\RP                  4Se jrIS|Sf\RP                  Sg\\RP                     Sd\RP                  4Sh jjrJ " Si Sj\25      rK\\" SkSI9 " Sl Sm\5      5       5       rL " Sn So\25      rM\\" SpSI9 " Sq Sr\5      5       5       rN " Ss St\
RB                  5      rO\" SuSI9 " Sv Sw\25      5       rP/ SxQrQg)}zPyTorch PatchTSMixer model.    N)	dataclass)CallableOptionalUnion)PreTrainedModel)ModelOutput   )FlashAttentionKwargs)ALL_ATTENTION_FUNCTIONS)Unpack)NegativeBinomialOutputNormalOutputStudentTOutput)auto_docstringlogging   )PatchTSMixerConfigc                   >   ^  \ rS rSrSrS\S\4U 4S jjrS rSrU =r	$ )PatchTSMixerGatedAttention&   z
Module that applies gated attention to input data.

Args:
    in_size (`int`): The input size.
    out_size (`int`): The output size.
in_sizeout_sizec                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " SS9U l        g )Ndim)super__init__nnLinear
attn_layerSoftmaxattn_softmax)selfr   r   	__class__s      n/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/modeling_patchtsmixer.pyr   #PatchTSMixerGatedAttention.__init__/   s/    ))G6JJ2.    c                 N    U R                  U R                  U5      5      nX-  nU$ N)r#   r!   )r$   inputsattn_weights      r&   forward"PatchTSMixerGatedAttention.forward4   s(    ''(?@%r(   )r!   r#   )
__name__
__module____qualname____firstlineno____doc__intr   r-   __static_attributes____classcell__r%   s   @r&   r   r   &   s%    / /s /
 r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerBatchNorm;   zH
Compute batch normalization over the sequence length (time) dimension.
configc                 ~   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        g )Neps)r   r   r   BatchNorm1dd_modelnorm_eps	batchnormr$   r;   r%   s     r&   r   PatchTSMixerBatchNorm.__init__@   s(    FOOLr(   r+   c                 l    UR                  SS5      nU R                  U5      nUR                  SS5      $ )z
Parameters:
    inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
        input for Batch norm calculation
Returns:
    `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
r      )	transposerB   )r$   r+   outputs      r&   r-   PatchTSMixerBatchNorm.forwardD   s7     !!!Q''1%%r(   )rB   r/   r0   r1   r2   r3   r   r   torchTensorr-   r5   r6   r7   s   @r&   r9   r9   ;   s,    M1 M
&ell 
& 
&r(   r9   c                      ^  \ rS rSrSrS\4U 4S jjr\S\S\R                  4S j5       r
S\R                  4S jrS	rU =r$ )
PatchTSMixerPositionalEncodingQ   z
Class for positional encoding
r;   c                    > [         TU ]  5         UR                  (       a  U R                  U5      U l        g [
        R                  " [        R                  " UR                  UR                  5      5      U l        g r*   )r   r   use_positional_encoding_init_peposition_encr   	ParameterrK   zerosnum_patchesr@   rC   s     r&   r   'PatchTSMixerPositionalEncoding.__init__V   sN    )) $f 5D "U[[9K9KV^^-\ ]Dr(   returnc                 d   U R                   S:X  a@  [        R                  " [        R                  " U R
                  U R                  5      SS9nU$ U R                   S:X  Ga8  [        R                  " U R
                  U R                  5      n[        R                  " SU R
                  5      R                  S5      n[        R                  " [        R                  " SU R                  S5      [        R                  " S5      U R                  -  * -  5      n[        R                  " X#-  5      US S 2SS S24'   [        R                  " X#-  5      US S 2SS S24'   XR                  5       -
  nXR!                  5       S	-  -  n[        R                  " US
S9nU$ [#        U R                    S35      e)NrandomTrequires_gradsincosr   r   rF   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)positional_encoding_typer   rT   rK   randnrV   r@   rU   arange	unsqueezeexpmathlogsincosmeanstd
ValueError)r;   rS   positiondiv_terms       r&   rR   'PatchTSMixerPositionalEncoding._init_pe^   sn    **h6<<F4F4F(WgklL  ,,8 ;;v'9'96>>JL||Av'9'9:DDQGHyya!CQXHY\b\j\jHjFk!klH$)IIh.A$BLADqD!$)IIh.A$BLADqD!'*;*;*==L'+;+;+=+BCL<<EJL
  223  4B  C r(   patch_inputc                 "    XR                   -   nU$ r*   rS   )r$   rn   hidden_states      r&   r-   &PatchTSMixerPositionalEncoding.forwardr   s    "%6%66r(   rp   )r/   r0   r1   r2   r3   r   r   staticmethodr   rT   rR   rK   rL   r-   r5   r6   r7   s   @r&   rN   rN   Q   sS    ^1 ^ +   &5<<  r(   rN   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerNormLayerx   zUNormalization block

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                    > [         TU ]  5         UR                  U l        SUR                  R                  5       ;   a  [	        U5      U l        g [        R                  " UR                  UR                  S9U l        g )Nbatchr=   )
r   r   norm_mlplowerr9   normr   	LayerNormr@   rA   rC   s     r&   r   PatchTSMixerNormLayer.__init__   sT    foo++---f5DIV^^IDIr(   r+   c                 l   SU R                   R                  5       ;   a  [        R                  " UUR                  S   UR                  S   -  UR                  S   UR                  S   45      nU R                  U5      n[        R                  " X!R                  5      nU$ U R                  U5      nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        Input to the normalization layer.
Returns:
    `torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`
rx   r   r   rF   r	   )ry   rz   rK   reshapeshaper{   )r$   r+   inputs_reshapeds      r&   r-   PatchTSMixerNormLayer.forward   s     dmm))++#mmLLOfll1o5LLOLLOO #ii8O ]]?LLAF
  YYv&Fr(   )r{   ry   rJ   r7   s   @r&   ru   ru   x   s,    J1 Jell  r(   ru   c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )PatchTSMixerMLP   c                 >  > [         TU ]  5         XR                  -  n[        R                  " X5      U l        [        R                  " UR                  5      U l        [        R                  " XB5      U l	        [        R                  " UR                  5      U l
        g r*   )r   r   expansion_factorr   r    fc1Dropoutdropoutdropout1fc2dropout2)r$   in_featuresout_featuresr;   
num_hiddenr%   s        r&   r   PatchTSMixerMLP.__init__   sd     #:#::
99[5

6>>299Z6

6>>2r(   r+   c                     U R                  [        R                  R                  U R	                  U5      5      5      nU R                  U5      nU R                  U5      nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        Input to the MLP layer.
Returns:
    `torch.Tensor` of the same shape as `inputs`
)r   r   
functionalgelur   r   r   )r$   r+   s     r&   r-   PatchTSMixerMLP.forward   sK     r}}11$((62BCD&!v&r(   )r   r   r   r   )
r/   r0   r1   r2   r   rK   rL   r-   r5   r6   r7   s   @r&   r   r      s    3ell  r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )$PatchTSMixerChannelFeatureMixerBlock   zzThis module mixes the features in the channel dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                   > [         TU ]  5         [        U5      U l        UR                  U l        [        UR                  UR                  US9U l        UR                  (       a$  [        UR                  UR                  S9U l	        g g Nr   r   r;   r   r   )
r   r   ru   r{   
gated_attnr   num_input_channelsmlpr   gating_blockrC   s     r&   r   -PatchTSMixerChannelFeatureMixerBlock.__init__   sv    )&1	 ++"1122
  :11F<U<U!D r(   r+   c                     UnU R                  U5      nUR                  SSSS5      nU R                  (       a  U R                  U5      nU R	                  U5      nUR                  SSSS5      nX-   nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        input to the MLP layer
Returns:
    `torch.Tensor` of the same shape as `inputs`
r   r	   rF   r   )r{   permuter   r   r   )r$   r+   residualouts       r&   r-   ,PatchTSMixerChannelFeatureMixerBlock.forward   sq     6"1a+??&&v.F&!1a+
r(   r   r   r   r{   rJ   r7   s   @r&   r   r      s*    1  ell  r(   r   modulequerykeyvalueattention_maskscalingr   	head_maskc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  n	Ub  X-   n	[        R
                  R                  U	SS9n	Ub  XR                  SSSS5      -  n	[        R
                  R                  XU R                  S9n	[        R                  " X5      n
U
R                  SS5      R                  5       n
X4$ )Nr         rF   r	   r   r   )ptraining)sizerK   matmulrG   r   r   softmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r&   eager_attention_forwardr      s     **R.D(<<}}Q':;gEL!#4==((2(>L#nnQAq&AA==((6??([L,,|3K''1-88:K$$r(   c                   Z  ^  \ rS rSrSr     SS\S\S\S\S\S\S	\\	   4U 4S
 jjjr
    SS\R                  S\\R                     S\\R                     S\\R                     S\\   S\\   S\\R                  \\R                     \\\R                        4   4S jjrSrU =r$ )PatchTSMixerAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   
is_decoderbias	is_causalr;   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).r   )r   )r   r   r   r   r   head_dimr;   rj   r   r   r   r   r    k_projv_projq_projout_proj)	r$   r   r   r   r   r   r   r;   r%   s	           r&   r   PatchTSMixerAttention.__init__  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr(   hidden_stateskey_value_statesr   layer_head_maskoutput_attentionsr   rX   c                     USLnUR                   SS u  pU(       a  UR                   S   OU	n
XSU R                  4nXSU R                  4nU R                  U5      R                  " U6 R	                  SS5      nU(       a  UOUnU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UUUU4U R                  (       d  SOU R                  U R                  UUS.UD6u  nnUR                  XS5      R                  5       nU R!                  U5      nUUS4$ )z#Input shape: Batch x Time x ChannelNr   r   rF   eager        )r   r   r   r   )r   r   r   r   rG   r   r   r   r;   _attn_implementationr   r   r   r   r   r   r   )r$   r   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r&   r-   PatchTSMixerAttention.forward2  s    .T9 %**3B//A"((+wr4==9DMM: {{=166FPPQRTUV-?)][[055~FPPQRTUV
{{>277HRRSTVWX(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#;FFHmmK0L$..r(   )r;   r   r   r   r   r   r   r   r   r   r   r   )r   FTFN)NNNF)r/   r0   r1   r2   r3   r4   floatboolr   r   r   rK   rL   r   r
   tupler-   r5   r6   r7   s   @r&   r   r     s!   G  /3CC C 	C
 C C C +,C CD 481526,13/||3/ #5<<03/ !.	3/
 "%,,/3/ $D>3/ -.3/ 
u||Xell3XeELL>Q5RR	S3/ 3/r(   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )PatchMixerBlockih  zhThis module mixes the patch dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                   > [         TU ]  5         [        U5      U l        UR                  U l        UR
                  U l        [        UR                  UR                  US9U l        UR
                  (       a#  [        UR                  UR                  S9U l
        UR                  (       a@  [        UR                  UR                  UR                  US9U l        [        U5      U l        g g )Nr   r   )r   r   r   r;   )r   r   ru   r{   	self_attnr   r   rV   r   r   r   r   r@   self_attn_headsr   self_attn_layer	norm_attnrC   s     r&   r   PatchMixerBlock.__init__p  s    )&1	)) ++"**++
  :6CUCU`f`r`r sD#8 .. 00	$D  36:DN r(   c                    UnU R                  U5      nU R                  (       aI  UR                  u  p4pVUR                  X4-  XV5      nU R	                  USS9u  n  n	UR                  X4XV5      nUR                  SS5      nU R                  U5      nU R                  (       a  U R                  U5      nUR                  SS5      nU R                  (       a  U R                  UW-   5      nX-   n
U
$ )zj
Args:
    hidden_state (`torch.Tensor`): Input tensor.

Returns:
    `torch.Tensor`: Transformed tensor.
F)r   rF   r	   )
r{   r   r   r   r   rG   r   r   r   r   )r$   rq   r   
batch_sizen_varsrV   r@   hidden_state_reshapedx_attn_r   s              r&   r-   PatchMixerBlock.forward  s      yy.>>7C7I7I4J$0$8$89Lk$c!//0EY^/_LFAq^^JMF $--a3xx-??,,\:L $--a3>>>>,*?@L%
r(   )r   r   r   r{   r   r   r   
r/   r0   r1   r2   r3   r   r   r-   r5   r6   r7   s   @r&   r   r   h  s    ;1 ;4! !r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )FeatureMixerBlocki  zrThis module mixes the hidden feature dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

r;   c                   > [         TU ]  5         [        U5      U l        UR                  U l        [        UR                  UR                  US9U l        UR                  (       a$  [        UR                  UR                  S9U l	        g g r   )
r   r   ru   r{   r   r   r@   r   r   r   rC   s     r&   r   FeatureMixerBlock.__init__  sn    )&1	 ++"
  :6>>\b\j\j kD r(   hiddenc                     UnU R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX-   nU$ )
Args:
    hidden (`torch.Tensor` of shape `(batch_size, num_patches, d_model)`):
        Input tensor to the layer.

Returns:
    `torch.Tensor`: Transformed tensor.
)r{   r   r   r   )r$   r   r   r   s       r&   r-   FeatureMixerBlock.forward  sI     6"&!??&&v.F
r(   r   rJ   r7   s   @r&   r   r     s,    l1 l ell  r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerLayeri  z
The `PatchTSMixer` layer that does all three kinds of mixing.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

r;   c                    > [         TU ]  5         [        US9U l        [	        US9U l        UR                  U l        UR                  S:X  a  [        US9U l        g g )Nr;   mix_channel)	r   r   r   patch_mixerr   feature_mixermoder   channel_feature_mixerrC   s     r&   r   PatchTSMixerLayer.__init__  sR    *&9.f=KK	;;-')MU[)\D& (r(   r   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU$ )r   r   )r  r  r   r   )r$   r   s     r&   r-   PatchTSMixerLayer.forward  sE     99%//7F!!&)##F+r(   )r  r   r  r   rJ   r7   s   @r&   r   r     s,    	]1 	]ell  r(   r   c                   F   ^  \ rS rSrSrS\4U 4S jjrSS\4S jjrSr	U =r
$ )	PatchTSMixerBlocki  z{The main computing framework of the `PatchTSMixer` model.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c           	         > [         TU ]  5         UR                  n[        R                  " [        U5       Vs/ sH  n[        US9PM     sn5      U l        g s  snf Nr   )r   r   
num_layersr   
ModuleListranger   mixers)r$   r;   r
  r   r%   s       r&   r   PatchTSMixerBlock.__init__	  sI    &&
mmuU_O`$aO`!%6f%EO`$ab$as   Aoutput_hidden_statesc                     / nUnU R                    H%  nU" U5      nU(       d  M  UR                  U5        M'     U(       a  XC4$ US4$ )a3  
Args:
    hidden_state (`torch.Tensor`): The input tensor.
    output_hidden_states (`bool`, *optional*, defaults to False.):
        Whether to output the hidden states as well.

Returns:
    `torch.Tensor`: The embedding. `list`: List of all hidden states if `output_hidden_states` is set to
    `True`.
N)r  append)r$   rq   r  all_hidden_states	embeddingmods         r&   r-   PatchTSMixerBlock.forward  sR      	;;CII##!((3 
  //d?"r(   )r  F)r/   r0   r1   r2   r3   r   r   r   r-   r5   r6   r7   s   @r&   r  r    s(    c1 c#$ # #r(   r  c                   >   ^  \ rS rSrSrSS\4U 4S jjjrS rSrU =r	$ )PatchTSMixerForPredictionHeadi*  zaPrediction Head for Forecasting

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                   > [         TU ]  5         UR                  U l        U R                  b  U R                  R                  5         [        R
                  " UR                  5      U l        Uc>  [        R                  " UR                  UR                  -  UR                  5      U l        O-UR                  UR                  UR                  -  5      U l        [        R                  " SS9U l        g )N	start_dim)r   r   prediction_channel_indicessortr   r   head_dropoutdropout_layerr    rV   r@   prediction_lengthbase_forecast_blockget_parameter_projectionFlattenflatten)r$   r;   distribution_outputr%   s      r&   r   &PatchTSMixerForPredictionHead.__init__2  s    *0*K*K'**6++002ZZ(;(;<&')yy&2D2Dv~~2UX^XpXp'qD$':'S'S""V^^3(D$ zzB/r(   c                 v  ^  T R                  U5      nT R                  U5      nT R                  U5      n[        U[        5      (       a  [	        S U 5       5      nOUR                  SS5      nT R                  b=  [        U[        5      (       a  [	        U 4S jU 5       5      nU$ UST R                  4   nU$ )a:  

Args:
    hidden_features (`torch.Tensor` of shape `(batch_size, num_patch, d_model)` in `flatten` mode
        or `(batch_size, n_vars, num_patch, d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size, prediction_length, nvars)`.

c              3   B   #    U H  oR                  S S5      v   M     g7f)r   r  N)rG   ).0zs     r&   	<genexpr>8PatchTSMixerForPredictionHead.forward.<locals>.<genexpr>U  s     C(Q[[R00(s   r   r  c              3   B   >#    U H  oS TR                   4   v   M     g7f).N)r  )r*  r+  r$   s     r&   r,  r-  [  s!      [RZQ3(G(G#G!HRZs   .)r%  r   r"  
isinstancer   rG   r  r$   hidden_featuresforecasts   `  r&   r-   %PatchTSMixerForPredictionHead.forwardD  s     ,,7,,_=++O<h&&C(CCH))"b1H**6(E**  [RZ [[  $C)H)H$HIr(   )r"  r   r%  r  r*   r   r7   s   @r&   r  r  *  s$    01 0 0$ r(   r  c                   >   ^  \ rS rSrSrSS\4U 4S jjjrS rSrU =r	$ )PatchTSMixerLinearHeadib  zpLinear head for Classification and Regression.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                 \  > [         TU ]  5         UR                  U l        UR                  U l        UR                  c  UR                  nOSnX l        UcA  [        R                  " UR                  UR                  -  U-  UR                  5      U l        O0UR                  UR                  UR                  -  U-  5      U l        UR                  c  [        R                  " SS9U l        O[        R                  " SS9U l        [        R                  " UR                   5      U l        g )Nr   r  r  )r   r   head_aggregationoutput_rangerV   r&  r   r    r@   r   num_targets
projectionr#  r$  r%  r   r  r   )r$   r;   r&  
mul_factorr%   s       r&   r   PatchTSMixerLinearHead.__init__j  s     & 7 7"//""*++JJ#6 & ii!:!::ZG""DO
 2JJ!:!::ZGDO ""*::3DL::3DLzz&"5"56r(   c                 0   UR                  SS5      nU R                  S:X  a  US   nOIU R                  S:X  a  UR                  SS9R                  nOU R                  S:X  a  UR	                  SS9nU R
                  (       a  U R                  U5      nU R                  U5      nU R                  U5      nU R                  cS  U R                  bF  [        R                  " U5      U R                  S   U R                  S	   -
  -  U R                  S	   -   nU$ )
a1  
Args:
    hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
        or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size x num_targets)`.
r   r  use_last).r   max_poolr   avg_poolr   r   )rG   r8  maxvaluesrh   r%  r   r;  r&  r9  rK   sigmoid)r$   r1  s     r&   r-   PatchTSMixerLinearHead.forward  s
    *33B;  J.-g6O""j0-11b19@@O""j0-22r2:O<<"ll?;O,,7///:$$,43D3D3Po.$2C2CA2FIZIZ[\I]2]^aeararstauu  r(   )r&  r   r%  r8  r9  r;  r*   r   r7   s   @r&   r5  r5  b  s$    71 7 78   r(   r5  c                   2    \ rS rSr% \\S'   SrSrSrS r	Sr
g)	PatchTSMixerPreTrainedModeli  r;   modelpast_valuesFc                    [        U[        5      (       aE  U R                  R                  S:X  a*  [        R
                  R                  UR                  SSS9  gg[        U[        R                  [        R                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a^  UR                   R                  R                  R                  5         UR                   R                  R                  R                  S5        g[        U[        R"                  5      (       ak  UR                  R                  R                  SU R                  R$                  S9  UR                  b%  UR                  R                  R                  5         ggg)zInitialize weightsrZ   r   g?)rh   ri         ?N)r/  rN   r;   r_   r   initnormal_rS   r|   r?   r   datazero_weightfill_r9   rB   r    init_std)r$   r   s     r&   _init_weights)PatchTSMixerPreTrainedModel._init_weights  s:   f<=={{33x? 3 3#3G @r~~ >??KK""$MM$$S) 566!!&&,,.##((..s3		**MM&&CT[[5I5I&J{{&  &&( ' +r(    N)r/   r0   r1   r2   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingrS  r5   rU  r(   r&   rG  rG    s      #O&+#)r(   rG  c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )PatchTSMixerPretrainHeadi  zSPretraining head.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                    > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        g r*   )
r   r   r   r   r  r   r    r@   patch_lengthbase_pt_blockrC   s     r&   r   !PatchTSMixerPretrainHead.__init__  sB    ZZ(;(;<YYv~~v7J7JKr(   c                 J    U R                  U5      nU R                  U5      nU$ )aG  
Args:
    hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
        or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size x n_vars x num_patch x patch_length)`.
)r   r^  r0  s      r&   r-    PatchTSMixerPretrainHead.forward  s)     ,,_=%%o6r(   )r^  r   r   r7   s   @r&   r[  r[    s!    L1 L r(   r[  r+   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    US:  d  US:  a  [        SU S35      eU R                  u  pVpxU R                  n	[        USU-
  -  5      n
U(       a*  [        R
                  " USXyS9nUR                  SUS5      nO[        R
                  " XVXyS9n[        R                  " XVXyS9nSUSS2SS2SU
24'   [        R                  " USS9n[        R                  " USS9n[        R                  " USUS	9nUR                  S5      R                  SSSU5      nUb  SUSS2USS2SS24'   U R                  UR                  5       U5      nXS
   4$ )a  random_masking: Mask the input considering the control variables.

Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
        The input tensor to mask.
    mask_ratio (`float`):
        Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
    unmasked_channel_indices (list, *optional*):
        Indices of channels that will not be masked.
    channel_consistent_masking (bool, *optional*, defaults to `False`):
        When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
        across channels.
    mask_value (int, *optional*, defaults to 0):
        Define the value of masked patches for pretraining.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
    n]
r   r   zMask ratio z has to be between 0 and 1.deviceNr   r   )r   index.r   )rj   r   rh  r4   rK   randrepeatonesargsortgatherrb   masked_fillr   )r+   rb  rc  rd  re  r   num_channelssequence_lengthnum_featuresrh  len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r&   random_maskingrz    sA   4 A~q;zl2MNOO>Dll;Jo]]F?a*n56H!

:q/IQa0 

:_T ::jODDAyy --2.K--4K<<"K8D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$r(   num_forecast_mask_patchesc                 N   [        U[        5      (       a  U/nU Vs/ sH  nSPM     nnU R                  u  pgp[        R                  " XgXR
                  S9n
/ nSn[        U5      n[        X5       HG  u  pUS::  d  X:  a  [        SU S35      e[        Xo-  U-  5      nUR                  XU/5        UU-  nMI     [        US S9nX:  a  US   S   Xl-
  -   US   S'   OX:  a  US	   S   X-
  -   US	   S'   SnU H  u  nnnUU-   nSU
UU2S
S
2U* S
24'   UnM     [        R                  " U
R                  S   5      nU
U   n
U
R                  S	5      R                  SSSU	5      n
Ub  SU
S
S
2US
S
2S
S
24'   U R                  U
R                  5       U5      nUU
S   4$ s  snf )ai  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

Parameters:
    inputs (`torch.Tensor`):
        Input of shape `(bs, num_channels, num_patch, patch_length)`
    num_forecast_mask_patches (`list`):
        Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
    unmasked_channel_indices (`list`, *optional*):
        Indices of channels that are not masked.
    mask_value (`int`, *optional*, defaults to 0):
        Values in the masked patches will be filled by `mask_value`.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
    num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
r   rg  r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     U S   $ NrF   rU  )xs    r&   <lambda>"forecast_masking.<locals>.<lambda>I  s    !A$r(   )r   rF   r   Nrj  )r/  r4   r   rK   rU   rh  sumziprj   r  sortedrandpermrb   rl  rp  r   )r+   r{  rc  re  r   forecast_mask_ratiosr   rq  rr  rs  rv  t_listtotal_lengthtotal_ratior]  ratiotemp_lenbatch1	patch_lenbatch2permry  s                         r&   forecast_maskingr    s   0 +S11%>$?!'@A'@!A'@A>Dll;Jo;;zWDFL*+K"#<S1 ?,\N:pq  z)K78|H56   T F/F ay|z'@Aq	!		"r
1)BCr
1F"(	1h("./VF]A	z{*+ #)
 >>$**Q-(D:D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$O Bs   F"c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerPatchifyib  z
A class to patchify the time series sequence into different patches

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r;   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  ::  a&  [        SU R                   SU R                   S35      e[        U R                  U R                  5      U R                  -
  U R
                  -  S-   U l        U R                  U R
                  U R                  S-
  -  -   nU R                  U-
  U l	        g )NzSequence length (z+) has to be greater than the patch length ()r   )
r   r   context_lengthrr  r]  patch_striderj   rB  rV   sequence_start)r$   r;   new_sequence_lengthr%   s      r&   r   PatchTSMixerPatchify.__init__j  s    %44"//"//4#4#44#D$8$8#99deievevdwwxy 
   4 4d6G6GH4K\K\\aeararruvv"//$2C2CtGWGWZ[G[2\\"225HHr(   rI  c                 4   UR                   S   nX R                  :w  a  [        SU SU R                   S35      eUSS2U R                  S2SS24   nUR	                  SU R
                  U R                  S9nUR                  SS5      R                  5       nU$ )z
Parameters:
    past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
        Input for patchification

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r  zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionr   stepr7  )	r   rr  rj   r  unfoldr]  r  rG   r   )r$   rI  rr  rH   s       r&   r-   PatchTSMixerPatchify.forward{  s     &++B/222)/)::_`d`t`t_uuwx  Q 3 3 5q89$2C2C$J[J[\!!"b)446r(   )rV   r]  r  rr  r  rJ   r7   s   @r&   r  r  b  s,    I1 I"5<<  r(   r  c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerMaskingi  ap  
Class to perform random or forecast masking.

Parameters:
    config (`PatchTSMixerConfig`): model config
Returns:
    x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points
r;   c                 >  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        U R                  b  [        U R                  5      U l        g g r*   )	r   r   random_mask_ratiord  	mask_typer{  rc  re  r  rC   s     r&   r   PatchTSMixerMasking.__init__  s    !'!9!9*0*K*K')))/)I)I&(.(G(G% ++((4,243P3P,QD) 5r(   rn   c                 d   U R                   S:X  a8  [        UU R                  U R                  U R                  U R
                  S9u  p#OVU R                   S:X  a-  [        UU R                  U R                  U R
                  S9u  p#O[        SU R                    S35      eUR                  5       nX#4$ )a  
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Patch input

Return:
    masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points

rZ   )r+   rb  rc  rd  re  r2  )r+   r{  rc  re  zInvalid mask type .)
r  rz  r  rc  rd  re  r  r{  rj   r   )r$   rn   masked_inputrv  s       r&   r-   PatchTSMixerMasking.forward  s     >>X%!/"11)-)F)F+/+J+J??"L$ ^^z)!1"*.*H*H)-)F)F??	"L$ 1$..1ACDD yy{!!r(   )rd  r  re  r{  r  rc  rJ   r7   s   @r&   r  r    s,    
	R1 	R!"5<< !" !"r(   r  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSMixerStdScaleri  z
Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
subtracting from the mean and dividing by the standard deviation.
r;   c                   > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  U l        g SU l        g )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r   r   hasattrr  r   r  r  rC   s     r&   r   PatchTSMixerStdScaler.__init__  sd    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[_r(   rN  observed_indicatorrX   c                 r   UR                  U R                  U R                  S9nUR                  S5      nX-  R                  U R                  U R                  S9U-  nX-
  U-  S-  R                  U R                  U R                  S9U-  n[        R
                  " XPR                  -   5      nX-
  U-  XF4$ )  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
    observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Calculating the scale on the observed indicator.
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
r  rK  rF   )r  r   r  	clamp_minrK   sqrtr  )r$   rN  r  denominatorlocvariancescales          r&   r-   PatchTSMixerStdScaler.forward  s     ),,TXXt||,L!++C0(--dhh-MP[[j$661<AA$((TXT`T`Aadoo

8&8&889
e#S//r(   )r   r  r  r/   r0   r1   r2   r3   r   r   rK   rL   r   r-   r5   r6   r7   s   @r&   r  r    sY    
`1 `0LL06;ll0	u||U\\5<<7	80 0r(   r  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSMixerMeanScaleri  z~
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
r;   c                 N  > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  OSU l        [        US5      (       a  UR                  U l        g S U l        g )Nr  r   r  Tr  绽|=default_scale)r   r   r  r  r   r  r  r  rC   s     r&   r   PatchTSMixerMeanScaler.__init__  s    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[`5<V_5U5UV11[_r(   rN  r  rX   c                    X-  R                  5       R                  U R                  SS9nUR                  U R                  SS9nU[        R                  " USS9-  nU R
                  cL  UR                  SS9n[        R                  " UR                  S5      SS9n[        R                  " Xg-  5      nO#U R
                  [        R                  " U5      -  n[        R                  " US:  XX5      n[        R                  " XPR                  S9nX-  n	U R                  (       d  UR                  U R                  S9nU	[        R                  " U5      U4$ )r  Tr  r   minr   r   )absr  r   rK   clampr  squeeze	ones_likewherer  r  
zeros_like)
r$   rN  r  ts_sumnum_observedr  	batch_sumbatch_observationsr  scaled_datas
             r&   r-   PatchTSMixerMeanScaler.forward  s"    +00266txx6N)--dhh-E\q99 %

q
)I!&\-=-=a-@a!H!MM)*HIM ..1GGM L1,eC E'9'9:l||MMdhhM/EE,,U3U::r(   )r  r   r  r  r  r7   s   @r&   r  r    sY    
`1 `&;LL&;6;ll&;	u||U\\5<<7	8&; &;r(   r  c            
          ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\R                  \R                  \R                  4   4S jjrS	rU =r$ )PatchTSMixerNOPScaleri+  zt
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
r;   c                    > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  U l        g SU l        g )Nr  r   r  T)r   r   r  r  r   r  rC   s     r&   r   PatchTSMixerNOPScaler.__init__0  sF    )0)G)G6%%Q)0)C)Cv~~r(   rN  r  rX   c                     [         R                  " USS9R                  U R                  U R                  S9n[         R
                  " USS9R                  U R                  U R                  S9nXU4$ )aP  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
Fr[   r   r  )rK   r  rh   r   r  r  )r$   rN  r  r  r  s        r&   r-   PatchTSMixerNOPScaler.forward5  sg     E:??DHHVZVbVb?ct59>>488UYUaUa>b%r(   r  r*   )r/   r0   r1   r2   r3   r   r   rK   rL   r   r   r-   r5   r6   r7   s   @r&   r  r  +  se    N1 N PT LL 6>u||6L 	u||U\\5<<7	8   r(   r  zS
    Base class for `PatchTSMixerEncoderOutput`, with potential hidden states.
    )custom_introc                   p    \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                        \	S'   Srg)PatchTSMixerEncoderOutputiF  a  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, d_model)`):
    Hidden-state at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer.
Nlast_hidden_stater   rU  )r/   r0   r1   r2   r3   r  r   rK   FloatTensorrV  r   r   r5   rU  r(   r&   r  r  F  s9     6:x 1 1298<M8E%"3"345<r(   r  c                      ^  \ rS rSrSrS\4U 4S jjr\  SS\R                  S\
\   S\
\   S\\\4   4S	 jj5       rS
rU =r$ )PatchTSMixerEncoderiX  z
Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                 T  > [         TU ]  U5        UR                  U l        [        R                  " UR
                  UR                  5      U l        UR                  (       a  [        US9U l
        OS U l
        [        US9U l        UR                  (       a  U R                  5         g g r	  )r   r   use_return_dictr   r    r]  r@   patcherrQ   rN   positional_encoderr  mlp_mixer_encoder	post_initrC   s     r&   r   PatchTSMixerEncoder.__init__a  s     %55yy!4!4fnnE))&DF&SD#&*D#!2&!A NN r(   rI  r  return_dictrX   c                     Ub  UOU R                   nU R                  U5      nU R                  b  U R                  U5      nU R                  XBS9u  pVU(       d  [	        S UU4 5       5      $ [        XVS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to
    predict the masked portion. For a forecasting task, this denotes the history/past time series values.
    Similarly, for classification or regression tasks, it denotes the appropriate context values of the
    time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series,
    it is greater than 1.

Returns:
    `torch.FloatTensor` of shape `(batch_size, n_vars, num_patches, d_model)`
)r  c              3   $   #    U H  nUv   M	     g 7fr*   rU  r*  vs     r&   r,  .PatchTSMixerEncoder.forward.<locals>.<genexpr>  s      A    )r  r   )r  r  r  r  r   r  )r$   rI  r  r  patchesr  r   s          r&   r-   PatchTSMixerEncoder.forwardq  s    * &1%<k$BVBV ,,{+ "".--g6G+/+A+A'+A+u(  &!   );Ljjr(   )r  r  r  r  )FN)r/   r0   r1   r2   r3   r   r   r   rK   rL   r   r   r   r   r  r-   r5   r6   r7   s   @r&   r  r  X  st    1    05&*	(k\\(k 'tn(k d^	(k
 
u//	0(k (kr(   r  zG
    Base class for model's outputs, with potential hidden states.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   S
rg)PatchTSMixerModelOutputi  a  
last_hidden_state (`torch.FloatTensor`  of shape `(batch_size, num_channels, num_patches, d_model)`):
    Hidden-state at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer.
patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
    Patched input data to the model.
mask (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*):
    Bool Tensor indicating True in masked patches and False otherwise.
loc (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
    Gives the mean of the context window per channel. Used for revin denorm outside the model, if revin
    enabled.
scale (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
    Gives the std dev of the context window per channel. Used for revin denorm outside the model, if revin
    enabled.
Nr  r   rn   rv  r  r  rU  )r/   r0   r1   r2   r3   r  r   rK   r  rV  r   r   rn   rv  r  r  r5   rU  r(   r&   r  r    s    " 6:x 1 1298<M8E%"3"345</3K%++,3(,D(5$$
%,'+C%##	$+)-E8E%%&-r(   r  z=
    The PatchTSMixer Model for time-series forecasting.
    c                      ^  \ rS rSrSS\S\4U 4S jjjr\   SS\R                  S\
\R                     S\
\   S\
\   S	\4
S
 jj5       rSrU =r$ )PatchTSMixerModeli  r;   
mask_inputc                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        USL a  [        U5      U l        OSU l        UR                  S:X  a  [        U5      U l        O@UR                  S:X  d  UR                  SL a  [        U5      U l        O[        U5      U l        UR                  (       a  U R                  5         gg)z}
mask_input (bool, *optional*, defaults to `False`):
    Whether to mask the input using the [`PatchTSMixerMasking`] module.
TNrh   ri   )r   r   r  r  encoderr  patchingr  maskingr   r  scalerr  r  r  )r$   r;   r  r%   s      r&   r   PatchTSMixerModel.__init__  s    
 	 %55*62,V4.v6DLDL>>V#08DK^^u$$(>/7DK/7DK NN r(   rI  observed_maskr  r  rX   c           	         Ub  UOU R                   nSnUc  [        R                  " U5      nU R                  X5      u  pgnU R	                  U5      n	U	n
U R
                  b  U R                  U	5      u  pU R                  U
UUS9n[        U[        5      (       a  [        U6 nU(       d,  [        S UR                  UR                  U	UUU4 5       5      $ [        UR                  UR                  U	UUUS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
Nr  r  c              3   $   #    U H  nUv   M	     g 7fr*   rU  r  s     r&   r,  ,PatchTSMixerModel.forward.<locals>.<genexpr>        
A r  )r  r   rn   rv  r  r  )r  rK   r  r  r  r  r  r/  r   r  r  r   r  )r$   rI  r  r  r  rv  scaled_past_valuesr  r  	patched_x	enc_inputencoder_outputs               r&   r-   PatchTSMixerModel.forward  s   , &1%<k$BVBV !OOK8M)-[)P&MM"45		<<#"ll95OI !5# & 
 ne,,6GN 
 #44"00
 
 
 ',>>(66!
 	
r(   )r  r  r  r  r  r  )NFN)r/   r0   r1   r2   r   r   r   r   rK   rL   r   r  r-   r5   r6   r7   s   @r&   r  r    s    1 t  6  15/4&*A
\\A
  -A
 'tn	A

 d^A
 
!A
 A
r(   r  z>
    Output type of [`PatchTSMixerForPreTrainingOutput`].
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Srg)	 PatchTSMixerForPreTrainingOutputi#  a  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, patch_length)`):
    Prediction output from the pretrain head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer.
Nlossprediction_outputsr  r   rU  r/   r0   r1   r2   r3   r  r   rK   r  rV  r	  r  r   r   r5   rU  r(   r&   r  r  #  d    	 )-D(5$$
%,6:!2!23:59x 1 1298<M8E%"3"345<r(   r  z.
    `PatchTSMixer` for mask pretraining.
    c                      ^  \ rS rSrS\4U 4S jjr\    SS\R                  S\	\R                     S\	\
   S\
S\	\
   S	\4S
 jj5       rSrU =r$ )PatchTSMixerForPretrainingi;  r;   c                    > [         TU ]  U5        [        USS9U l        [	        US9U l        UR                  U l        UR                  U l        UR                  (       a  U R                  5         g g )NT)r  r   )	r   r   r  rH  r[  headmasked_lossr  r  rC   s     r&   r   #PatchTSMixerForPretraining.__init__A  s`     &v$?
,F;	!--%55 NN r(   rI  r  r  return_lossr  rX   c                    Ub  UOU R                   nU R                  SL a  [        R                  R	                  SS9nO[        R                  R	                  SS9nU R                  UUUUS9n[        U[        5      (       a  [        U6 nU R                  UR                  5      nUSL a  U" XR                  5      n	OSn	U R                  SL aK  U	bH  U	R                  SS9UR                  -  R                  5       UR                  R                  5       S	-   -  n	U(       d*  [        S
 U	UUR                  UR                  4 5       5      $ [!        U	UUR                  UR                  S9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
return_loss (`bool`,  *optional*):
    Whether to return the loss in the `forward` call.
NTnone	reductionrh   r  r  r  r   r   r  c              3   $   #    U H  nUv   M	     g 7fr*   rU  r  s     r&   r,  5PatchTSMixerForPretraining.forward.<locals>.<genexpr>        A r  r  r	  r  r   )r  r  rK   r   MSELossrH  r/  r   r  r  r  rn   rh   rv  r  r   r  )
r$   rI  r  r  r  r  r  model_outputx_hatloss_vals
             r&   r-   "PatchTSMixerForPretraining.forwardL  sp   2 &1%<k$BVBVt#88##f#5D88##f#5D zz'!5#	 " 
 lE**2LAL		,889$E#;#;<HH t#(< "-0A0AAFFHLL]L]LaLaLcfkLklH   22 ..	   0$*<<&44	
 	
r(   )r  r  rH  r  NFTN)r/   r0   r1   r2   r   r   r   rK   rL   r   r   r  r-   r5   r6   r7   s   @r&   r  r  ;  s    	1 	  15/4 &*D
\\D
  -D
 'tn	D

 D
 d^D
 
*D
 D
r(   r  z=
    Output type of [`PatchTSMixerForPredictionOutput`].
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   S
rg)PatchTSMixerForPredictionOutputi  a  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss.
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_input_channels)`):
    Prediction output from the forecast head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
loc (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
    Input mean
scale (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
    Input std dev
Nr  r	  r  r   r  r  rU  )r/   r0   r1   r2   r3   r  r   rK   r  rV  r	  r  r   r   r  r  r5   rU  r(   r&   r#  r#    s     )-D(5$$
%,6:!2!23:59x 1 1298<M8E%"3"345<'+C%##	$+)-E8E%%&-r(   r#  z
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.
    c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)"SamplePatchTSMixerPredictionOutputi  
sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, number_channels)`):
    Sampled values from the chosen distribution.
N	sequencesrU  r/   r0   r1   r2   r3   r'  r   rK   r  rV  r5   rU  r(   r&   r%  r%        
 .2Ix))*1r(   r%  c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)"SamplePatchTSMixerRegressionOutputi  r&  Nr'  rU  r(  rU  r(   r&   r+  r+    r)  r(   r+  inputtargetrX   c                 &    U R                  U5      * $ )z[
Computes the negative log likelihood loss from input distribution with respect to target.
)log_prob)r,  r-  s     r&   nllr0    s     NN6"""r(   input_tensorweightsc                 R   Ub  [         R                  " US:g  X-  [         R                  " U 5      5      n[         R                  " U(       a  UR	                  US9OUR	                  5       SS9nU(       a  UR	                  US9U-  $ UR	                  5       U-  $ U R                  US9$ )a:  
Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

Args:
    input_tensor (`torch.FloatTensor`):
        Input tensor, of which the average must be computed.
    weights (`torch.FloatTensor`, *optional*):
        Weights tensor, of the same shape as `input_tensor`.
    dim (`int`, *optional*):
        The dim along which to average `input_tensor`.

Returns:
    `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
r   r   rK  r  )rK   r  r  r  r  rh   )r1  r2  r   weighted_tensorsum_weightss        r&   weighted_averager6    s      ++glL4JEL\L\]iLjkkk#'++#+"67;;=VYZ03###,R]]]9L9L9NR]]]  S ))r(   c                   8  ^  \ rS rSrSrS\4U 4S jjr\     SS\R                  S\
\R                     S\
\R                     S\
\   S	\S
\
\   S\4S jj5       r\R                  " 5        SS\R                  S\
\R                     S\4S jj5       rSrU =r$ )PatchTSMixerForPredictioni  z
`PatchTSMixer` for forecasting application.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r;   c                 4  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  S:X  a  S U l        OaUR                  n[        [        [        S.nUR                  UR                  S 5      nUb  U" US9U l        O[        SUR                   35      e[        U5      U l        [        UU R                  S9U l        UR"                  (       a  U R#                  5         g g )Nmse	student_tnormalnegative_binomialr   Unknown distribution output r;   r&  )r   r   r  r  r  num_parallel_samplesr&  r!  r   r   r   getrj   r  rH  r  r  r  )r$   r;   r   distribution_output_mapoutput_classr%   s        r&   r   "PatchTSMixerForPrediction.__init__  s     KK	%55*0*K*K'$*$?$?!;;%'+D$**C+&%;'#
 366v7Q7QSWXL'+7C+@( #?@Z@Z?[!\]]&v.
1 $ 8 8
	 NN r(   rI  r  future_valuesr  r  r  rX   c           	         U R                   S:X  a  [        R                  " SS9nO"U R                   S:X  a  [        nO[	        S5      eUb  UOU R
                  nU R                  UUUUS9n[        U[        5      (       a  [        U6 nU R                  UR                  5      n	Sn
U R                  b  U R                  (       ay  U R                  R                  U	UR                  SU R                  4   UR                   SU R                  4   S	9nUb(  US
L a#  U" UUSU R                  4   5      n
[#        U
5      n
OXR                   SU R                  4   -  UR                  SU R                  4   -   n	Ub  US
L a  U" XSU R                  4   5      n
OU R                  (       aJ  U R                  R                  XR                  UR                   S	9nUb  US
L a  U" X5      n
[#        U
5      n
O+XR                   -  UR                  -   n	Ub  US
L a  U" X5      n
U R                  b7  UR                  SU R                  4   nUR                   SU R                  4   nOUR                  nUR                   nU(       d,  [        S U
U	UR                  UR$                  UU4 5       5      $ ['        U
U	UR                  UR$                  UUS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,:
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target values of the time series, that serve as labels for the model. The `future_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.
return_loss (`bool`,  *optional*):
    Whether to return the loss in the `forward` call.
r:  rh   r  r0  2Invalid loss function: Allowed values: mse and nllNr  .r  r  Tc              3   $   #    U H  nUv   M	     g 7fr*   rU  r  s     r&   r,  4PatchTSMixerForPrediction.forward.<locals>.<genexpr>  r   r  )r  r	  r  r   r  r  )r  r   r  r0  rj   r  rH  r/  r   r  r  r  r  r&  distributionr  r  r6  r   r#  )r$   rI  r  rF  r  r  r  r  r  y_hatr  rL  r  r  s                 r&   r-   !PatchTSMixerForPrediction.forward  s   H 99::/DYY%DQRR%0%<k$BVBV zz'!5#	 " 
 lE**2LAL 		,889**6''#77DD$((d.M.M)MN&,,S$2Q2Q-QR  E  
 !,1D#$%c4+J+J&JK H
  09H ..sD4S4S/STT"&&sD,K,K'KLM  !,1D#Ed>]>]9]+^_H''#77DD//|7I7I  E   !,1D#L@H/9H 2 22\5E5EE ,1D#E9H**6""3(G(G#GHC &&sD,K,K'KLE""C &&E 
  22 ..
 
 
 /$*<<&44
 	
r(   c                 2   U R                   nU " USUSS9nU R                  R                  UR                  UR                  UR
                  S9n[        U5       Vs/ sH  oeR                  5       PM     nn[        R                  " USS9n[        US9$ s  snf )aX  
Generate sequences of sample predictions from a model with a probability distribution head.

Args:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the future.

    observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Return:
    [`SamplePatchTSMixerPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
    number of samples, prediction_length, num_input_channels)`.
NF)rI  rF  r  r  rI  r   r   r'  )rA  r&  rL  r	  r  r  r  samplerK   stackr%  )r$   rI  r  rA  outputsrL  r   sampless           r&   generate"PatchTSMixerForPrediction.generate  s    2  $88 #'!&	
 //<<&&GKKw}} = 

 388L2MN2MQ&&(2MN ++g1-1GDD	 Os   B)r&  r  r  rH  rA  r  r  )NNFTNr*   )r/   r0   r1   r2   r3   r   r   r   rK   rL   r   r   r#  r-   no_gradr%  rU  r5   r6   r7   s   @r&   r8  r8    s    	1 @  1504/4 &*w
\\w
  -w
  -	w

 'tnw
 w
 d^w
 
)w
 w
r ]]_ 15-E\\-E  --E 
,	-E -Er(   r8  zK
    Output type of [`PatchTSMixerForTimeSeriesClassificationOutput`].
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Srg)	-PatchTSMixerForTimeSeriesClassificationOutputi  a,  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss.
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
    Prediction output from the classification head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nr  r	  r  r   rU  r
  rU  r(   r&   rY  rY    r  r(   rY  c                      ^  \ rS rSrSrS\4U 4S jjr\    SS\R                  S\
\R                     S\
\   S\S	\
\   S
\4S jj5       rSrU =r$ )'PatchTSMixerForTimeSeriesClassificationi  z
`PatchTSMixer` for classification application.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r;   c                 <  > [         TU ]  U5        [        U5      U l        [	        US9U l        UR                  U l        UR                  S;   a$  [        UR                  UR                  S9U l        OS U l        UR                  (       a  U R                  5         g g )Nr   ri   rh   Tr@   rV   )r   r   r  rH  r5  r  r  r   InjectScalerStatistics4Dr@   rV   inject_scaler  rC   s     r&   r   0PatchTSMixerForTimeSeriesClassification.__init__  s     &v.
*
	  &55>>22 8]c]o]o pD $D NN r(   rI  target_valuesr  r  r  rX   c                 <   [         R                  R                  5       nUb  UOU R                  nU R	                  UUUS9n[        U[        5      (       a  [        U6 nU R                  b4  U R                  UR                  UR                  UR                  S9Ul	        U R                  UR                  5      nUb  USL a	  U" X5      n	OSn	U(       d*  [        S U	UUR                  UR                  4 5       5      $ [        U	UUR                  UR                  S9$ )aH  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target
    values of the time series, that serve as labels for the model. The `target_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.

    For a classification task, it has a shape of `(batch_size,)`.

    For a regression task, it has a shape of `(batch_size, num_targets)`.
return_loss (`bool`, *optional*):
    Whether to return the loss in the `forward` call.
Nr  rI  Tc              3   $   #    U H  nUv   M	     g 7fr*   rU  r  s     r&   r,  BPatchTSMixerForTimeSeriesClassification.forward.<locals>.<genexpr>>  r  r  r  )rK   r   CrossEntropyLossr  rH  r/  r   r  r`  r  r  r  r  r   rY  )
r$   rI  rb  r  r  r  r  r  rM  r  s
             r&   r-   /PatchTSMixerForTimeSeriesClassification.forward  s/   H xx((*%0%<k$BVBVzz!5# " 

 lE**2LAL(-1->->.. $$"(( .? .L* 		,889$)<E1HH   22 ..	   =$*<<&44	
 	
r(   )r  r`  rH  r  r!  )r/   r0   r1   r2   r3   r   r   r   rK   rL   r   r   rY  r-   r5   r6   r7   s   @r&   r[  r[    s    	1 "  15/4 &*M
\\M
  -M
 'tn	M

 M
 d^M
 
7M
 M
r(   r[  z=
    Output type of [`PatchTSMixerForRegressionOutput`].
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Srg)	PatchTSMixerForRegressionOutputiP  a)  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss.
regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
    Prediction output from the regression head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nr  regression_outputsr  r   rU  )r/   r0   r1   r2   r3   r  r   rK   r  rV  rj  r  r   r   r5   rU  r(   r&   ri  ri  P  r  r(   ri  c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  S\R                  S\R                  4S	 jrS
r	U =r
$ )r_  ih  r@   rV   	expansionc                 (  > [         TU ]  5         [        R                  " US-   X1-  5      U l        [        R                  " X1-  U5      U l        [        R                  " SSU-  5      U l        [        R                  " SU-  S5      U l        X l        g r~  )	r   r   r   r    inverse_trans_expansioninverse_trans_compressionmap_scale_expansionmap_scale_compressionrV   )r$   r@   rV   rl  r%   s       r&   r   !InjectScalerStatistics4D.__init__i  sr    ')yy1i>Q'R$)+93F)P&#%99QI#> %'YYq9}a%@"&r(   r+   r  r  c                    UR                  SS5      nUR                  S5      nUR                  SSU R                  S5      nUR                  SS5      nUR                  S5      nUR                  SSU R                  S5      n[        R
                  " XE/SS9nU R                  U5      nU R                  U5      n[        R
                  " X/SS9nU R                  U5      nU R                  U5      nU$ )aQ  
Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`)
    loc (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
    scale (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
Returns:
    `torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`
r   r  r   r   )
rG   rb   rl  rV   rK   catrp  rq  rn  ro  )r$   r+   r  r  rh   stdevconcat_statss          r&   r-    InjectScalerStatistics4D.forwardr  s     }}R$~~b!{{1a!1!115B'#Q4#3#3Q7yy$B7//=11,?F1r:--f5//7r(   )ro  rn  rq  rp  rV   )rF   )r/   r0   r1   r2   r4   r   rK   rL   r-   r5   r6   r7   s   @r&   r_  r_  h  sM    ' '# '# ' 'ell  ell  r(   r_  z4
    `PatchTSMixer` for regression application.
    c                      ^  \ rS rSrS\4U 4S jjr\    SS\R                  S\	\R                     S\	\
   S\
S\	\
   S	\4S
 jj5       r\R                  " 5       S\R                  S	\4S j5       rSrU =r$ )PatchTSMixerForRegressioni  r;   c                   > [         TU ]  U5        [        U5      U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  S:X  a  S U l        O^[        [        [        S.nUR                  UR
                  5      nUb  U" UR                  S9U l        O[        SUR
                   35      eUR                  S;   a$  [        UR                   UR"                  S9U l        OS U l        ['        UU R
                  S9U l        UR*                  (       a  U R+                  5         g g )Nr:  r;  r   r?  r]  r^  r@  )r   r   r  rH  r  r&  r  rA  r   r   r   rB  r:  rj   r   r_  r@   rV   r`  r5  r  r  )r$   r;   rC  rD  r%   s       r&   r   "PatchTSMixerForRegression.__init__  s$    &v.
KK	#)#=#= %55$*$?$?!;;%'+D$ ,&%;'#
 366v7Q7QRL'+7F<N<N+O( #?@Z@Z?[!\]]>>22 8]c]o]o pD $D* $ 8 8
	 NN r(   rI  rb  r  r  r  rX   c           	         U R                   S:X  a  [        R                  " SS9nO"U R                   S:X  a  [        nO[	        S5      eUb  UOU R
                  nU R                  UUUS9n[        U[        5      (       a  [        U6 nU R                  b4  U R                  UR                  UR                  UR                  S9Ul        U R                  UR                  5      nUb  US	L a  U R                  (       a  U R                  S
:X  a)  [         R"                  " US:  5      (       a  [%        S5      eU R                  R'                  U5      n	[        U V
s/ sH(  oR)                  SU R*                  R,                  5      PM*     sn
5      nU" X5      n[/        U5      nOU" X5      nOSnU(       d*  [        S UUUR                  UR0                  4 5       5      $ [3        UUUR                  UR0                  S9$ s  sn
f )aD  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target values of the time series, that serve as labels for the model. The `target_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.

    For a classification task, it has a shape of `(batch_size,)`.

    For a regression task, it has a shape of `(batch_size, num_targets)`.
return_loss (`bool`, *optional*):
    Whether to return the loss in the `forward` call.
r:  rh   r  r0  rH  Nr  rI  Tr>  r   zDtarget_values cannot be negative for negative_binomial distribution.r   c              3   $   #    U H  nUv   M	     g 7fr*   rU  r  s     r&   r,  4PatchTSMixerForRegression.forward.<locals>.<genexpr>	  r  r  )r  rj  r  r   )r  r   r  r0  rj   r  rH  r/  r   r  r`  r  r  r  r  r&  rK   any	ExceptionrL  r   r;   r:  r6  r   ri  )r$   rI  rb  r  r  r  r  r  rM  rL  itemr  s               r&   r-   !PatchTSMixerForRegression.forward  s   F 99::/DYY%DQRR%0%<k$BVBVzz!5# " 

 lE**2LAL(-1->->.. $$"(( .? .L* 		,889$)<''++/BBuyyQ^abQbGcGc#$jkk#77DDUKRWXRW$yyT[[-D-DERWXY<+H55H   22 ..	   /$*<<&44	
 	
) Ys   .G>c                 P   U R                   nU " USSS9nU R                  R                  UR                  5      n[	        U5       Vs/ sH  oTR                  5       PM     nn[        R                  " USS9R                  SX R                  R                  5      n[        US9$ s  snf )a  
Generate sequences of sample predictions from a model with a probability distribution head.

Args:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the target values.

Return:
    [`SamplePatchTSMixerRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
    number of samples, num_targets)`.
NF)rI  rb  r  r   r   r   rP  )rA  r&  rL  rj  r  rQ  rK   rR  r   r;   r:  r+  )r$   rI  rA  rS  rL  r   rT  s          r&   rU  "PatchTSMixerForRegression.generate  s    "  $88 #!&
 //<<W=W=WX ,11E+F
+Fa!+F 	 

 ++g1-2227K[[MdMde1GDD
s   B#)r&  r  r`  r  rH  rA  r  r!  )r/   r0   r1   r2   r   r   r   rK   rL   r   r   ri  r-   rW  r+  rU  r5   r6   r7   s   @r&   ry  ry    s    %1 %N  15/4 &*Z
\\Z
  -Z
 'tn	Z

 Z
 d^Z
 
)Z
 Z
x ]]_#E\\#E 
,#E #Er(   ry  )rG  r  r  r8  r[  ry  )Nr   N)NFr   )Nr   )NN)Rr3   rd   dataclassesr   typingr   r   r   rK   torch.nnr   transformers.modeling_utilsr   transformers.utilsr   modeling_flash_attention_utilsr
   modeling_utilsr   processing_utilsr   time_series_utilsr   r   r   utilsr   r   configuration_patchtsmixerr   
get_loggerr/   loggerModuler   r9   rN   ru   r   r   rL   r   r   r   r   r   r   r  r  r5  rG  r[  listr   r4   rz  r  r  r  r  r  r  r  r  r  r  r  r  r#  r%  r+  distributionsDistributionr0  r6  r8  rY  r[  ri  r_  ry  __all__rU  r(   r&   <module>r     sK   "  ! , ,   7 * B 5 & U U , : 
		H	% *&BII &,$RYY $N.BII .bbii .-299 -n  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%>U/BII U/pCbii CL*		 *Z#		 #L&#		 &#R5BII 5pDRYY DN )/ ) )2ryy D 04',7%LL7%7% 'tn7% !%	7%
 7%| 04	A%LLA%$T3Y/A% 'tnA% 	A%J-299 -b9"")) 9"z 0BII  0H3;RYY 3;n BII  6 
	= 	= 	=Bk5 BkJ 
.k . .4 
^
3 ^

^
B 
={ = =$ 
Q
!< Q

Q
h 
.k . .0 2 2 2 2 2 2#u""// # #%,, #*5<< *(5<<:P *fkfrfr *0TE ; TEn 
=K = =$k
.I k
\ 
=k = =$%ryy %P 
iE ; iE
iEXr(   