
"""PyTorch VideoMAE (masked autoencoder) model."""

import collections.abc
from copy import deepcopy
from dataclasses import dataclass
from typing import Callable, Optional, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .configuration_videomae import VideoMAEConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.
    """
)
class VideoMAEDecoderOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    """

    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.
    """
)
class VideoMAEForPreTrainingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`):
        Pixel reconstruction loss.
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


# sin-cos position encoding
def get_sinusoid_encoding_table(n_position, d_hid):
    """Sinusoid position encoding table"""

    def get_position_angle_vec(position):
        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i + 1

    return torch.FloatTensor(sinusoid_table).unsqueeze(0)
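

# Illustrative note (the concrete numbers below are assumptions based on the default VideoMAE base
# configuration, not values computed in this module): with 16 frames, tubelet_size=2, image_size=224,
# patch_size=16 and hidden_size=768, the number of patches is (16 // 2) * (224 // 16) ** 2 = 1568, so
# get_sinusoid_encoding_table(1568, 768) returns a fixed table of shape (1, 1568, 768). That is the same
# sequence length that appears as `[1, 1568, 768]` in the VideoMAEModel docstring example further below.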


class VideoMAEEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings.

    """

    def __init__(self, config):
        super().__init__()

        self.patch_embeddings = VideoMAEPatchEmbeddings(config)
        self.num_patches = self.patch_embeddings.num_patches
        # fixed sin-cos embedding
        self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)
        self.config = config

    def forward(self, pixel_values, bool_masked_pos):
        # create patch embeddings
        embeddings = self.patch_embeddings(pixel_values)

        # add position embeddings
        embeddings = embeddings + self.position_embeddings.detach().type_as(embeddings).to(
            device=embeddings.device, copy=True
        )

        # only keep visible patches
        # ~bool_masked_pos means visible
        if bool_masked_pos is not None:
            batch_size, _, num_channels = embeddings.shape
            embeddings = embeddings[~bool_masked_pos]
            embeddings = embeddings.reshape(batch_size, -1, num_channels)

        return embeddings


class VideoMAEPatchEmbeddings(nn.Module):
    """
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    """

    def __init__(self, config):
        super().__init__()

        image_size = config.image_size
        patch_size = config.patch_size
        num_channels = config.num_channels
        hidden_size = config.hidden_size
        num_frames = config.num_frames
        tubelet_size = config.tubelet_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size
        self.tubelet_size = int(tubelet_size)
        num_patches = (
            (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
        )
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.projection = nn.Conv3d(
            in_channels=num_channels,
            out_channels=hidden_size,
            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
            stride=(self.tubelet_size, patch_size[0], patch_size[1]),
        )

    def forward(self, pixel_values):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VideoMAESelfAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)

        if config.qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(self.all_head_size))
            self.v_bias = nn.Parameter(torch.zeros(self.all_head_size))
        else:
            self.q_bias = None
            self.v_bias = None

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        batch_size, seq_length, _ = hidden_states.shape
        k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
        keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
        values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
        queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)

        key_layer = keys.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        value_layer = values.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        query_layer = queries.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class VideoMAESelfOutput(nn.Module):
    """
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VideoMAEAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.attention = VideoMAESelfAttention(config)
        self.output = VideoMAESelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class VideoMAEIntermediate(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class VideoMAEOutput(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class VideoMAELayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = VideoMAEAttention(config)
        self.intermediate = VideoMAEIntermediate(config)
        self.output = VideoMAEOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in VideoMAE, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in VideoMAE, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class VideoMAEEncoder(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class VideoMAEPreTrainedModel(PreTrainedModel):
    config: VideoMAEConfig
    base_model_prefix = "videomae"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class VideoMAEModel(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = VideoMAEEmbeddings(config)
        self.encoder = VideoMAEEncoder(config)

        if config.use_mean_pooling:
            self.layernorm = None
        else:
            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
            length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.

        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1568, 768]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        if self.layernorm is not None:
            sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class VideoMAEDecoder(nn.Module):
    def __init__(self, config, num_patches):
        super().__init__()

        decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2

        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        )

        self.norm = nn.LayerNorm(config.decoder_hidden_size)
        self.head = (
            nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity()
        )

        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states,
        return_token_num,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # apply Transformer layers (blocks)
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.decoder_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(hidden_states, head_mask=None, output_attentions=output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_token_num > 0:
            hidden_states = hidden_states[:, -return_token_num:]

        # predictor projection
        hidden_states = self.norm(hidden_states)

        # logits
        logits = self.head(hidden_states)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
        return VideoMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)


@auto_docstring(
    custom_intro="""
    The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.
    """
)
class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.videomae = VideoMAEModel(config)

        self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
        self.position_embeddings = get_sinusoid_encoding_table(
            self.videomae.embeddings.num_patches, config.decoder_hidden_size
        )

        self.decoder = VideoMAEDecoder(config, num_patches=self.videomae.embeddings.num_patches)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, VideoMAEForPreTrainingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
            (image_size // patch_size) ** 2`.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 16
        >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

        >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

        >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
        >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
        >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss = outputs.loss
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.encoder_to_decoder(sequence_output)  # [batch_size, num_visible_patches, decoder_hidden_size]
        batch_size, seq_len, num_channels = sequence_output.shape

        # we don't unshuffle the correct visible token order, but shuffle the position embeddings accordingly
        if bool_masked_pos is None:
            raise ValueError("One must provide a boolean mask")
        expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values)
        expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach()
        pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels)
        pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels)

        # [batch_size, num_patches, decoder_hidden_size]
        x_full = torch.cat([sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1)

        # [batch_size, num_masked_patches, num_channels * patch_size * patch_size]
        decoder_outputs = self.decoder(x_full, pos_emb_mask.shape[1])
        logits = decoder_outputs.logits

        loss = None
        with torch.no_grad():
            # calculate the labels to be predicted
            if self.config.num_channels != 3:
                # can't unnormalize with default means/stds
                frames = pixel_values
            else:
                # first, unnormalize the frames
                device = pixel_values.device
                dtype = pixel_values.dtype
                mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device=device, dtype=dtype)[None, None, :, None, None]
                std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device=device, dtype=dtype)[None, None, :, None, None]
                frames = pixel_values * std + mean  # in [0, 1]

            batch_size, time, num_channels, height, width = frames.shape
            tubelet_size, patch_size = self.config.tubelet_size, self.config.patch_size
            if self.config.norm_pix_loss:
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                frames = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size,
                    num_channels,
                )
                # step 4: normalize per patch
                frames_norm = (frames - frames.mean(dim=-2, keepdim=True)) / (
                    frames.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6
                )
                # step 5: reshape to (batch_size, T//ts * H//ps * W//ps, ts * ps * ps * C)
                videos_patch = frames_norm.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )
            else:
                if self.config.num_channels != 3:
                    raise ValueError(
                        "Can't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False."
                    )
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                videos_patch = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )

            batch_size, _, num_channels = videos_patch.shape
            # we only compute the loss on the masked patches
            labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels)

        loss_fct = MSELoss()
        loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return VideoMAEForPreTrainingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
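

# Illustrative masking sketch (an assumption for demonstration, not code from this module): `bool_masked_pos`
# must mask the same number of patches for every video in the batch, otherwise the boolean indexing followed by
# `.reshape(batch_size, -1, num_channels)` in the forward pass above cannot split the selected patches evenly.
# A minimal way to build such a mask, assuming the base configuration and an example mask ratio of 90%:
#
#     batch_size = 2
#     seq_length = (16 // 2) * (224 // 16) ** 2                     # 1568 patches per video
#     num_masked = int(0.9 * seq_length)
#     ids = torch.rand(batch_size, seq_length).argsort(dim=1)       # random permutation per video
#     bool_masked_pos = torch.zeros(batch_size, seq_length, dtype=torch.bool)
#     bool_masked_pos.scatter_(1, ids[:, :num_masked], True)        # mark num_masked positions as masked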


@auto_docstring(
    custom_intro="""
    VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.
    """
)
class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        # Classifier head
        self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import av
        >>> import torch
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        if self.fc_norm is not None:
            sequence_output = self.fc_norm(sequence_output.mean(1))
        else:
            sequence_output = sequence_output[:, 0]

        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VideoMAEForPreTraining", "VideoMAEModel", "VideoMAEPreTrainedModel", "VideoMAEForVideoClassification"]