
"""PyTorch ViViT model."""

from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging, torch_int
from .configuration_vivit import VivitConfig


logger = logging.get_logger(__name__)


class VivitTubeletEmbeddings(nn.Module):
    """
Construct Vivit Tubelet embeddings.

This module turns a batch of videos of shape (batch_size, num_frames, num_channels, height, width) into a tensor of
shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

The seq_len (the number of patches) equals (number of frames // tubelet_size[0]) * (height // tubelet_size[1]) *
(width // tubelet_size[2]).
    """

    def __init__(self, config):
        super().__init__()
        self.num_frames = config.num_frames
        self.image_size = config.image_size
        self.patch_size = config.tubelet_size
        self.num_patches = (
            (self.image_size // self.patch_size[2])
            * (self.image_size // self.patch_size[1])
            * (self.num_frames // self.patch_size[0])
        )
        self.embed_dim = config.hidden_size

        self.projection = nn.Conv3d(
            config.num_channels, config.hidden_size, kernel_size=config.tubelet_size, stride=config.tubelet_size
        )

    def forward(self, pixel_values, interpolate_pos_encoding: bool = False):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )

        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)

        x = self.projection(pixel_values)
        # flatten the spatio-temporal patch grid and move the channel dimension last:
        # (batch_size, hidden_size, t, h, w) -> (batch_size, seq_len, hidden_size)
        x = x.flatten(2).transpose(1, 2)
        return x


class VivitEmbeddings(nn.Module):
    """
Vivit Embeddings.

Creates embeddings from a video using VivitTubeletEmbeddings, adds CLS token and positional embeddings.
    """

    def __init__(self, config):
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = VivitTubeletEmbeddings(config)

        self.position_embeddings = nn.Parameter(
            torch.zeros(1, self.patch_embeddings.num_patches + 1, config.hidden_size)
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # spatial patch size (the last two entries of the tubelet size), used when
        # interpolating the position embeddings for a different input resolution
        self.patch_size = config.tubelet_size[1:]
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values, interpolate_pos_encoding: bool = False):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        cls_tokens = self.cls_token.tile([batch_size, 1, 1])
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VivitSelfAttention(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        batch_size, seq_length, _ = hidden_states.shape
        # project to (batch_size, num_attention_heads, seq_length, attention_head_size)
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-1] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class VivitSelfOutput(nn.Module):
    """
The residual connection is defined in VivitLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
    """

    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VivitAttention(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.attention = VivitSelfAttention(config)
        self.output = VivitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class VivitIntermediate(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VivitOutput(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class VivitLayer(GradientCheckpointingLayer):
    """This corresponds to the EncoderBlock class in the scenic/vivit implementation."""

    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = VivitAttention(config)
        self.intermediate = VivitIntermediate(config)
        self.output = VivitOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, head_mask=None, output_attentions=False):
        self_attention_outputs = self.attention(
            # in Vivit, layernorm is applied before self-attention
            self.layernorm_before(hidden_states),
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        # add self attentions if we output attention weights
        outputs = self_attention_outputs[1:]

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in Vivit, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class VivitEncoder(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VivitLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class VivitPooler(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class VivitPreTrainedModel(PreTrainedModel):
    config: VivitConfig
    base_model_prefix = "vivit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, VivitEmbeddings):
            module.cls_token.data.zero_()
            module.position_embeddings.data.zero_()


@auto_docstring
class VivitModel(VivitPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = VivitEmbeddings(config)
        self.encoder = VivitEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = VivitPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model.

        Args:
            heads_to_prune:
                dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPooling]:
        r"""
Examples:

```python
>>> import av
>>> import numpy as np

>>> from transformers import VivitImageProcessor, VivitModel
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> # prepare video for the model
>>> inputs = image_processor(list(video), return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 3137, 768]
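>>> # 3137 tokens = 1 [CLS] token + (32 frames // 2) * (224 // 16) * (224 // 16) tubelet patches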
```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
        ViViT Transformer model with a video classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for Kinetics-400.

        <Tip>

            Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
            setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
            position embeddings to the higher resolution.
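            (Only the patch position grid is resized, using bicubic interpolation; the [CLS] position embedding is kept unchanged.)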

        </Tip>
    """
)
class VivitForVideoClassification(VivitPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vivit = VivitModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], ImageClassifierOutput]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import av
>>> import numpy as np
>>> import torch

>>> from transformers import VivitImageProcessor, VivitForVideoClassification
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> inputs = image_processor(list(video), return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     logits = outputs.logits

>>> # model predicts one of the 400 Kinetics-400 classes
>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
LABEL_116
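>>> # for this Kinetics-400 checkpoint, `logits` has shape (batch_size, num_labels), i.e. (1, 400) here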
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vivit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # classify on the final hidden state of the [CLS] token
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VivitModel", "VivitPreTrainedModel", "VivitForVideoClassification"]
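

# A minimal shape-check sketch (an editorial addition, not part of the upstream module). It assumes only the
# default `VivitConfig` (32 frames, 3 channels, 224x224 images, tubelet size [2, 16, 16]) and a randomly
# initialized model, so no checkpoint download is required. It only runs when the file is executed directly.
if __name__ == "__main__":
    config = VivitConfig()
    model = VivitModel(config)
    # one video: (batch_size, num_frames, num_channels, height, width)
    pixel_values = torch.randn(1, config.num_frames, config.num_channels, config.image_size, config.image_size)
    with torch.no_grad():
        outputs = model(pixel_values)
    # 3137 tokens = 1 [CLS] + (32 // 2) * (224 // 16) * (224 // 16) tubelet patches
    print(outputs.last_hidden_state.shape)  # torch.Size([1, 3137, 768])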