"""PyTorch GIT model."""

import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPooling,
    CausalLMOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_git import GitConfig, GitVisionConfig


logger = logging.get_logger(__name__)

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class GitVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


class GitEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        ...


class GitSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.image_patch_tokens = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1)
        if config.num_image_with_embedding is not None:
            self.image_patch_tokens *= config.num_image_with_embedding

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        ...


class GitSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


GIT_SELF_ATTENTION_CLASSES = {
    "eager": GitSelfAttention,
}


class GitAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
        )
        self.output = GitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        attn_output, self_attn_weights = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            past_key_value,
            output_attentions,
            pixel_values_present,
        )
        attention_output = self.output(attn_output, hidden_states)
        return attention_output, self_attn_weights


class GitIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class GitOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class GitLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = GitAttention(config, layer_idx=layer_idx)
        self.intermediate = GitIntermediate(config)
        self.output = GitOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        ...

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class GitEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
        ...


@auto_docstring
class GitPreTrainedModel(PreTrainedModel):
    config: GitConfig
    base_model_prefix = "git"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, GitVisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range)
            nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range)
            nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range)
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class GitVisionEmbeddings(nn.Module):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        ...

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        ...

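# Minimal sketch (hypothetical helper, not from the original file) of the idea behind
# `interpolate_pos_encoding` above: the learned per-patch position embeddings are reshaped
# to their 2D grid and resized bicubically, so a checkpoint trained at one resolution can
# be applied to larger images.
def _sketch_resize_patch_positions(pos_embed: torch.Tensor, old_grid: int, new_grid: int) -> torch.Tensor:
    """pos_embed: (old_grid * old_grid, dim) patch position embeddings, class token excluded."""
    dim = pos_embed.shape[-1]
    grid = pos_embed.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2)  # (1, dim, H, W)
    grid = nn.functional.interpolate(grid, size=(new_grid, new_grid), mode="bicubic", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(new_grid * new_grid, dim)
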
class GitVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class GitVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        ...


class GitVisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = GitVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = GitVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        ...

class GitVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    """

    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        ...


class GitVisionTransformer(nn.Module):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = GitVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = GitVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        ...

@auto_docstring(
    custom_intro="""
    The vision model from CLIP, used in GIT, without any head or projection on top.
    """
)
class GitVisionModel(GitPreTrainedModel):
    config: GitVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: GitVisionConfig):
        super().__init__(config)
        self.vision_model = GitVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


class GitProjection(nn.Module):
    def __init__(self, config: GitConfig):
        super().__init__()
        self.config = config
        self.visual_projection = nn.Sequential(
            nn.Linear(config.vision_config.hidden_size, config.hidden_size),
            nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
        )

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        return self.visual_projection(embeddings)

@auto_docstring(
    custom_intro="""
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    """
)
class GitModel(GitPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = GitEmbeddings(config)
        self.image_encoder = GitVisionModel(config.vision_config)
        self.encoder = GitEncoder(config)

        self.visual_projection = GitProjection(config)

        if config.num_image_with_embedding is not None:
            self.img_temperal_embedding = nn.ParameterList(
                nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
                for _ in range(config.num_image_with_embedding)
            )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
        ...

    def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None):
        ...

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        ...

@auto_docstring(
    custom_intro="""
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    """
)
class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.git = GitModel(config)
        self.output = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output

    def set_output_embeddings(self, new_embeddings):
        self.output = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Examples:

Image captioning example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> import requests
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
>>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_caption)
two cats sleeping on a pink blanket next to remotes.
```

Visual question answering (VQA) example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> from huggingface_hub import hf_hub_download
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

>>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
>>> image = Image.open(file_path).convert("RGB")

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

>>> question = "what does the front of the bus say at the top?"

>>> input_ids = processor(text=question, add_special_tokens=False).input_ids
>>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
>>> input_ids = torch.tensor(input_ids).unsqueeze(0)

>>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
>>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
['what does the front of the bus say at the top? special']
```

Video captioning example:

```python
>>> import av
>>> import numpy as np
>>> from PIL import Image
>>> from huggingface_hub import hf_hub_download
>>> from transformers import AutoProcessor, AutoModelForCausalLM

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

>>> # set seed for reproducibility
>>> np.random.seed(45)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # load video
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample frames
>>> num_frames = model.config.num_image_with_embedding
>>> indices = sample_frame_indices(
...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
... )
>>> frames = read_video_pyav(container, indices)

>>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

>>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
```
NF)r   r>   r`  r   r[   r  r  r   r  r^  r  r   r@   r   rF   )losslogitsr  r&   r'   )rW   r  r*  r   r  r  r   rV   r~   r   loss_functionr   rF   r   r  r&   r'   )rV   rZ   r   r>   r`  r   r[   r   r  r  r   r  r^  r  rw  r  r  r#  r"  num_image_tokensshifted_logitsr   s                         r4   rc   GitForCausalLM.forward  s   j &1%<k$++B]B]I(()%%'+/!5%=#  
 "!*_-#xx//55a8BBGGZZ#A'7':A$=>IIKNAqrE]--/F%%##B(>(>?B  ;;11 	D Y,F)-)9TGf$EvE%#33!//))
 	
r3   c                     UbC  UR                  5       nUR                  S   U:  a  UnOUR                  S   S-
  nUS S 2US 24   nUR                  nUc  UR                  U5      nUUUR                  S5      UUS.$ )Nr   r`  )rZ   r   r`  r  r  )r  r   new_onesget)	rV   rZ   r  r   r  rw  past_lengthremove_prefix_lengthr`   s	            r4   prepare_inputs_for_generation,GitForCausalLM.prepare_inputs_for_generation  s     &)88:K q!K/'2$ (1q'9A'=$!!%9%:":;I  oo!&//<N #,"JJ~6."
 	
r3   )r*  r   )NNNNNNNNNNNFN)NNN)r)   r*   r+   r,   _tied_weights_keysrD   r  r  r   r   r.   rg   r   r
   r  r   r1   r   rc   r-  r2   rh   ri   s   @r4   r  r    s    **%  -115/3/3,004)-FJ$(,0/3).&*A
ELL)A
 !.A
 u||,	A

 u||,A
 ELL)A
  -A
 &A
 "%tELL/A(A"BCA
 D>A
 $D>A
 'tnA
 #'A
 d^A
  
uU\\"$::	;!A
 A
H OS
 
r3   r  )r  r  r)  r  )r,  )Hr-   r   dataclassesr   typingr   r   r   r.   torch.utils.checkpointr   activationsr	   cache_utilsr
   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_gitr   r   
get_loggerr)   rw   r"   r  r6   rk   r   r   r   r   r   r   r	  r)  r/  rk  rg   r  ry  r{  r  r  r  r  r  r  r  __all__r(   r3   r4   <module>r?     s      ! , ,    ! . ) B 9  G l l  : 
		H	% 	?; 	? 	?-BII -`u.ryy u.rBII   
.3299 .3dbii  		 $) $NI
 I
X * * *6P")) Pf299 . %II%<<% 
% <<	%
 U\\*% % %.L) L)`/6 /fT
ryy T
n3
299 3
l 
2
' 2

2
j
2BII 
2 
~
! ~

~
B 
s
' s

s
l Qr3   