# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pix2Struct modeling file"""

import math
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
)
from .configuration_pix2struct import Pix2StructConfig, Pix2StructTextConfig, Pix2StructVisionConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class Pix2StructLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Pix2Struct uses a layer norm which only scales and does not shift, also known as root
        # mean square layer normalization (https://huggingface.co/papers/1910.07467): the variance
        # is computed without the mean, there is no bias, and the accumulation for half-precision
        # inputs is done in fp32.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert back into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


try:
    from apex.normalization import FusedRMSNorm

    Pix2StructLayerNorm = FusedRMSNorm  # noqa

    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNorm")
except ImportError:
    # using the normal Pix2StructLayerNorm
    pass
except Exception:
    logger.warning("Discovered apex but it failed to load, falling back to Pix2StructLayerNorm")
    pass
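# Illustrative sanity check (a sketch, not part of the library): Pix2StructLayerNorm is an
# RMSNorm, i.e. y = x / sqrt(mean(x**2) + eps) * weight, with no mean subtraction and no bias.
# Assuming the pure-PyTorch class above is in scope (apex may have replaced it):
#
#     norm = Pix2StructLayerNorm(hidden_size=8)
#     x = torch.randn(2, 4, 8)
#     ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6) * norm.weight
#     assert torch.allclose(norm(x), ref)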
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      j/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr&   Pix2StructLayerNorm.__init__>   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor(   float32powmeanrsqrtr+   r*   dtypefloat16bfloat16)r,   hidden_statesvariances      r0   forwardPix2StructLayerNorm.forwardF   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r2   )r+   r*   )gư>__name__
__module____qualname____firstlineno__r&   rA   __static_attributes____classcell__r/   s   @r0   r"   r"   =   s    $+ +r2   r"   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
Pix2StructVisionEmbeddingsd   a  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    """

    def __init__(self, config: Pix2StructConfig) -> None:
        super().__init__()
        self.patch_projection = nn.Linear(config.patch_embed_hidden_size, config.hidden_size)

        self.row_embedder = nn.Embedding(config.seq_len, config.hidden_size)
        self.column_embedder = nn.Embedding(config.seq_len, config.hidden_size)

        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, flattened_patches: torch.Tensor) -> torch.Tensor:
        # the row and column indices are stored in the first and second positions of each flattened patch
        row_indices = flattened_patches[:, :, 0].long()
        col_indices = flattened_patches[:, :, 1].long()

        flattened_patches = flattened_patches[:, :, 2:]

        embeddings = self.patch_projection(flattened_patches)
        row_embeddings = self.row_embedder(row_indices)
        col_embeddings = self.column_embedder(col_indices)

        # sum all embeddings together
        embeddings = embeddings + row_embeddings + col_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings
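# Illustrative layout of `flattened_patches` (an assumption based on the indexing above, not
# library code): each sequence position holds [row_index, column_index, pixel values...], and
# all-zero rows act as padding (the processor emits 1-based indices). A 2x2 grid of 16x16
# RGB patches could be mocked as:
#
#     pixels = torch.rand(1, 4, 3 * 16 * 16)
#     rows = (torch.arange(4) // 2 + 1).reshape(1, 4, 1).float()
#     cols = (torch.arange(4) % 2 + 1).reshape(1, 4, 1).float()
#     flattened_patches = torch.cat([rows, cols, pixels], dim=-1)  # (1, 4, 770)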
class Pix2StructVisionAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_attention_heads
        self.dropout = config.attention_dropout
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.query = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.key = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.value = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.output = nn.Linear(self.inner_dim, self.hidden_size, bias=False)

        self.gradient_checkpointing = False
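    # Shape convention for the projections below (explanatory note, not library code):
    # (batch, seq, hidden_size) -> (batch, n_heads, seq, d_kv), with inner_dim = n_heads * d_kv.
    # For the base vision config (hidden_size=768, num_attention_heads=12, d_kv=64), a
    # (2, 2048, 768) input yields query/key/value states of shape (2, 12, 2048, 64).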
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        """
        Self-attention block
        """
        # Input is (batch_size, seq_length, dim)
        batch_size, seq_length = hidden_states.shape[:2]

        def to_projection_shape(states):
            """projection"""
            return states.contiguous().view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        # get query states
        # (batch_size, n_heads, seq_length, dim_per_head)
        query_states = to_projection_shape(self.query(hidden_states))

        # get key/value states
        key_states = to_projection_shape(self.key(hidden_states))
        value_states = to_projection_shape(self.value(hidden_states))

        # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            position_bias = torch.zeros(
                (1, self.n_heads, seq_length, seq_length), device=scores.device, dtype=scores.dtype
            )
            if self.gradient_checkpointing and self.training:
                position_bias.requires_grad = True

            if attention_mask is not None and attention_mask.dim() == 2:
                position_bias = position_bias + attention_mask[:, None, None, :].to(position_bias.device)
            elif attention_mask is not None:
                # (batch_size, n_heads, seq_length, key_length)
                position_bias = position_bias + attention_mask.to(position_bias.device)
            elif not is_torchdynamo_compiling():
                attention_mask = torch.ones((batch_size, seq_length), device=scores.device, dtype=scores.dtype)
                position_bias = position_bias + attention_mask.to(position_bias.device)
            position_bias = 1 - position_bias

        position_bias_masked = position_bias.masked_fill(position_bias == 1, torch.finfo(scores.dtype).min)
        scores += position_bias_masked
        scores = torch.max(scores, torch.tensor(torch.finfo(scores.dtype).min))

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)

        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        # (batch_size, seq_length, dim)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)

        attn_output = self.output(attn_output)

        outputs = (attn_output,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs
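# Note on the masking convention above (an explanatory sketch, not library code): the vision
# attention folds the 0/1 padding mask into `position_bias`. After `1 - mask`, padded positions
# hold 1 and are filled with the dtype minimum, so softmax gives them ~zero weight:
#
#     mask = torch.tensor([1.0, 1.0, 0.0])          # last patch is padding
#     bias = 1 - mask                               # -> [0., 0., 1.]
#     bias = bias.masked_fill(bias == 1, torch.finfo(torch.float32).min)
#     torch.softmax(torch.zeros(3) + bias, dim=-1)  # -> ~[0.5, 0.5, 0.0]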
class Pix2StructVisionMlp(nn.Module):
    def __init__(self, config: Pix2StructVisionConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # Cast the activations to the weight dtype when they differ (e.g. `self.wo` kept in
        # float32 for quantization), unless the weights are in int8.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


class Pix2StructVisionLayer(GradientCheckpointingLayer):
    def __init__(self, config: Pix2StructConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = Pix2StructVisionAttention(config)
        self.mlp = Pix2StructVisionMlp(config)
        self.pre_mlp_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pre_attention_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        residual = hidden_states

        # in Pix2StructVision, layernorm is applied before self-attention
        hidden_states = self.pre_attention_layer_norm(hidden_states)

        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + residual

        # in Pix2StructVision, layernorm is also applied before the MLP
        layer_output = self.pre_mlp_layer_norm(hidden_states)
        layer_output = self.mlp(layer_output) + hidden_states  # second residual connection

        outputs = (layer_output,) + outputs

        return outputs


class Pix2StructVisionEncoder(nn.Module):
    def __init__(self, config: Pix2StructConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([Pix2StructVisionLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class Pix2StructPreTrainedModel(PreTrainedModel):
    config: Pix2StructConfig

    _can_compile_fullgraph = False

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, Pix2StructLayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, Pix2StructTextDenseGatedActDense):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            d_ff = self.config.text_config.d_ff if isinstance(self.config, Pix2StructConfig) else self.config.d_ff

            module.wi_0.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * (d_ff**-0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, Pix2StructTextAttention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            key_value_proj_dim = (
                self.config.text_config.d_kv if isinstance(self.config, Pix2StructConfig) else self.config.d_kv
            )
            n_heads = (
                self.config.text_config.num_heads
                if isinstance(self.config, Pix2StructConfig)
                else self.config.num_heads
            )

            module.query.weight.data.normal_(mean=0.0, std=factor * ((hidden_size * key_value_proj_dim) ** -0.5))
            module.key.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            module.value.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            module.output.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
        elif isinstance(module, nn.Embedding):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            module.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, Pix2StructTextModel):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            module.lm_head.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to `fp32` and cast back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()

    # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._shift_right with T5->Pix2Struct
    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the "
                "pad_token_id. See Pix2Struct docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids
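# Worked example for `_shift_right` above (illustrative only): with decoder_start_token_id = 0
# and pad_token_id = 0, labels [[5, -100, 6, 7]] are shifted to [[0, 5, -100, 6]], and the
# -100 ignore-index entries are then replaced by the pad token, giving [[0, 5, 0, 6]].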
@auto_docstring
class Pix2StructVisionModel(Pix2StructPreTrainedModel):
    config: Pix2StructVisionConfig
    main_input_name = "flattened_patches"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Pix2StructVisionLayer"]

    def __init__(self, config: Pix2StructConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = Pix2StructVisionEmbeddings(config)
        self.encoder = Pix2StructVisionEncoder(config)

        self.layernorm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_projection

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        flattened_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if flattened_patches is None:
            raise ValueError("You have to specify flattened_patches")

        if attention_mask is None:
            # check where `flattened_patches` is not 0
            attention_mask = (flattened_patches.sum(dim=-1) != 0).float()

        # Prepare head mask if needed; 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(flattened_patches)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            head_outputs = (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Pix2StructTextDenseGatedActDense(nn.Module):
    def __init__(self, config: Pix2StructTextConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # Cast the activations to the weight dtype when they differ (e.g. `self.wo` kept in
        # float32 for quantization), unless the weights are in int8.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


class Pix2StructTextLayerFF(nn.Module):
    def __init__(self, config: Pix2StructTextConfig):
        super().__init__()
        self.DenseReluDense = Pix2StructTextDenseGatedActDense(config)
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    # Copied from transformers.models.t5.modeling_t5.T5LayerFF.forward
    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class Pix2StructTextAttention(nn.Module):
    def __init__(
        self, config: Pix2StructTextConfig, has_relative_attention_bias=False, layer_idx: Optional[int] = None
    ):
        super().__init__()
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.hidden_size = config.hidden_size
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.query = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.key = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.value = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.output = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    @staticmethod
    # Copied from transformers.models.t5.modeling_t5.T5Attention._relative_position_bucket
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same
        bucket. This should allow for more graceful generalization to longer sequences than the model has been
        trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None].to(device)
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=False,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        query_states = self.query(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None and isinstance(past_key_value, EncoderDecoderCache):
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache
        else:
            curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # re-use k, v, cross-attentions
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.key(current_states)
            value_states = self.value(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = key_states.shape[-2]
            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device, cache_position=cache_position
                )
                position_bias = position_bias[:, :, -seq_length:, :]

            if mask is not None:
                causal_mask = mask[:, :, :, : key_states.shape[-2]]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.output(attn_output)

        outputs = (attn_output, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class Pix2StructTextLayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = Pix2StructTextAttention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.attention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class Pix2StructTextLayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states=None,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.attention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class Pix2StructTextBlock(GradientCheckpointingLayer):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()

        self.self_attention = Pix2StructTextLayerSelfAttention(
            config,
            has_relative_attention_bias=has_relative_attention_bias,
            layer_idx=layer_idx,
        )

        self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention(
            config,
            layer_idx=layer_idx,
        )

        self.mlp = Pix2StructTextLayerFF(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        cache_position=None,
    ):
        self_attention_outputs = self.self_attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )

        hidden_states = self_attention_outputs[0]
        attention_outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = encoder_hidden_states is not None
        if do_cross_attention:
            cross_attention_outputs = self.encoder_decoder_attention(
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                query_length=cache_position[-1] + 1,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states = cross_attention_outputs[0]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[1:]

        # Apply Feed Forward layer
        hidden_states = self.mlp(hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        return outputs + attention_outputs


@auto_docstring(
    custom_intro="""
    )custom_introc            #         ^  \ rS rSr% \\S'   S/rS/rSrU 4S jr	S r
\              S"S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                      S\\   S\\   S\\   S\\   S\\R                     S\\   S\\R                     S\\\R                  S4   \4   4S jj5       r S#S	\\R                   S4   S\R                   S\R                   S\S\4
S jjr\S	\R                   S\S\S\R4                  S\R                   S\4S  j5       rS!rU =r$ )$r
  i  rO   r  zlm_head.weightTc                 P  > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ sH  n[        U[        US:H  5      US9PM     sn5      U l        [        UR
                  UR                  S9U l        [        R                   " UR"                  5      U l        [        R&                  " UR
                  UR                  SS9U l        U R+                  5         SU l        g s  snf )Nr   r  r   Fro   )r%   r&   r   rV   
vocab_sizer-   embed_tokensr   r   
num_layersr  r   r   r"   rR  final_layer_normrZ   r[   r\   rS   r  r+  r{   )r,   rO   r   r/   s      r0   r&   Pix2StructTextModel.__init__  s     LL):):F<N<NO]] v0011A $FQRSV`ab1

 !4F4F4FFLeLe fzz&"5"56yy!3!3V5F5FUS 	&+#s   ( D#c                     Xl         g rR   )r  r,   new_embeddingss     r0   set_input_embeddings(Pix2StructTextModel.set_input_embeddings  s    *r2   r   r   r  r  inputs_embedsr   cross_attn_head_maskpast_key_valuesr  r   r   labelsr   rz  rP   .c                 	   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nU R
                  (       a/  U R                  (       a  U	(       a  [        R                  S5        Sn	Ub  Ub  [        S5      eUb&  UR                  5       nUR                  SUS   5      nO"Ub  UR                  5       SS nO[        S5      eUc%  U R                  c   S5       eU R                  U5      nUu  nnU	(       aE  UcB  U R                   R                  (       a  [        [        5       [        5       5      nO
[        5       nSnUb  US   nOUb  UR!                  5       nUc#  ["        R$                  " UUU-   UR&                  S	9nUc8  Ub  UR!                  5       U-   OUn["        R(                  " UUUR&                  S	9nU R                   R*                  (       a7  U R-                  UUU[/        U[        5      (       a  UR0                  OUU
5      nOVUSS2SSSS24   nUR3                  UR4                  S
9nSU-
  ["        R6                  " UR4                  5      R8                  -  nUbL  UR                  5       u  nnnUU4nUc  ["        R(                  " UUR&                  S	9nU R;                  U5      nOSnU R=                  X`R                   R>                  5      nU R=                  XpR                   R>                  5      nU(       a  SOSnU
(       a  SOSnU
(       a  SOSnSnSnU RA                  U5      n [C        U RD                  5       Hi  u  n!n"UU!   n#UU!   n$U(       a  UU 4-   nU"" U UUUUUU#U$UU	U
US9n%U%S   n U%S   nUb  U%U
(       a  SOS   nU
(       d  MR  UU%S   4-   nUc  M`  UU%S   4-   nMk     U RG                  U 5      n U RA                  U 5      n U RI                  U 5      n&U(       a  UU 4-   nSn'Ub  UR3                  U&R&                  5      n[J        RL                  " SSS9n(U(" U&RO                  5       R                  SU&R                  S5      5      URO                  5       R                  S5      5      n'U(       d  [Q        S U'U&UUUU4 5       5      $ [S        U'U&UUUUS9$ )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if use_cache and past_key_values is None:
            if self.config.is_encoder_decoder:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
            else:
                past_key_values = DynamicCache()

        past_key_values_length = 0
        if cache_position is not None:
            past_key_values_length = cache_position[0]
        elif past_key_values is not None:
            past_key_values_length = past_key_values.get_seq_length()

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = (
                past_key_values.get_seq_length() + seq_length if past_key_values is not None else seq_length
            )
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.config.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache
                if isinstance(past_key_values, EncoderDecoderCache)
                else past_key_values,
                output_attentions,
            )
        else:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min

        if encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=causal_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            # We share the position biases between the layers - the first layer stores them
            position_bias = layer_outputs[1]
            if encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)
                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[4],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100, reduction="mean")
            loss = loss_fct(logits.contiguous().view(-1, logits.size(-1)), labels.contiguous().view(-1))

        if not return_dict:
            return tuple(
                v
                for v in [
                    loss,
                    logits,
                    past_key_values,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will
        # fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring(
    custom_intro="""
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    """
)
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin):
    config: Pix2StructConfig
    main_input_name = "flattened_patches"
    _tied_weights_keys = ["decoder.lm_head.weight"]

    def __init__(self, config: Pix2StructConfig):
        super().__init__(config)

        self.encoder = Pix2StructVisionModel(config.vision_config)
        self.decoder = Pix2StructTextModel(config.text_config)

        self.is_vqa = config.is_vqa

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.decoder.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.decoder.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.decoder.set_output_embeddings(new_embeddings)

    def get_decoder(self):
        return self.decoder

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
        flattened_patches: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Cache] = None,
        labels: Optional[torch.LongTensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
    Flattened pixel patches. The `hidden_size` is obtained by the following formula: `hidden_size` =
    `num_channels` * `patch_size` * `patch_size`.

    The process of flattening the pixel patches is done by `Pix2StructProcessor`.
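
    As a quick illustration (assumed values, not read from any particular config): with RGB input
    (`num_channels = 3`) and 16x16 patches, each flattened patch has `hidden_size` = 3 * 16 * 16 = 768.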
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
    Training](./t5#training).
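
    For intuition, a minimal sketch of that right-shift (illustrative only, not the internal helper itself;
    `pad_id = 0` is an assumed value):

    ```python
    >>> import torch
    >>> labels = torch.tensor([[42, 7, 1]])
    >>> pad_id = 0  # assumed pad/decoder-start id; read it from the model config in practice
    >>> decoder_input_ids = torch.cat([torch.full((1, 1), pad_id), labels[:, :-1]], dim=-1)
    >>> decoder_input_ids
    tensor([[ 0, 42,  7]])
    ```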
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
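
    For example (illustrative sizes; in practice read them from the model config), a mask that disables the
    first head of every decoder layer; the same layout works for both `decoder_head_mask` and
    `cross_attn_head_mask`:

    ```python
    >>> import torch
    >>> num_layers, num_heads = 12, 12  # hypothetical sizes
    >>> decoder_head_mask = torch.ones(num_layers, num_heads)
    >>> decoder_head_mask[:, 0] = 0.0  # 0 = masked, 1 = not masked
    ```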
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss for the decoder. Indices should be in
    `[-100, 0, ..., vocab_size - 1]`; tokens with label `-100` are ignored (masked) when the loss is computed.

Example:

Inference:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> # autoregressive generation
>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A stop sign is on a street corner.

>>> # conditional generation
>>> text = "A picture of"
>>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A picture of a stop sign with a red stop sign
```

Training:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "A stop sign is on the street corner."

>>> inputs = processor(images=image, return_tensors="pt")
>>> labels = processor(text=text, return_tensors="pt").input_ids

>>> # forward pass
>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> print(f"{loss.item():.5f}")
5.94282
```
        """
        use_cache = use_cache if use_cache is not None else self.config.text_config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                flattened_patches=flattened_patches,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)
            decoder_attention_mask = (
                decoder_attention_mask
                if decoder_attention_mask is not None
                else decoder_input_ids.ne(self.config.pad_token_id).float()
            )
            # Always attend to the first token
            decoder_attention_mask[:, 0] = 1

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqLMOutput(
            loss=decoder_outputs.loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


__all__ = [
    "Pix2StructPreTrainedModel",
    "Pix2StructForConditionalGeneration",
    "Pix2StructVisionModel",
    "Pix2StructTextModel",
]