"""PyTorch TrOCR decoder model (based on RoBERTa)."""

import math
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_trocr import TrOCRConfig


logger = logging.get_logger(__name__)


class TrOCRLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # TrOCR offsets the embedding ids by 2 (a fairseq/BART convention), so the
        # table is allocated with two extra rows.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0, position_ids: torch.Tensor = None):
        """`input_ids`' shape is expected to be [bsz x seqlen]."""
        if position_ids is None:
            bsz, seq_len = input_ids.shape[:2]
            position_ids = torch.arange(
                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
            ).expand(bsz, -1)
        else:
            position_ids = position_ids.unsqueeze(0)

        return super().forward(position_ids + self.offset)


class TrOCRScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embedding's forward by multiplying with the embedding scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


class TrOCRSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.weights = self.get_embedding(num_positions, embedding_dim, padding_idx)
        self.register_buffer("_float_tensor", torch.FloatTensor(1))

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last dimension for odd embedding sizes
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        bsz, seq_len = input_ids.size()
        # Create position ids from the input token ids; padded tokens keep padding_idx.
        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
            input_ids.device
        )

        # Expand the embedding table if the requested positions exceed it.
        max_pos = self.padding_idx + 1 + seq_len
        if self.weights is None or max_pos > self.weights.size(0):
            self.weights = self.get_embedding(max_pos, self.embedding_dim, self.padding_idx)
        self.weights = self.weights.to(self._float_tensor)

        x = self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()

        return x

    def create_position_ids_from_input_ids(
        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
    ):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        """
        # The series of casts and type-conversions here are carefully balanced to work with ONNX export and XLA.
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + padding_idx


class TrOCRAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper."""

    def __init__(
        self,
        config,
        embed_dim: int,
        num_heads: int,
        kdim: Optional[int] = None,
        vdim: Optional[int] = None,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_cross_attention: bool = False,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if not self.head_dim * num_heads == self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided, this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from the cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to the cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # flag that this layer's cross-attention cache is filled so it can be re-used in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped twice and re-used in the following computation
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class TrOCRDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: TrOCRConfig, layer_idx=None):
        super().__init__()
        self.embed_dim = config.hidden_size

        self.self_attn = TrOCRAttention(
            config,
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        if config.is_decoder:
            self.encoder_attn = TrOCRAttention(
                config,
                embed_dim=self.embed_dim,
                num_heads=config.decoder_attention_heads,
                kdim=config.cross_attention_hidden_size,
                vdim=config.cross_attention_hidden_size,
                dropout=config.attention_dropout,
                is_decoder=True,
                is_cross_attention=True,
                layer_idx=layer_idx,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size *(decoder_attention_heads,)*.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


@auto_docstring
class TrOCRPreTrainedModel(PreTrainedModel):
    config: TrOCRConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["TrOCRDecoderLayer"]

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class TrOCRDecoder(TrOCRPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`].

    Args:
        config: TrOCRConfig
    """

    def __init__(self, config: TrOCRConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = TrOCRScaledWordEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=embed_scale
        )

        if config.use_learned_position_embeddings:
            self.embed_positions = TrOCRLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
        else:
            self.embed_positions = TrOCRSinusoidalPositionalEmbedding(
                config.max_position_embeddings + self.padding_idx + 1,
                config.hidden_size,
                self.padding_idx,
            )

        if config.layernorm_embedding:
            self.layernorm_embedding = nn.LayerNorm(config.hidden_size)
        else:
            self.layernorm_embedding = None

        self.layers = nn.ModuleList([TrOCRDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ):
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
                on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`, and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_ids = input_ids.view(-1, input.shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, EncoderDecoderCache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Both the learned and the sinusoidal embedding take the same call signature.
        embed_pos = self.embed_positions(input, past_key_values_length=past_key_values_length)

        hidden_states = inputs_embeds + embed_pos

        if self.layernorm_embedding is not None:
            hidden_states = self.layernorm_embedding(hidden_states)

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        input_shape = input.shape
        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            )

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                ),
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
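

# Illustrative sketch (hypothetical toy sizes, not an upstream test): running the bare
# decoder with cross-attention over dummy encoder states, the way
# `VisionEncoderDecoderModel` feeds it image-patch features. `is_decoder=True` is
# required so that the layers instantiate their cross-attention blocks.
def _sketch_decoder_forward():
    config = TrOCRConfig(
        vocab_size=100, d_model=64, decoder_layers=2, decoder_attention_heads=4,
        decoder_ffn_dim=128, is_decoder=True,
    )
    decoder = TrOCRDecoder(config).eval()
    input_ids = torch.randint(0, 100, (1, 5))
    encoder_states = torch.randn(1, 7, 64)  # e.g. 7 patch embeddings from a ViT encoder
    with torch.no_grad():
        out = decoder(input_ids=input_ids, encoder_hidden_states=encoder_states)
    assert out.last_hidden_state.shape == (1, 5, 64)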


@auto_docstring(
    custom_intro="""
    The TrOCR Model with a language modeling head. Can be used for summarization.
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    """
)
class TrOCRDecoderWrapper(TrOCRPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.decoder = TrOCRDecoder(config)

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)


@auto_docstring(
    custom_intro="""
    The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and
    [`VisionEncoderDecoderModel`].
    """
)
class TrOCRForCausalLM(TrOCRPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = TrOCRDecoderWrapper(config)

        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import (
        ...     TrOCRConfig,
        ...     TrOCRProcessor,
        ...     TrOCRForCausalLM,
        ...     ViTConfig,
        ...     ViTModel,
        ...     VisionEncoderDecoderModel,
        ... )
        >>> import requests
        >>> from PIL import Image

        >>> # TrOCR is a decoder model and should be used within a VisionEncoderDecoderModel
        >>> # init vision2text model with random weights
        >>> encoder = ViTModel(ViTConfig())
        >>> decoder = TrOCRForCausalLM(TrOCRConfig())
        >>> model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

        >>> # If you want to start from the pretrained model, load the checkpoint with `VisionEncoderDecoderModel`
        >>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

        >>> # load image from the IAM dataset
        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
        >>> text = "industry, ' Mr. Brown commented icily. ' Let us have a"

        >>> # training
        >>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id
        >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
        >>> model.config.vocab_size = model.config.decoder.vocab_size

        >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
        >>> outputs = model(pixel_values, labels=labels)
        >>> loss = outputs.loss
        >>> round(loss.item(), 2)
        5.30

        >>> # inference
        >>> generated_ids = model.generate(pixel_values)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> generated_text
        'industry, " Mr. Brown commented icily. " Let us have a'
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        logits = self.output_projection(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


__all__ = ["TrOCRForCausalLM", "TrOCRPreTrainedModel"]
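

# Optional smoke test (hypothetical toy configuration; a sketch of the expected
# shapes only, guarded so importing this module stays side-effect free).
if __name__ == "__main__":
    cfg = TrOCRConfig(
        vocab_size=100, d_model=64, decoder_layers=2, decoder_attention_heads=4, decoder_ffn_dim=128
    )
    lm = TrOCRForCausalLM(cfg)
    ids = torch.randint(0, 100, (1, 6))
    out = lm(input_ids=ids, labels=ids)
    assert out.logits.shape == (1, 6, 100)  # (batch, sequence, vocab)
    print("smoke-test loss:", round(out.loss.item(), 2))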