
"""PyTorch UMT5 model."""

import copy
import math
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
)
from .configuration_umt5 import UMT5Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class UMT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/umt5/modeling_umt5.pyr)   UMT5LayerNorm.__init__>   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor+   float32powmeanrsqrtr.   r-   dtypefloat16bfloat16)r/   hidden_statesvariances      r3   forwardUMT5LayerNorm.forwardF   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r5   )r.   r-   )gư>)__name__
__module____qualname____firstlineno__r)   rD   __static_attributes____classcell__r2   s   @r3   r%   r%   =   s    $+ +r5   r%   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5DenseActDenseW   configc                 X  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l
        [        UR                     U l        g NFbias)r(   r)   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr
   dense_act_fnactr/   rP   r2   s     r3   r)   UMT5DenseActDense.__init__X   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r5   c                    U R                  U5      nU R                  U5      nU R                  U5      n[        U R                  R
                  [        R                  5      (       a  UR                  U R                  R
                  R                  :w  aa  U R                  R
                  R                  [        R                  :w  a/  UR                  U R                  R
                  R                  5      nU R	                  U5      nU$ N)rX   r^   r\   
isinstancerY   r-   r+   Tensorr?   int8r:   r/   rB   s     r3   rD   UMT5DenseActDense.forward_   s    ./]3tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r5   )r^   r\   rX   rY   	rF   rG   rH   rI   r!   r)   rD   rJ   rK   rL   s   @r3   rN   rN   W   s    /z / r5   rN   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5DenseGatedActDensen   rP   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g rR   )r(   r)   r   rU   rV   rW   wi_0wi_1rY   rZ   r[   r\   r
   r]   r^   r_   s     r3   r)   UMT5DenseGatedActDense.__init__o   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r5   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rb   )r^   rm   rn   r\   rc   rY   r-   r+   rd   r?   re   r:   )r/   rB   hidden_geluhidden_linears       r3   rD   UMT5DenseGatedActDense.forwardw   s    hhtyy78		-0#3]3 tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r5   )r^   r\   rm   rn   rY   rh   rL   s   @r3   rj   rj   n   s    /z / r5   rj   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5LayerFF   rP   c                   > [         TU ]  5         UR                  (       a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr1   )r(   r)   is_gated_actrj   DenseReluDenserN   r%   rV   layer_norm_epsilon
layer_normr   rZ   r[   r\   r_   s     r3   r)   UMT5LayerFF.__init__   s_    "8"@D"3F";D'F<U<UVzz&"5"56r5   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ rb   )r|   rz   r\   )r/   rB   forwarded_statess      r3   rD   UMT5LayerFF.forward   s;    ??=9../?@%5E(FFr5   )rz   r\   r|   rh   rL   s   @r3   ru   ru      s    7z 7 r5   ru   c                   T  ^  \ rS rSrSrSS\\   4U 4S jjjrS\R                  S\R                  4S jr
S rSS	 jr     SS
\R                  S\\R                     S\\\R                        S\\R                     S\\R                     S\\R                     4S jjrSrU =r$ )UMT5Attention   z/
T5's attention using relative_attention_bias.
	layer_idxc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  U R                  -  U l        X0l        Uc>  U R                  (       a-  [        R!                  SU R"                  R$                   S35        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        U R                  (       a0  [&        R2                  " U R                  U R                  5      U l        [7        5       U l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrS   )r(   r)   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerV   d_kvkey_value_proj_dim	num_headsn_headsr[   r\   	inner_dimr   loggerwarning_oncer2   rF   r   rU   qkvo	Embeddingrelative_attention_biassetpruned_heads)r/   rP   r   r   r2   s       r3   r)   UMT5Attention.__init__   se    +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(Er5   
projectionreturnc                     UR                  5       S S U R                  U R                  4-   nUR                  U5      R	                  SSSS5      nU$ )Nr8   r   r7   r    r	   )sizer   r   viewpermute)r/   r   new_projection_shapenew_projections       r3   _shapeUMT5Attention._shape   sQ    )0"5tG^G^8__#)=>FFq!QPQRr5   c                    SnU R                   nU R                  nU R                  (       dC  US-  nX!S:  R                  [        R
                  5      U-  -  n[        R                  " U5      nO,[        R                  " U[        R                  " U5      5      * nUS-  nX:  n[        R                  " UR                  5       U-  5      [        R                  " XE-  5      -  nXsU-
  -  nXWR                  [        R
                  5      -   n[        R                  " U[        R                  " XS-
  5      5      nU[        R                  " XaU5      -  nU$ )aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r7   r    )r   r   r   r:   r+   longabsmin
zeros_likelogfloatmath	full_likewhere)	r/   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larges	            r3   _relative_position_bucket'UMT5Attention._relative_position_bucket   s/   * 99;;AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 II/557)CDtxxP\PhGii	y!89	%.ejj1I%I"%*YY&8RbcTc(d&
" 	EKKE_``r5   c                    Uc   U R                   R                  R                  nUc,  [        R                  " U[        R
                  US9SS2S4   nO	USS2S4   n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  U5      nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )z%Compute binned relative position biasN)r?   device)r7   r   r    r   )	r   r-   r   r+   aranger   r   r   	unsqueeze)
r/   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r3   compute_biasUMT5Attention.compute_bias   s    >1188??F!$||L

SYZ[\^b[bc-ag6,,zFSTXZ[T[\+>#'#A#ABS#T --.FG	*44Q7r5   rB   encoder_hidden_statespast_key_valueattention_masklayer_head_maskr   c                 X   UR                   S S u  pxUS Ln	U R                  U5      n
U
R                  USU R                  U R                  5      R                  SS5      n
Ub[  [        U[        5      (       aF  UR                  R                  U R                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       aQ  UbN  W(       aG  UR                  U R                     R                  nUR                  U R                     R                  nOU R!                  U5      nU R#                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUbL  U	(       d  UOS nUR%                  XU R                  SU05      u  pU	(       a  SUR                  U R                  '   [&        R(                  " XR                  SS5      5      nUb  XR+                  5       -   OUnUR                   S   nU R,                  (       d9  [&        R.                  " SU R                  UU4UR0                  UR2                  S9nO.U R5                  UUUR0                  US	9nUS S 2S S 2U* S 2S S 24   nUb#  US S 2S S 2S S 2S UR                   S   24   nUU-   nU R6                  (       aS  [&        R8                  " UR                   S   5      nS
U[;        U R6                  5      '   US S 2UR=                  5       4   nOUnUU-  n[>        R@                  RC                  URE                  5       SS9RG                  U5      n[>        R@                  RI                  UU RH                  U RJ                  S9nUb  UU-  n[&        R(                  " UU5      nUR                  SS5      RM                  5       nUR                  XxS5      nU RO                  U5      nUU4$ )Nr7   r8   r    r   Tr	   )r   r?   )r   r   r   dim)ptraining)(shaper   r   r   r   	transposerc   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater+   matmulget_seq_lengthr   zerosr   r?   r   r   r,   listboolr   
functionalsoftmaxr   type_asr\   r   
contiguousr   )r/   rB   r   r   r   r   r   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesscoresreal_seq_lengthr   position_biascausal_maskmaskposition_bias_maskedattn_weightsattn_outputs                            r3   rD   UMT5Attention.forward  s    "/!4!4Ra!8
 3$>vvm,#((RtG^G^_iijkmno %*^EX*Y*Y'2266t~~FJ!&4&J&J#&4&I&I#"02D.-."<,33DNNCHHJ.55dnnELLL/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
 &@DN--dnn= l,@,@A,FG KYJd*'D'D'FFjt%%b)
//!KKDLL*j9&--W]WcWcM !--FMMR` . M *!Qa*?@M%(Aq2HJ4D4DR4H2H)HIK)K7M::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&zrBff[)L((r5   )rV   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )FN)NNNNNNN)rF   rG   rH   rI   __doc__r   intr)   r+   rd   r   r   r   tuplerD   rJ   rK   rL   s   @r3   r   r      s    "XVY] " ": %,, - ^$ 9=8<152615\)||\)  (5\) !u||!45	\)
 !.\) "%,,/\) !.\) \)r5   r   c                   L   ^  \ rS rSrSS\\   4U 4S jjjr    SS jrSrU =r	$ )UMT5LayerSelfAttentionic  r   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NTr   r   rx   )r(   r)   r   SelfAttentionr%   rV   r{   r|   r   rZ   r[   r\   r/   rP   r   r2   s      r3   r)   UMT5LayerSelfAttention.__init__d  sN    *6t_hi'F<U<UVzz&"5"56r5   c                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   nU$ )Nr   r   r   r   r   r    )r|   r   r\   )	r/   rB   r   r   r   r   normed_hidden_statesattention_outputoutputss	            r3   rD   UMT5LayerSelfAttention.forwardj  sk      $}=-- )+)) . 
 &5Ea5H(II "%5ab%99r5   )r   r\   r|   rb   )NNNN
rF   rG   rH   rI   r   r   r)   rD   rJ   rK   rL   s   @r3   r   r   c  s0    7(3- 7 7  r5   r   c                   N   ^  \ rS rSrSS\\   4U 4S jjjr     SS jrSrU =r	$ )UMT5LayerCrossAttentioni  r   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr   rx   )r(   r)   r   EncDecAttentionr%   rV   r{   r|   r   rZ   r[   r\   r   s      r3   r)    UMT5LayerCrossAttention.__init__  sO    ,VQVbkl'F<U<UVzz&"5"56r5   c           	          U R                  U5      nU R                  UUUUUUS9nXR                  US   5      -   n	U	4USS  -   n
U
$ )Nr   r   r   r   r   r   r    )r|   r  r\   )r/   rB   r   r   r   r   r   r   r   layer_outputr  s              r3   rD   UMT5LayerCrossAttention.forward  sm      $}=// "7)+)) 0 
 %||4DQ4G'HH/$4QR$88r5   )r  r\   r|   rb   r   r  rL   s   @r3   r  r    s3    7(3- 7 7 # r5   r  c                   V   ^  \ rS rSrSS\\   4U 4S jjjr         SS jrSrU =r	$ )	UMT5Blocki  r   c                 j  > [         TU ]  5         UR                  U l        [        R                  " 5       U l        U R
                  R                  [        XS95        U R                  (       a"  U R
                  R                  [        XS95        U R
                  R                  [        U5      5        g )Nr   )
r(   r)   r   r   
ModuleListlayerappendr   r  ru   r   s      r3   r)   UMT5Block.__init__  sv     ++]]_


0MN??JJ5fRS

+f-.r5   c           	         U R                   S   " UUUUU
S9u  pUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n[        R                  " [        R                  " U5      R                  5       US-
  U5      n[        R                  " X* US9nS nU R                  =(       a    US LnU(       a  U R                   S   " UUUUUU
S9u  pUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n[        R                  " [        R                  " U5      R                  5       US-
  U5      n[        R                  " X* US9nU R                   S   " U5      nUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n[        R                  " [        R                  " U5      R                  5       US-
  U5      n[        R                  " X* US9nU4nU	(       a  UX4-  nU$ )Nr   r   i  )r   maxr    r
  r8   )r  r?   r+   r@   finfor  r   isinfanyclampr   )r/   rB   r   r   encoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr  s                    r3   rD   UMT5Block.forward  s    ,0::a=)+)),
( %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KK<[YM "!__R1Fd1R04

1&;5 :--1-M ""emm3!KK(;(;<@@	#kk%++m*D*H*H*JIX\L\^gh %M|Q\ ] 

2}5 %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KK<[YM ")>>Gr5   )r   r  rb   )	NNNNNNFFNr  rL   s   @r3   r  r    s?    /(3- / / "##'; ;r5   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	UMT5ClassificationHeadi  z-Head for sentence-level classification tasks.rP   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  S9U l        [        R                  " UR                  UR                  5      U l
        g )N)r   )r(   r)   r   rU   rV   denserZ   classifier_dropoutr\   
num_labelsout_projr_   s     r3   r)   UMT5ClassificationHead.__init__  sZ    YYv~~v~~>
zzF$=$=>		&..&2C2CDr5   rB   r   c                     U R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ rb   )r\   r(  r+   tanhr+  rf   s     r3   rD   UMT5ClassificationHead.forward  sN    ]3

=1

=1]3m4r5   )r(  r\   r+  )rF   rG   rH   rI   r   r!   r)   r+   rd   rD   rJ   rK   rL   s   @r3   r&  r&    s4    7Ez EU\\ ell  r5   r&  c                   T    \ rS rSr% \\S'   SrSrSrS/r	S/r
\S 5       rS rS	 rS
rg)UMT5PreTrainedModeli  rP   transformerTr  rY   c                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r+   tensorr   r   )r/   r5  
input_maskdummy_inputss       r3   r9   UMT5PreTrainedModel.dummy_inputs  s6    LL.	\\*-
!*"&0

 r5   c                    U R                   R                  n[        U[        5      (       a)  UR                  R
                  R                  US-  5        g	[        U[        [        [        [        45      (       Ga  UR                  R                  R
                  R                  SUS-  S9  [        US5      (       aL  U R                   R                  (       d1  UR                  R                  R
                  R                  SUS-  S9  [        US5      (       av  UR                   R                  R
                  R                  SX R                   R"                  S-  -  S9  UR                   R$                  R
                  R'                  5         g	g	[        U[(        5      (       ar  [        US5      (       a`  UR*                  R                  R
                  R                  SUS-  S9  UR*                  R$                  R
                  R'                  5         g	g	[        U[,        5      (       GaQ  UR.                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR.                  S5      (       aE  UR.                  R$                  b.  UR.                  R$                  R
                  R'                  5         UR0                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR0                  S5      (       aG  UR0                  R$                  b/  UR0                  R$                  R
                  R'                  5         g	g	g	[        U[2        5      (       GaQ  UR4                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR4                  S5      (       aE  UR4                  R$                  b.  UR4                  R$                  R
                  R'                  5         UR6                  R                  R
                  R                  SX R                   R8                  S-  -  S9  [        UR6                  S5      (       aG  UR6                  R$                  b/  UR6                  R$                  R
                  R'                  5         g	g	g	[        U[:        5      (       Ga  UR<                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR<                  S5      (       aE  UR<                  R$                  b.  UR<                  R$                  R
                  R'                  5         UR>                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR>                  S5      (       aE  UR>                  R$                  b.  UR>                  R$                  R
                  R'                  5         UR6                  R                  R
                  R                  SX R                   R8                  S-  -  S9  [        UR6                  S5      (       aG  UR6                  R$                  b/  UR6                  R$                  R
                  R'                  5         g	g	g	[        U[@        5      (       GaZ  U R                   R"                  nU R                   RB                  nU R                   RD                  nURF                  R                  R
                  R                  SX#U-  S-  -  S9  URH                  R                  R
                  R                  SX#S-  -  S9  URJ                  R                  R
                  R                  SX#S-  -  S9  URL                  R                  R
                  R                  SX%U-  S-  -  S9  URN                  (       a4  URP                  R                  R
                  R                  SX#S-  -  S9  g	g	g	)
zInitialize the weights      ?        )r=   stdlm_head
qa_outputs      
classifierrT   N))rP   initializer_factorrc   r%   r-   datafill_	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringsharednormal_hasattrtie_word_embeddingsr?  r@  rV   rT   zero_UMT5ForTokenClassificationrB  r&  r(  r+  rN   rX   rY   rW   rj   rm   rn   r   r   r   r   r   r   r   r   r   )r/   modulefactorrV   r   r   s         r3   _init_weights!UMT5PreTrainedModel._init_weights  s   //fm,,MM$$Vc\2, (	
 
 MM  %%--3FSL-Ivy))$++2Q2Q%%**22#2Nv|,,!!((--553F{{ObObgkNkDl5m!!&&++113 -  :;;v|,,!!((--553FSL5Q!!&&++113 -  677LL$$,,#6kkFYFY^bEb;c,dv||V,,1B1B1N!!&&,,.OO""''//SfI\I\aeHe>f/gv//FOO4H4H4T$$))//1 5U/ 122 II!!))s;;CVCV[_B_8`)avyy&))fiinn.H		##))+II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I) 677KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I).. kk))G!%!1!1kk++GHHOO  ((cvL^B^cgAg7h(iHHOO  ((cv$7O(PHHOO  ((cv$7O(PHHOO  ((cvL^B^cgAg7h(i11..55::BBQWhl[lQmBn 2 /r5   c                    U R                   R                  nU R                   R                  nUc  [        S5      e[	        U5      (       aE  [
        R                  " UR                  S S S-   U5      n[
        R                  " XASS S24   /SS9nO=UR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US	:H  U5        U$ )
Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information.r8   )r    .r   r    ).r   z1self.model.config.pad_token_id has to be defined.)rP   decoder_start_token_idpad_token_id
ValueErrorr   r+   fullr   cat	new_zerosclonemasked_fill_)r/   r5  rV  rW  shifted_input_idss        r3   _shift_right UMT5PreTrainedModel._shift_rightP  s    !%!C!C{{//!)6  Y'' %

9??3B+?$+FH^ _ %		+<SbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r5    N)rF   rG   rH   rI   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr9  rR  r_  rJ   ra  r5   r3   r1  r1    sI    %&*#!$!F @oD!r5   r1  c                   (  ^  \ rS rSrSU 4S jjrS r             SS jr SS\\R                  S4   S\R                  S\R                  S	\
S
\4
S jjr\S\R                  S\S\S\R                  S\R                  S\4S j5       rSrU =r$ )	UMT5Stackil  c           
        > [         TU ]  U5        X l        UR                  U l        [        R
                  " [        UR                  5       Vs/ sH  n[        XS9PM     sn5      U l	        [        UR                  UR                  S9U l        [        R                  " UR                  5      U l        SU l        U R%                  5         g s  snf )Nr  rx   F)r(   r)   embed_tokensr   r   r  range
num_layersr  blockr%   rV   r{   final_layer_normrZ   r[   r\   gradient_checkpointing	post_init)r/   rP   rl  ir2   s       r3   r)   UMT5Stack.__init__m  s     ( ++]]ERXRcRcLd#eLdqIf$BLd#ef
 -fnn&B[B[ \zz&"5"56 ',# $fs   Cc                     Xl         g rb   )rl  r/   new_embeddingss     r3   set_input_embeddingsUMT5Stack.set_input_embeddingsy  s    *r5   c                 	   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb*  Ub'  U R
                  (       a  SOSn[        SU SU S35      eUb&  UR                  5       nUR                  SUS   5      nO>Ub  UR                  5       S S nO'U R
                  (       a  SOSn[        SU SU S	35      eU R                  (       a/  U R                  (       a  U	(       a  [        R                  S
5        Sn	Uc)  U R                  c  [        S5      eU R                  U5      nUu  nnU	SL a   U R
                  (       d  [        SU  S35      eU R
                  (       aM  U	(       aE  UcB  U R                   R                  (       a  [        [!        5       [!        5       5      nO[!        5       nOU R
                  (       d  S nUb  UR#                  5       OSnUc#  [$        R&                  " UUU-   UR(                  S9nUc4  [+        5       (       d%  UU-   n[$        R,                  " UUUR(                  S9nU R
                  (       a7  U R/                  UUU[1        U[        5      (       a  UR2                  OUU
5      nO\UbW  US S 2S S S S 24   nUR5                  UR6                  S9nSU-
  [$        R8                  " UR6                  5      R:                  -  nOS nU R
                  (       aO  UbL  UR                  5       u  nnnUU4nUc  [$        R,                  " UUR(                  S9nU R=                  U5      nOS nU R?                  X`R                   R@                  5      nU R?                  XpR                   R@                  5      nU(       a  SOS nU
(       a  SOS nU
(       a  U R
                  (       a  SOS nU RC                  U5      n[E        U RF                  5       H_  u  nnUU   n UU   n!U(       a  UU4-   nU" UUUUU U!UU	U
US9
n"U"S   nU
(       d  M:  UU"S   4-  nU R
                  (       d  MV  UU"S   4-  nMa     U RI                  U5      nU RC                  U5      nU(       a  UU4-   nU(       d  [K        S UUUUU4 5       5      $ [M        UUUUUS9$ )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer8   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderr   r   )r?   r<  ra  )r  r   r  r   r  r  r   r    r7   c              3   .   #    U H  nUc  M  Uv   M     g 7frb   ra  ).0r   s     r3   	<genexpr>$UMT5Stack.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_statepast_key_valuesrB   
attentionscross_attentions)'rP   r  r  output_hidden_statesuse_return_dictr   rX  r   r   rq  r   r   r   rl  is_encoder_decoderr   r   r   r+   r   r   r   r,   _update_causal_maskrc   r   r:   r?   r  r   invert_attention_maskget_head_maskrn  r\   	enumeratero  rp  r   r   )#r/   r5  r   r   r  r}  	head_maskcross_attn_head_maskr  r  r  r  return_dictr   err_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsrB   rs  layer_moduler   r  layer_outputss#                                      r3   rD   UMT5Stack.forward|  s     "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	   ( !_`` --i8M!,
J?? #LTFRg!hii ??_4;;11&9,.,.&YO&2nO #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D*D4zAO"ZZ
OML`L`aN??22o/BCC  44$!K '(D$)9:K%..}/B/B.CK,M<O<O0P0T0TTKK ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y++2H2HI	#112FH^H^_"6BD0d%64??rPT]3(4OA|'lO)=a)@&#$58H$H!(%'F /+E.#"3-M *!,M  =#3"55???(]1-=,??(3  56 --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r5   r   r"   input_tensorr   r  r  c           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r=  flex_attentionr   Fsdpa)r}  r  is_trainingr    r8   )sequence_lengthtarget_lengthr?   r   r   )cudaxpunpu)rP   _attn_implementationr  rc   r+   rd   r#   r   is_compileabler   _ignore_causal_mask_sdpar   r?   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer  r   _unmask_unattended)r/   r   r  r   r  r  past_seen_tokensusing_compilable_cacher?   r  r  r   	min_dtypes                r3   r  UMT5Stack._update_causal_mask&  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr5   r  r  r?   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuer?   r   r    )diagonalr~  r8   r   )r   r+   r  r   rY  r   triur   reshapeexpandr\  r   r:   masked_fill)r   r  r  r?   r   r   kwargsr   r  mask_lengthpadding_masks              r3   r  ?UMT5Stack._prepare_4d_causal_attention_mask_with_cache_positionj  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r5   )ro  r\   rl  rp  rq  r   rb   )NNNNNNNNNNNNN)F)rF   rG   rH   rI   r)   rx  rD   r   r+   rd   r   r   r  staticmethodr   r?   r  rJ   rK   rL   s   @r3   rj  rj  l  s    
+
 "#!!g
` #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r5   rj  c            &       P  ^  \ rS rSr% SrSr\\S'   SS/rU 4S jr	S r
S	 rS
 rS rS rS r\                S!S\\R&                     S\\R(                     S\\R&                     S\\R*                     S\\R(                     S\\R(                     S\\R,                     S\\\\R(                           S\\   S\\R,                     S\\R,                     S\\   S\\   S\\   S\\   S\\R&                     S\\\R(                     \4   4"S jj5       rS rU =r$ )"rF  i  a?  
Examples:

```python
>>> from transformers import UMT5Model, AutoTokenizer

>>> model = UMT5Model.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
>>> label = "<extra_id_0> verhandelt"
>>> inputs = tokenizer(inputs, return_tensors="pt")
>>> labels = tokenizer(label=label, return_tensors="pt")

>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state
```umt5rP   encoder.embed_tokens.weightdecoder.embed_tokens.weightc                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        SUl        [        X R                  5      U l        [        R                  " U5      nSUl	        SUl        UR                  Ul        [        X0R                  5      U l        U R#                  5         g NFT)r(   r)   r   r   
vocab_sizerV   rJ  copydeepcopyr   r  tie_encoder_decoderrj  encodernum_decoder_layersrn  decoderrr  r/   rP   encoder_configdecoder_configr2   s       r3   r)   UMT5Model.__init__  s     ll6#4#4fnnEv.$)!#( -2* =v.$(!-2*$*$=$=! = 	r5   c                     U R                   $ rb   rJ  r/   s    r3   get_input_embeddingsUMT5Model.get_input_embeddings      {{r5   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g rb   rJ  r  rx  r  rv  s     r3   rx  UMT5Model.set_input_embeddings  +    $)).9)).9r5   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g rb   rP   rM  _tie_or_clone_weightsr  rl  rJ  r  r  s    r3   _tie_weightsUMT5Model._tie_weights  P    ;;**&&t||'@'@$++N&&t||'@'@$++N +r5   c                     U R                   $ rb   r  r  s    r3   get_encoderUMT5Model.get_encoder      ||r5   c                     U R                   $ rb   r  r  s    r3   get_decoderUMT5Model.get_decoder  r  r5   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r  	attentionprune_headsr/   heads_to_pruner  headss       r3   _prune_headsUMT5Model._prune_heads  s<    
 +002LELLu%//;;EB 3r5   r5  r   r4  r6  r  decoder_head_maskr  encoder_outputsr  r}  decoder_inputs_embedsr  r  r  r  r   r   c                 \   Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUU
UUUUS9nORU(       aK  [	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUU	UUUUUUUUUS9nU(       d  UU-   $ [        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, UMT5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5Model.from_pretrained("google/umt5-small")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
>>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
>>> decoder_input_ids = model._shift_right(decoder_input_ids)

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    UMT5 Model with a `language modeling` head on top.
    """
)
class UMT5ForConditionalGeneration(UMT5PreTrainedModel, GenerationMixin):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the sequence-to-sequence language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked); the loss is only computed for
    labels in `[0, ..., config.vocab_size - 1]`.

Examples:

```python
>>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

>>> # training
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
>>> outputs = model(input_ids=input_ids, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits

>>> # inference
>>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
>>> outputs = model.generate(input_ids)
>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
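
>>> # The lines below are an illustrative addition, not part of the original example: when
>>> # fine-tuning on padded batches, label positions that correspond to padding are usually
>>> # set to -100 so the cross-entropy loss ignores them (`text_target` is a standard
>>> # tokenizer argument that produces the `labels` field).
>>> batch = tokenizer(
...     ["The <extra_id_0> walks in <extra_id_1> park"],
...     text_target=["<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"],
...     padding=True,
...     return_tensors="pt",
... )
>>> labels = batch["labels"]
>>> labels[labels == tokenizer.pad_token_id] = -100
>>> loss = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=labels).loss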
```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs by shifting the labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale the output before projecting on the vocabulary when embeddings are tied
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to the same device as the logits to enable model parallelism
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)


@auto_docstring
class UMT5EncoderModel(UMT5PreTrainedModel):
    r"""
Examples:

```python
>>> from transformers import UMT5EncoderModel, AutoTokenizer

>>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> input_ids = tokenizer(article, return_tensors="pt").input_ids
>>> outputs = model(input_ids)
>>> hidden_state = outputs.last_hidden_state
```"""

    _tied_weights_keys = ["encoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
        base class PreTrainedModel.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    To know more on how to prepare `input_ids` for pretraining, take a look at [UMT5 Training](./umt5#training).

Example:

```python
>>> from transformers import AutoTokenizer, UMT5EncoderModel

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
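
>>> # Illustrative addition (not part of the original example): a mask-aware mean pool over the
>>> # sequence dimension is a common way to reduce the encoder states to one vector per input.
>>> enc = tokenizer("Studies have shown that owning a dog is good for you", return_tensors="pt")
>>> out = model(**enc)
>>> mask = enc["attention_mask"].unsqueeze(-1).to(out.last_hidden_state.dtype)
>>> sentence_embedding = (out.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)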
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $         ^  \ rS rSrS/rSS/rS\4U 4S jjr\               SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\4   4 S jj5       rSrU =r$ )UMT5ForSequenceClassificationi  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightr  r  rP   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         SU l        g r+  )r(   r)   rF  r2  r&  classification_headrr  model_parallelr_   s     r3   r)   &UMT5ForSequenceClassification.__init__  s>     $V,#9&#A  	#r5   r5  r   r4  r6  r  r  r  r  r}  r  r  r  r  r  r  r   c                 4   Ub  UOU R                   R                  nUb  SnUc%  U	b"  [        SU R                  R                   35      eUc"  U
c  Uc  [        S5      eU R                  U5      nU R                  UUUUUUUUU	U
UUUUS9nUS   nUR                  U R                   R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      5      S:  a  [        S5      eUR                   u  nnnUUSS24   R#                  US	U5      SS2S	SS24   nU R%                  U5      nSnUGb  UR                  UR                  5      nU R                   R&                  c  U R                   R(                  S:X  a  S
U R                   l        OyU R                   R(                  S:  aN  UR*                  [        R,                  :X  d  UR*                  [        R.                  :X  a  SU R                   l        OSU R                   l        U R                   R&                  S
:X  aT  [1        5       nU R                   R(                  S:X  a&  U" UR3                  5       UR3                  5       5      nOU" UU5      nOU R                   R&                  S:X  aG  [5        5       nU" UR#                  S	U R                   R(                  5      UR#                  S	5      5      nO-U R                   R&                  S:X  a  [7        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [9        UUUR:                  UR<                  UR>                  UR@                  URB                  URD                  URF                  S9	$ )ak	  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining, take a look at [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
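
Example (illustrative sketch, not part of the original docstring; the base `google/umt5-small`
checkpoint is used only to show the call pattern, and the classification head is newly initialized
until fine-tuned, so the predicted class is meaningless before training):

```python
>>> import torch
>>> from transformers import AutoTokenizer, UMT5ForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5ForSequenceClassification.from_pretrained("google/umt5-small", num_labels=2)

>>> inputs = tokenizer("This movie was surprisingly good.", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_class_id = logits.argmax(dim=-1).item()
```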
NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   r4  r6  r  r  r  r  r}  r  r  r  r  r  r   r    z7All examples must have the same number of <eos> tokens.r8   
regressionsingle_label_classificationmulti_label_classificationr  )$rP   r  NotImplementedErrorr2   rF   rX  r_  r2  eqeos_token_idr:   r   r  r+   unique_consecutivesumr   r   r=  problem_typer*  r?   r   r   r   squeezer   r   r   r  r  r  r  r   r   r  )r/   r5  r   r4  r6  r  r  r  r  r}  r  r  r  r  r  r  r  r!  eos_maskr   r  r0   sentence_representationr   r  r#  r$  s                              r3   rD   %UMT5ForSequenceClassification.forward  sR   | &1%<k$++B]B]I!:%J4>>KbKbJcd  $)>)F  U 
 !% 1 1) <"")/#9/!5+'"7/!5# # 
  "!*<< 8 89<<_=S=STu''Q89A=VWW%4%:%:"
A{"1(A+">"C"CJPRT_"`abdfhiai"j))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r5   )r=  r>  r2  )NNNNNNNNNNNNNNN)rF   rG   rH   rI   "_keys_to_ignore_on_load_unexpectedr  r!   r)   r   r   r+   r  rd   r   r	  r   r   r   r   rD   rJ   rK   rL   s   @r3   r:  r:    s    +s)s&79VW$z $  15158<=A,0487;=A59=A-1$(,0/3&*!P
E,,-P
 !.P
 $E$4$45	P

 !))9)9 :P
 ELL)P
 $ELL1P
 'u||4P
 "$u'8'8"9:P
   1 12P
  ((9(9:P
 ))*P
 D>P
 $D>P
 'tnP
  d^!P
" 
u55	6#P
 P
r5   r:  c                   @  ^  \ rS rSrS/rS/rS\4U 4S jjr\        SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSrU =r$ )rO  i{  r;  z'transformer.encoder.embed_tokens.weightrP   c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rb   )r(   r)   r*  rH  r2  r   rZ   r)  r\   rU   r0   rB  rr  r_   s     r3   r)   #UMT5ForTokenClassification.__init__  sj      +++F3zz&";";<))F$6$68I8IJ 	r5   r5  r   r  r}  r  r  r  r  r   c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n	U	S   n
U R                  U
5      n
U R	                  U
5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  XSS 4nUb  U4U-   $ U$ [        UUU	R                  U	R                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining, take a look at [UMT5 Training](./umt5#training).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
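
Example (illustrative sketch, not part of the original docstring; the token classification head on
top of the base `google/umt5-small` checkpoint is newly initialized, so the predictions are only
meaningful after fine-tuning):

```python
>>> import torch
>>> from transformers import AutoTokenizer, UMT5ForTokenClassification

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5ForTokenClassification.from_pretrained("google/umt5-small", num_labels=5)

>>> inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_token_classes = logits.argmax(dim=-1)
```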
N)r   r  r}  r  r  r  r   r8   r7   )r  r   rB   r  )rP   r  r2  r\   rB  r   r   r*  r   rB   r  )r/   r5  r   r  r}  r  r  r  r  r  rB   r   r  r#  r$  s                  r3   rD   "UMT5ForTokenClassification.forward  s    6 &1%<k$++B]B]"")'/!5# # 
  
]3/')HFKKDOO<fkk"oNDam,F)-)9TGf$EvE$!//))	
 	
r5   )rB  r\   r*  r2  )NNNNNNNN)rF   rG   rH   rI   rO  r  r!   r)   r   r   r+   rd   r   r   r   r   rD   rJ   rK   rL   s   @r3   rO  rO  {  s    *r)s&CD	z 	  -115,004)-,0/3&*7
ELL)7
 !.7
 ELL)	7

  -7
 &7
 $D>7
 'tn7
 d^7
 
uU\\"$99	:7
 7
r5   rO  c            &       J  ^  \ rS rSrSS/rU 4S jrS rS rS rS r	S	 r
\                SS
\\R                     S\\R                     S\\R                     S\\R                      S\\R                     S\\R                     S\\R"                     S\\\\R"                           S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\R                     \4   4"S jj5       rSrU =r$ )rI  i  r  r  c                 p  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        SUl        [        X R                  5      U l        [        R                  " U5      nSUl
        SUl        UR                  Ul        [        X0R                  5      U l        UR$                  U l        [        R&                  " UR                  UR$                  5      U l        U R+                  5         g r  )r(   r)   rV   r  r   r   r  rJ  r  r  r   r  r  rj  r  r  rn  r  r*  rU   r@  rr  r  s       r3   r)   !UMT5ForQuestionAnswering.__init__  s     ll6#4#4fnnEv.$)!#( -2* =v.$(!-2*$*$=$=! = ++))FNNF4E4EF 	r5   c                     U R                   $ rb   r  r  s    r3   r  -UMT5ForQuestionAnswering.get_input_embeddings  r  r5   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g rb   r  rv  s     r3   rx  -UMT5ForQuestionAnswering.set_input_embeddings  r  r5   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g rb   r  r  s    r3   r  %UMT5ForQuestionAnswering._tie_weights  r  r5   c                     U R                   $ rb   r  r  s    r3   r  $UMT5ForQuestionAnswering.get_encoder  r  r5   c                     U R                   $ rb   r  r  s    r3   r  $UMT5ForQuestionAnswering.get_decoder  r  r5   r5  r   r4  r6  r  r  r  r  start_positionsend_positionsr}  r  r  r  r  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U
b  SnUc"  Uc  Uc  [        S5      eU R	                  U5      nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUSUUUUUUUUS	9nUS   nU R                  U5      nUR                  SS
S9u  nnUR                  S
5      R                  5       nUR                  S
5      R                  5       nSnU	b  U
b  [        U	R                  5       5      S:  a*  U	R                  S
5      R                  UR                   5      n	[        U
R                  5       5      S:  a*  U
R                  S
5      R                  UR                   5      n
UR                  S5      nU	R#                  SU5      n	U
R#                  SU5      n
[%        US9nU" UU	5      nU" UU
5      nUU-   S-  nU(       d  UU4USS -   U-   nUb  U4U-   $ U$ ['        UUUUR(                  UR*                  UR,                  UR.                  UR0                  UR*                  UR,                  S9
$ )aY  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining, take a look at [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
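
Example (illustrative sketch, not part of the original docstring; extractive question answering
needs a fine-tuned checkpoint, so the base `google/umt5-small` model below only demonstrates how
the start/end logits are turned into an answer span):

```python
>>> import torch
>>> from transformers import AutoTokenizer, UMT5ForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5ForQuestionAnswering.from_pretrained("google/umt5-small")

>>> question, context = "Who wrote the report?", "The report was written by a UN officer."
>>> inputs = tokenizer(question, context, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)
>>> start = outputs.start_logits.argmax(dim=-1).item()
>>> end = outputs.end_logits.argmax(dim=-1).item()
>>> answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
```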
NFrA  r  r   r    r7   r  )r5  r   r}  r  r   r  r  r  r  r  r  r  r8   r   r  )
r  start_logits
end_logitsr  r  r  r  r   r   r  )rP   r  r  rX  r_  r  rc   r   r  r  r@  splitrK  r   r   r:   r   r  r   r   r  rB   r  r  r  )r/   r5  r   r4  r6  r  r  r  r  rb  rc  r}  r  r  r  r  r  rB   r  r!  r   re  rf  
total_lossignored_indexr#  
start_lossend_lossr$  s                                r3   rD    UMT5ForQuestionAnswering.forward  s<   x &1%<k$++B]B]!*!6IDKK<Q<Q	&=+DI
 $)>)F  U 
 !% 1 1) <!*!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/ "/#1'!5/!5# ' 
 *!,1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J//!"2EEWF/9/EZMF*Q6Q2%!+;;"1"?"?.99,==&5&G&G"1"?"?.99
 	
r5   )r  r  r  r*  r@  rJ  r  )rF   rG   rH   rI   r  r)   r  rx  r  r  r  r   r   r+   r  r	  r
  rd   r   r   r   r   rD   rJ   rK   rL   s   @r3   rI  rI    s   79VW2:O  156:8<=A159=7;@D6:4859=A$(,0/3&*#Z
E,,-Z
 !!2!23Z
 $E$4$45	Z

 !))9)9 :Z
 E--.Z
 $E$5$56Z
 'u||4Z
 "%ell(;"<=Z
 "%"2"23Z
   0 01Z
   1 12Z
  ((9(9:Z
 D>Z
 $D>Z
  'tn!Z
" d^#Z
$ 
uU&&')LL	M%Z
 Z
r5   rI  )rH  rG  rI  r:  rO  rF  r1  )Fr   r  r   typingr   r   r+   r   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   configuration_umt5r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerrF   r   Moduler%   rN   rj   ru   r   r   r  r  r&  r1  rj  rF  rG  rH  r:  rO  rI  __all__ra  r5   r3   <module>r}     s      "   A A ! C C ) > 9   .   +  !!;J			H	%+BII +4		 .RYY <")) $B)BII B)JRYY 8bii <F* FTRYY $ o!/ o! o!dt# tn	 O
# O
 O
d 
g)#6 g)
g)T i* i iX `
$7 `
`
F I
!4 I
 I
X N
2 N
 N
br5   