
    <h\                       S r SSKrSSKrSSKrSSKJrJrJr  SSKrSSKJ	r	  SSK
Jr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)  SSK*J+r+  \&" 5       (       a  SSK,J-r-  SSK.J/r/  \)R`                  " \15      r2STS\Rf                  S\4S\4S\4S\Rf                  4
S jjr5S\Rf                  S\4S\4S\Rf                  4S jr6STS\Rf                  S\4S\4S\4S\Rf                  4
S jjr7S\4S\Rf                  4S jr8S\Rf                  S\4S\Rf                  4S  jr9S!\Rf                  S\4S"\Rt                  S\Rf                  4S# jr;S!\Rf                  S$\4S\<\Rf                  \Rf                  4   4S% jr=S!\Rf                  S$\4S\Rf                  4S& jr>S'\Rf                  S(\Rf                  S)\4S\Rf                  4S* jr? " S+ S,\	R                  5      rA SS-KBJCrC  \CrA\2R                  S.5         " S0 S1\	R                  5      rH " S2 S3\	R                  5      rI " S4 S5\	R                  5      rJ " S6 S7\	R                  5      rK " S8 S9\	R                  5      rL " S: S;\	R                  5      rM " S< S=\	R                  5      rN " S> S?\	R                  5      rO " S@ SA\	R                  5      rP " SB SC\	R                  5      rQ " SD SE\5      rR\% " SF SG\5      5       rS " SH SI\S5      rTSJrU\% " SK SL\S5      5       rV\%" SMSN9 " SO SP\S\5      5       rW\% " SQ SR\S5      5       rX/ SSQrYg! \E a     GN=\F a    \2R                  S/5         GNVf = f)UzPyTorch LongT5 model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )LongT5Config)	BlockMask)make_flex_block_causal_maskx	block_lendim	pad_valuereturnc                 l   U R                   U   * U-  n[        U R                   5      (       d?  [        U R                   5      nXR==   U-  ss'   [        R                  " XPR
                  S9$ S/U R                  -  nSU4Xb'   [        USSS2   S5      n[        R                  R                  XSUS9n U $ )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr)   ndimsumr   
functionalr/   )r"   r#   r$   r%   pad_len	new_shaper/   s          b/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multipler=   ?   s    wws|mi'Gqww<<M	'!{{9GG44(QVV
C7|CH
c$B$i
C
!:YGAH    c                 4   U R                   U   U-  S:w  a  [        XUSS9n U R                   U   U-  nU R                   SU X14-   U R                   US-   S -   nSU;   a)  [        R                  " X@R                  U R
                  S9$ U R                  U5      $ )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
r   )r%   Nr   r)   device)r2   r=   r5   emptyr)   rA   reshape)r"   r#   r$   
num_blocksoutput_shapes        r<   _split_into_blocksrF   O   s    
 	wws|i1$Q3!<*J774C=J#::QWWcAg[=QQLL{{<wwqxxHH99\""r>   	block_dimsequence_dimc                    U R                   U   nS/U R                  -  nSXQ'   [        USSS2   S5      n[        R                  R                  XSUS9n / n[        S5       HK  n[        S	S5      /U R                  -  n[        XwU-   5      X'   [        U5      nUR                  X   5        MM     [        R                  " XbS
9$ )zConcatenate three consecutive blocks for each input block for local attentiont.

For more information, see: https://huggingface.co/papers/2112.07916.
r*   )r   r   Nr+   r,   r-   r.   r   r   r$   )r2   r7   r8   r   r9   r/   rangeslicetupleappendr5   cat)	r"   rG   rH   r%   rD   r/   blocks_listiindicess	            r<   _concatenate_3_blocksrS   ^   s    
 #J(QVV
CCN
c$B$i
C
!:YGA&(K1X D>"QVV+"1*n5.1:&  99[33r>   c                     [         R                  " SU -  [         R                  S9nXU *  nUR                  S5      UR                  S5      -
  nU$ )z:Makes 3-blocked relative position ids for local attention.r   r(   r   r   )r5   arangeint32	unsqueeze)r#   position_idscenter_position_idsrelative_position_idss       r<   "_make_3block_relative_position_idsr[   w   sP    <<IU[[AL&)<(22158K8U8UVW8XX  r>   local_attention_maskc                     [        U5      n[        R                  " U5      U:  nUSSSS2SS24   nUR                  U R                  5      n[        R
                  " X5      $ )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)r[   r5   abstorA   logical_and)r\   r#   rZ   locality_masks       r<   _mask_local_attention_maskrb      s]    >yIII34y@M!$a"23M!$$%9%@%@AM1AAr>   attention_maskrA   c                     [        XSS9n[        USSS9nUR                  S5      nUR                  S5      n[        R                  " X45      n[        XQ5      nUR                  S5      R                  U5      $ )z;Prepare attention mask to be applied for a local attention.r   rJ      rG   rH   r+   )rF   rS   rW   r5   r`   rb   r_   )rc   r#   rA   _blocked_attention_mask_3blocked_attention_maskr\   s         r<   _get_local_attention_maskrj      s     1PQR45LXYhij5??C7AA"E ,,-D_56JV))!,//77r>   global_block_sizec                   ^^ U R                   SS u  nmS[        R                  S[        R                  4UU4S jjn[        R                  " X R                  S9T-  n[        R
                  " USS9U-
  n[        R                  " U S	:g  S
S5      R                  U R                  5      n[        R                  " XT-   S
-
  5      R                  U R                  5      n[        R                  " SUR                  UR                  S9n[        R                  " Xg:  Xg5      nX`-  U S-
  -   nU" U5      nTT-  nUS:  a@  [        R                  " USS9R                  R                  US5      R                  SS5      n	O+[        R                  " USUR                  UR                  S9n	[        R
                  " [        R                   " X(5      SS9S-
  n
U
R#                  U R                  5      n
[        R                  " X:*  SS5      n
UR                  [        R$                  5      U
R                  [        R$                  5      4$ )a  Obtain the "fixed block" global id corresponding to each input token.

This implementation is a simplified version of the original Flaxformr implementation adopted from:
https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
the whole fixed block, are assigned to the preceding block.

Padding tokens from the original sequence are represented by -1.
Nre   	block_idsr&   c                 X  > [         R                  " T5      T-  TS-
  :H  nUR                  U R                  5      n[         R                  " XS:  5      nUR                  S5      R                  S5      R                  U R                  5      S-
  n[         R                  " X:  X5      n U $ )Nr   r   r+   )
r5   rU   r_   rA   r`   r8   rW   typer)   where)rm   
block_endstrue_block_endsfull_blocksrk   seq_lens       r<   handle_orphan_tokens:_make_global_fixed_block_ids.<locals>.handle_orphan_tokens   s    ll7+.??DUXYDYY
]]9#3#34
++JQG%))"-77;@@QTUUKK	 7P	r>   rA   r   )axis              ?g     @r+   r@   r   rJ   )r2   r5   Tensor	ones_likerA   cumsumrp   ro   r)   floortensormaxvaluesrepeat	transposer6   onesr_   int)rc   rk   
batch_sizeru   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsrt   s    `         @r<   _make_global_fixed_block_idsr      s    )..r2J    ~>S>STWhh||$41=@PP;;~,c7;@@AUAUVD{{4#:S#@AFF~G[G[\$)LL;K;Q;QZjZqZq$r!{{8:J )9nq>PQ+,<=..KQ"')),<""E"L"L"S"ST_ab"c"m"mnoqr"s"'++!1!7!7@P@W@W#
 ejj&IrRUVV+..~/D/DE%7%RTUWXY  +-?-D-DUYY-OOOr>   c                     [        X5      u  p#UR                  S   n[        R                  " XBR                  S9nXRS   -
  nUR                  [        R                  5      $ )zBCreate the relative position tensor for local -> global attention.r+   rw   .N)r   r2   r5   rU   rA   ro   int64)rc   rk   rm   r   global_seq_lenglobal_positionsside_relative_positions          r<    _make_side_relative_position_idsr      sW    $@$c!I'--b1N||N;K;KL-)0DD!&&u{{33r>   hidden_statesrm   r   c           	      r   UR                  US:  [        R                  " X!R                  UR                  S95      n[
        R                  R                  UR                  [        R                  5      US-   5      SS2SS2SS24   n[        R                  " SXR                  U R                  5      5      $ )zFCompute individual block aggregates by summing over individual blocks.r   r@   r   Nr+   z...nd,...ng->...gd)rp   r5   r   r)   rA   r   r9   one_hotro   r   einsum)r   rm   r   one_hot_block_idss       r<   _create_global_aggregatesr      s    
 Q^??S\ScScdI --innU[[.I>\]K]^_`bcehfheh_hi<<,m=S=STaTgTg=hiir>   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )LongT5LayerNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zW
Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
N)super__init__r   	Parameterr5   r   weightvariance_epsilon)selfhidden_sizeeps	__class__s      r<   r   LongT5LayerNorm.__init__   s/     	ll5::k#:; #r>   c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )Nre   r+   T)keepdim)r_   r5   float32powmeanrsqrtr   r   r)   float16bfloat16)r   r   variances      r<   forwardLongT5LayerNorm.forward   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r>   )r   r   )gư>)__name__
__module____qualname____firstlineno__r   r   __static_attributes____classcell__r   s   @r<   r   r      s    $+ +r>   r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5DenseActDensei  configc                 X  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l
        [        UR                     U l        g NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr   r   r   s     r<   r   LongT5DenseActDense.__init__	  sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r>   c                    U R                  U5      nU R                  U5      nU R                  U5      n[        U R                  R
                  [        R                  5      (       a  UR                  U R                  R
                  R                  :w  aa  U R                  R
                  R                  [        R                  :w  a/  UR                  U R                  R
                  R                  5      nU R	                  U5      nU$ N)r   r   r   
isinstancer   r   r5   r{   r)   int8r_   )r   r   s     r<   r   LongT5DenseActDense.forward  s    ./]3tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r>   )r   r   r   r   	r   r   r   r   r   r   r   r   r   r   s   @r<   r   r     s    /| / r>   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5DenseGatedActDensei  r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r	   r   r   r   s     r<   r   !LongT5DenseGatedActDense.__init__  s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r>   c                     U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      nU R	                  U5      nU$ r   )r   r   r   r   r   )r   r   hidden_geluhidden_linears       r<   r    LongT5DenseGatedActDense.forward'  sQ    hhtyy78		-0#3]3.r>   )r   r   r   r   r   r   r   s   @r<   r   r     s    /| / r>   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5LayerFFi1  r   c                   > [         TU ]  5         UR                  (       a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   s     r<   r   LongT5LayerFF.__init__2  s_    ":6"BD"5f"=D)&..f>W>WXzz&"5"56r>   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ r   )r   r   r   )r   r   forwarded_statess      r<   r   LongT5LayerFF.forward<  s;    ??=9../?@%5E(FFr>   )r   r   r   r   r   s   @r<   r   r   1  s    7| 7 r>   r   c                      ^  \ rS rSr  S
S\S\\   4U 4S jjjrS r\	SS j5       r
SS jr         SS jrS	rU =r$ )LongT5AttentioniD  r   	layer_idxc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  U R                  -  U l        X0l        Uc>  U R                  (       a-  [        R!                  SU R"                  R$                   S35        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        U R                  (       a0  [&        R2                  " U R                  U R                  5      U l        [7        5       U l        SU l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   s       r<   r   LongT5Attention.__init__E  so    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r>   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g Nr   r   rJ   lenr   r   r   r  r   r   r   r   r   r   unionr   headsindexs      r<   prune_headsLongT5Attention.prune_headsh      u:?7<<!8!8$:K:K
 $DFFE2#DFFE2#DFFE2#DFFEq9||c%j0004<<? --33E:r>   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   re   r   r_   r5   longr^   min
zeros_likelogfloatmath	full_likerp   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r<   _relative_position_bucket)LongT5Attention._relative_position_bucketx  s   , AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r>   c                    Uc   U R                   R                  R                  nUc,  [        R                  " U[        R
                  US9SS2S4   nOUSS2S4   R                  U5      n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )%Compute binned relative position biasNr@   r  r  r  re   r   r   r   )r   r   rA   r5   rU   r  r_   r$  r   r   r   permuterW   )
r   query_length
key_lengthrA   cache_positioncontext_positionmemory_positionr  relative_position_bucketr   s
             r<   compute_biasLongT5Attention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r>   c                    UR                   SS u  pUSLnU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUb[  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  W(       aG  UR                  U R                     R                  nUR                  U R                     R                  nOU R!                  U5      nU R#                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUbN  U(       d  U
OSn
UR%                  UUU R                  SU
05      u  nnU(       a  SUR                  U R                  '   [&        R(                  " UUR                  SS5      5      nUc  UR                   S   nUb  UOU
S   S-   nU R*                  (       db  [&        R,                  " SU R                  UU4UR.                  UR0                  S	9nU R2                  (       a  U R4                  (       a  SUl        O.U R9                  UUUR.                  U
S
9nUSS2SS2U* S2SS24   nUb#  USS2SS2SS2SUR                   S   24   nUU-   nU R:                  (       aS  [&        R<                  " UR                   S   5      nSU[?        U R:                  5      '   USS2URA                  5       4   nOUnUU-  n[B        RD                  RG                  URI                  5       SS9RK                  U5      n[B        RD                  RM                  UU RL                  U R4                  S9nUb  UU-  n[&        R(                  " UU5      nUR                  SS5      RO                  5       nUR                  USU RP                  5      nU RS                  U5      nUU4nU	(       a  UU4-   nU$ )zp
Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
Nre   r+   r   r-  Tr   rg   rA   r)   )rA   r-  r   rJ   ptraining)*r2   r   viewr   r   r   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater5   matmulr   r6   rA   r)   r  r7  requires_gradr1  r  r   r4   boolr   r9   softmaxr  type_asr   
contiguousr   r   )r   r   r   key_value_statesposition_biaspast_key_valuelayer_head_maskr+  	use_cacheoutput_attentionsr-  r   
seq_lengthis_cross_attentionquery_statesr9  curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr,  real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                               r<   r   LongT5Attention.forward  s   $ "/!4!4Ra!8
 .T9vvm,#((RtG^G^_iijkmno %*^EX*Y*Y'2266t~~FJ!&4&J&J#&4&I&I#"0-?)]."<,33DNNCHHJ.55dnnELLL/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn= lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fff[)./Gr>   )r   r   r  r   r   r   r   r   r   r   r   r  r   r   r   r   r   FNT       )NN)	NNNNNNFFN)r   r   r   r   r   r   r   r   r  staticmethodr$  r1  r   r   r   r   s   @r<   r   r   D  st     %*#'	!,!, C=	!, !,F;  -  - ^. l lr>   r   c                   v   ^  \ rS rSrSS\S\SS4U 4S jjjrS r\SS j5       r	S	\
4S
 jr    SS jrSrU =r$ )LongT5LocalAttentioni+  r   r   r&   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        U R                  U R                  -  U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        U R                  (       a0  [         R,                  " U R                  U R                  5      U l        [1        5       U l        SU l        g )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr#   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r   r   r   r   s      r<   r   LongT5LocalAttention.__init__,  sQ    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r>   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g r  r  r  s      r<   r   LongT5LocalAttention.prune_headsF  r  r>   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r  r  r  s           r<   r$  .LongT5LocalAttention._relative_position_bucketV     . AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r>   block_lengthc                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ r'  metaNr   r@   r(  r)  r   r   r   rA   ro   r5   rU   r  r$  r   r   r   r*  rW   r   rk  target_devicer/  r.  r  r0  r   s           r<   r1  !LongT5LocalAttention.compute_bias       ++2299>>&H ((//66 	
  ,,q<'7uzzR_`*F ,D!G47G47PP#'#A#A#.;;==	 $B $
  --.FG	*44Q7AA!Dr>   c                 "  ^ ^ UR                   S S u  mnUU 4S jnUU 4S jnU" T R                  U5      5      n	U" T R                  U5      5      n
U" T R                  U5      5      n[	        U	T R
                  SS9n	[	        U
T R
                  SS9n
[	        UT R
                  SS9n[        U
SSS9n
[        USSS9n[        R                  " SX5      nUc  T R                  (       dz  [        R                  " SST R                  T R
                  ST R
                  -  4UR                  UR                  S	9nT R                  (       a  T R                  (       a  S
Ul        OT R#                  T R
                  5      nUb/  [        R$                  " US:  SS5      nX2R'                  SS5      -   nX-  n[(        R*                  R-                  UR/                  5       SS9R1                  U5      n[(        R*                  R3                  UT R2                  T R                  S9nUb  X-  nUR5                  UR                  5      nU" [        R                  " SX5      5      nUS S 2S U2S S 24   nT R7                  U5      nUU4nU(       a  X4-   nU$ )Nre   c                 T   > U R                  TSTR                  TR                  5      $ 
projectionr+   r8  r   r   statesr   r   s    r<   r2   +LongT5LocalAttention.forward.<locals>.shape  "    ;;z2t||T=T=TUUr>   c                 Z   > U R                  5       R                  TSTR                  5      $ rC   r+   rE  r8  r   ry  s    r<   unshape-LongT5LocalAttention.forward.<locals>.unshape  %    $$&++JDNNKKr>   r   rJ   rf   ...qhd,...khd->...hqkr   r4  Tr   ry       _r+   r5  ...hqk,...khd->...qhd)r2   r   r   r   rF   r#   rS   r5   r   r   r6   r   rA   r)   r  r7  rA  r1  rp   r   r   r9   rC  r  rD  r   ro   r   )r   r   r   rG  rI  rK  rL  r2   r  rN  rQ  rR  rS  rW  rX  rY  r   s   `               @r<   r   LongT5LocalAttention.forward  sW    "/!4!4Ra!8
J	V	L
 TVVM23466-01
TVVM23 *,AN'
DNNJ
),AN +:QRS
,\QUVW #\
  33 %4<<T^^9KLU[UbUbjpjvjv! ..4==26M/ $ 1 1$.. A{{4!8S%8 -q!0D D}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'9L#((););<ell+BL_`!![j[!"34ff[) 

 /Gr>   )r#   r   r   r  r   r   r   r   r   rc  r   r   r  r   r   r   r   r   Fr\  NNNF)r   r   r   r   r   rB  r   r  r_  r$  r   r1  r   r   r   r   s   @r<   ra  ra  +  sc    ,| ,$ ,[_ , ,4;  -  - ^ 6 K Kr>   ra  c                      ^  \ rS rSrSS\S\SS4U 4S jjjrS r\SS j5       r	S	\
4S
 jrS\R                  S\R                  S\R                  4S jr    SS jrSrU =r$ )LongT5TransientGlobalAttentioni  r   r   r&   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                   U R                  SS9U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        [3        5       U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        [9        UR                  UR:                  S9U l        g )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   rc  r#   rk   r   r   r   r   r   r   r   r   r   r   r   r  r  global_relative_attention_biasr   r   global_input_layer_normrd  s      r<   r   'LongT5TransientGlobalAttention.__init__  s    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.!'!9!9**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E ++24,,t?b?bdhdpdp2qD/'6v~~6KdKd'e$r>   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g r  r  r  s      r<   r  *LongT5TransientGlobalAttention.prune_heads  r  r>   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r  r  r  s           r<   r$  8LongT5TransientGlobalAttention._relative_position_bucket  rj  r>   rk  c                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ rm  ro  rp  s           r<   r1  +LongT5TransientGlobalAttention.compute_biasN  rs  r>   r   r   c                 x   [         R                  " US   US S 2S S S 24   5      S S 2S S4   n[         R                  " US:  SS5      n[        XR                  5      nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      nXG-   nU$ )Nr   .r   ry   r  r(  )r   r   r   re   )r5   eqrp   r   rk   r$  r   r   r   r  r*  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r<   compute_side_bias0LongT5TransientGlobalAttention.compute_side_biasf  s    #hhtI8J1dTU:8VWXY[_adXde#kk*=*A3N!A$H^H^!_(,(F(F"#.;;==	 )G )
% 778UV	 %%l3	1=""r>   c                 >	  ^ ^ UR                   S S u  mnUU 4S jnUU 4S jn[        Ub  UO"[        R                  " UR                   S S 5      T R                  5      u  pU
R                   S   n[        XU5      nT R                  U5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      n[        UT R                  SS9n[        UT R                  SS9n[        UT R                  SS9n[        USSS9n[        USSS9nS/UR                  S-   -  nUR                   S   US'   UR                  S5      R                  U5      nUR                  S5      R                  U5      n[        R                   " UU/SS9n[        R                   " UU/SS9n[        R"                  " SX5      nUb=  [%        UT R                  UR&                  5      n[        R(                  " US	:  S
S5      nOS nUGct  T R*                  (       dz  [        R,                  " SST R.                  T R                  ST R                  -  4UR&                  UR0                  S9nT R2                  (       a  T R4                  (       a  SUl        OT R9                  T R                  5      nUb  UUR;                  SS5      -   nUR=                  UR0                  5      nUc  [        R                  " TU5      nT R?                  X*5      n[        UT R                  SS9R;                  SS5      nUR=                  UR0                  5      RA                  UR&                  5      n[        R                   " UU/SS9nUU-  n[B        RD                  RG                  URI                  5       SS9RK                  U5      n[B        RD                  RM                  UT RL                  T R4                  S9nUb  UU-  nUR=                  UR0                  5      nU" [        R"                  " SUU5      5      nUS S 2S U2S S 24   nT RO                  U5      nUU4nU(       a  UU4-   nU$ )Nre   c                 T   > U R                  TSTR                  TR                  5      $ rv  rx  ry  s    r<   r2   5LongT5TransientGlobalAttention.forward.<locals>.shape  r|  r>   c                 Z   > U R                  5       R                  TSTR                  5      $ r~  r  ry  s    r<   r  7LongT5TransientGlobalAttention.forward.<locals>.unshape  r  r>   r+   r   rJ   rf   r  r   ry   r  r   r4  Trg   r5  r  )(r2   r   r5   r   rk   r   r  r   r   r   rF   r#   rS   r7   rW   r   rO   r   rj   rA   rp   r   r6   r   r)   r  r7  rA  r1  r   ro   r  r_   r   r9   rC  r  rD  r   r   )r   r   r   rG  rI  rK  rL  r2   r  rm   r   _global_seq_lenglobal_inputsrN  rQ  rR  side_key_statesside_value_statesrepsrS  r\   side_position_biasrW  rX  rY  r   s   `                        @r<   r   &LongT5TransientGlobalAttention.forward{  s9    "/!4!4Ra!8
J	V	L )E$D%**]5H5H"5M*N"")
%	
 -22261-O\44]C TVVM23466-01
TVVM23} 56!$&&"78 *,AN'
DNNJ
),AN +:QRS
,\QUVW so**Q./""1%Q)33A6==dC-77:AA$G YY
O<!D
yy,0A!BJ 5|P#<T4>>S`SgSg#h #(;;/Ca/Ge#T #'  33 %4<<T^^9KL!== ,,!
 ..4==26M/ $ 1 1$.. A#/ -0D0N0NqRS0T T)..v||<M |zz*j9!%!7!7!Q!34F\^!_!i!ijkmn!o!3!8!8!F!I!I&--!X!II}6H&IrRM-}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9L#((););<ell+BLR^_`!![j[!"34ff[)./Gr>   )r#   r   r   rk   r  r  r   r   r   r   r   rc  r   r   r  r   r   r   r   r   r  r\  r  )r   r   r   r   r   rB  r   r  r_  r$  r   r1  r5   r{   r  r   r   r   r   s   @r<   r  r    s    f| f$ f[_ f f>;  -  - ^ 0#ell # #Y^YeYe #0 u ur>   r  c                   R   ^  \ rS rSrSS\\   4U 4S jjjr       SS jrSrU =r	$ )LongT5LayerSelfAttentioni  r   c                    > [         TU ]  5         [        XUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r  s       r<   r   !LongT5LayerSelfAttention.__init__  sQ    ,W`
 *&..f>W>WXzz&"5"56r>   c	                     U R                  U5      n	U R                  U	UUUUUUUS9n
XR                  U
S   5      -   nU4U
SS  -   nU$ )N)r   rG  rI  rH  rJ  rK  r-  r   r   )r   r  r   )r   r   rc   rG  rI  rH  rJ  rK  r-  normed_hidden_statesattention_outputrY  s               r<   r    LongT5LayerSelfAttention.forward  st      $}=-- '+)/) . 	
 &5Ea5H(II "%5ab%99r>   )r  r   r   r[  )NNNNFFN
r   r   r   r   r   r   r   r   r   r   r   s   @r<   r  r    s:    7XVY] 7 7  r>   r  c                   X   ^  \ rS rSrSrSS\\   4U 4S jjjr    S	S\4S jjr	Sr
U =r$ )
LongT5LayerLocalSelfAttentioni  z$Local self attention used in encoderr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g N)r   r   )r   r   ra  LocalSelfAttentionr   r   r   r   r   r   r   r   r  s       r<   r   &LongT5LayerLocalSelfAttention.__init__  sI    "6v"w)&..f>W>WXzz&"5"56r>   kwargsc                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   n	U	$ N)r   rG  rI  rK  r   r   )r   r  r   
r   r   rc   rG  rI  rK  r  r  r  rY  s
             r<   r   %LongT5LayerLocalSelfAttention.forward!  sk      $}=22 '+/ 3 
 &5Ea5H(II "%5ab%99r>   )r  r   r   r[  r  r   r   r   r   __doc__r   r   r   r   r   r   r   r   s   @r<   r  r    s>    .7XVY] 7 7   r>   r  c                   X   ^  \ rS rSrSrSS\\   4U 4S jjjr    S	S\4S jjr	Sr
U =r$ )
'LongT5LayerTransientGlobalSelfAttentioni7  z/Transient-Global self attention used in encoderr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r   r   r  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r  s       r<   r   0LongT5LayerTransientGlobalSelfAttention.__init__:  sN    ,J-
) *&..f>W>WXzz&"5"56r>   r  c                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   n	U	$ r  )r   r  r   r  s
             r<   r   /LongT5LayerTransientGlobalSelfAttention.forwardB  sk      $}=<< '+/ = 
 &5Ea5H(II "%5ab%99r>   )r  r   r   r[  r  r  r   s   @r<   r  r  7  s>    97XVY] 7 7   r>   r  c                   T   ^  \ rS rSrSS\\   4U 4S jjjr        SS jrSrU =r	$ )LongT5LayerCrossAttentioniY  r   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   s      r<   r   "LongT5LayerCrossAttention.__init__Z  sO    .vSXdmn)&..f>W>WXzz&"5"56r>   c                     U R                  U5      nU R                  UUUUUUUUU	U
S9
nXR                  US   5      -   nU4USS  -   nU$ )N)	r   rF  rG  rI  rH  rJ  r+  rK  r-  r   r   )r   r  r   )r   r   rF  rc   rG  rI  rH  rJ  r+  rK  r-  r  r  layer_outputrY  s                  r<   r   !LongT5LayerCrossAttention.forward`  sy      $}=// -'+)%/) 0 
 %||4DQ4G'HH/$4QR$88r>   )r  r   r   r   )NNNNFNFNr  r   s   @r<   r  r  Y  s<    7(3- 7 7  r>   r  c                   \   ^  \ rS rSrSS\\   4U 4S jjjr            SS jrSrU =r	$ )LongT5Blocki  r   c                 $  > [         TU ]  5         UR                  U l        UR                  (       a  [        nOGUR                  S:X  a  [
        nO0UR                  S:X  a  [        nO[        SUR                   S35      e[        R                  " 5       U l
        U R                  R                  U" XUS95        U R                  (       a"  U R                  R                  [        XS95        U R                  R                  [        U5      5        g )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r  )r   )r   r   r   r  encoder_attention_typer  r  
ValueErrorr   
ModuleListlayerrN   r  r   )r   r   r   r   attention_layerr   s        r<   r   LongT5Block.__init__  s     ++6O**g5;O**.@@EO!889<  ]]_


Fgpq	
 ??JJ7TU

-/0r>   c                 \   U R                   S   " UUUUU	U
UUS9nUS   nUSS  nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU R                  =(       a    US LnU(       a  U R                   S   " UUUUUU	US   S-   U
UUS9
nUS   nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nUUSS  -   nU R                   S   " U5      nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU4U-   $ )Nr   )rc   rG  rI  rH  rJ  rK  r-  r   i  )r  r   r+   )	rF  rc   rG  rI  rH  r+  rJ  rK  r-  )
r  r)   r5   r   isinfanyfinfor   clampr   )r   r   rc   rG  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasrI  cross_attn_layer_head_maskrH  rJ  rK  return_dictr-  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputss                      r<   r   LongT5Block.forward  s     "&A)'+)/)	"
 /q12126 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM!__R1Fd1R&*jjm!65; :-+B/!3#"3-'# 4A6M ""emm3M8R8V8V8X8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O 

2}5 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM 00	
r>   )r   r  r[  )NNNNNNNNFFTNr  r   s   @r<   r  r    sK    1XVY] 1 14 "#&*#'D
 D
r>   r  c                   N    \ rS rSr% \\S'   SrSrS/rSr	\
S 5       rS rS	 rS
rg)LongT5PreTrainedModeli  r   transformerTr  Fc                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r5   r   r   r   )r   r  
input_maskdummy_inputss       r<   r  "LongT5PreTrainedModel.dummy_inputs  s8     LL.	\\*-
!*"&0

 r>   c                    U R                   R                  n[        U[        5      (       a)  UR                  R
                  R                  US-  5        g[        U[        [        [        45      (       a  UR                  R                  R
                  R                  SUS-  S9  [        US5      (       aN  U R                   R                  (       d2  UR                  R                  R
                  R                  SUS-  S9  ggg[        U[        5      (       GaQ  UR                   R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR                   S5      (       aE  UR                   R$                  b.  UR                   R$                  R
                  R'                  5         UR(                  R                  R
                  R                  SX R                   R*                  S-  -  S9  [        UR(                  S5      (       aG  UR(                  R$                  b/  UR(                  R$                  R
                  R'                  5         ggg[        U[,        5      (       Ga  UR.                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR.                  S5      (       aE  UR.                  R$                  b.  UR.                  R$                  R
                  R'                  5         UR0                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR0                  S5      (       aE  UR0                  R$                  b.  UR0                  R$                  R
                  R'                  5         UR(                  R                  R
                  R                  SX R                   R*                  S-  -  S9  [        UR(                  S5      (       aG  UR(                  R$                  b/  UR(                  R$                  R
                  R'                  5         ggg[        U[2        [4        [6        45      (       Ga  U R                   R"                  nU R                   R8                  nU R                   R:                  nUR<                  R                  R
                  R                  SX#U-  S-  -  S9  UR>                  R                  R
                  R                  SX#S-  -  S9  UR@                  R                  R
                  R                  SX#S-  -  S9  URB                  R                  R
                  R                  SX%U-  S-  -  S9  URD                  (       a}  URF                  R                  R
                  R                  SX#S-  -  S9  [        U[6        5      (       a4  URH                  R                  R
                  R                  SX#S-  -  S9  gggg)zInitialize the weightsrz   ry   )r   stdlm_head      r   N)%r   initializer_factorr   r   r   datafill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelsharednormal_hasattrtie_word_embeddingsr  r   r   r   r   zero_r   r   r   r   r   r   ra  r  r   r   r   r   r   r   r   r   r  )r   modulefactorr   r   r   s         r<   _init_weights#LongT5PreTrainedModel._init_weights  s   //fo..MM$$Vc\2.LN` abb MM  %%--3FSL-Ivy))$++2Q2Q%%**22#2N 3R) 344 II!!))s;;CVCV[_B_8`)avyy&))fiinn.H		##))+II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I) 899KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I)2FHf ghh kk))G!%!1!1kk++GHHOO  ((cvL^B^cgAg7h(iHHOO  ((cv$7O(PHHOO  ((cv$7O(PHHOO  ((cvL^B^cgAg7h(i11..55::BBQWhl[lQmBnf&DEE99@@EEMM fT0A&B N  F 2 ir>   c                    U R                   R                  nU R                   R                  nUc  [        S5      e[	        U5      (       aE  [
        R                  " UR                  S S S-   U5      n[
        R                  " XASS S24   /SS9nO=UR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US	:H  U5        U$ )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r+   )r   .rJ   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr  r   r5   fullr2   rO   	new_zerosclonemasked_fill_)r   r  r
  r  shifted_input_idss        r<   _shift_right"LongT5PreTrainedModel._shift_right$  s    !%!C!C{{//!)8  Y'' %

9??3B+?$+FH^ _ %		+<SbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r>   r,   N)r   r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_can_compile_fullgraphpropertyr  r  r  r   r,   r>   r<   r  r    s?    %&*#&" .b!r>   r  c                   (  ^  \ rS rSrSU 4S jjrS r             SS jr SS\\R                  S4   S\R                  S\R                  S	\
S
\4
S jjr\S\R                  S\S\S\R                  S\R                  S\4S j5       rSrU =r$ )LongT5Stacki@  c                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        Ub  UR                  U R                  l        UR                  U l        UR                  U l	        U R                  S-   U l
        [        R                  " [        UR                  5       Vs/ sH  n[        U[        US:H  5      US9PM     sn5      U l        [#        UR
                  UR$                  S9U l        [        R(                  " UR*                  5      U l        SU l        U R1                  5         g s  snf )Nr   r   r  r   F)r   r   r   r   
vocab_sizer   embed_tokensr   r   rc  r#   r  rK   
num_layersr  rB  blockr   r   final_layer_normr   r   r   r  	post_init)r   r   r  rQ   r   s       r<   r   LongT5Stack.__init__A  s    LL):):FNNK#'3':':D$ ++"//**Q.]] v0011A FQ!VXYZ1

 !0FD]D] ^zz&"5"56&+# 	s   < Ec                     Xl         g r   )r  r   new_embeddingss     r<   set_input_embeddings LongT5Stack.set_input_embeddings[  s    *r>   c                 	   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb*  Ub'  U R
                  (       a  SOSn[        SU SU S35      eUb&  UR                  5       nUR                  SUS   5      nO>Ub  UR                  5       S S nO'U R
                  (       a  SOSn[        SU SU S	35      eU R                  (       a/  U R                  (       a  U	(       a  [        R                  S
5        Sn	Uc%  U R                  c   S5       eU R                  U5      nUu  nnU R
                  (       aM  U	(       aE  UcB  U R                   R                  (       a  [        [!        5       [!        5       5      nO[!        5       nOU R
                  (       d  S nUb  UR#                  5       OSnUc#  [$        R&                  " UUU-   UR(                  S9nUc4  [+        5       (       d%  UU-   n[$        R,                  " UUUR(                  S9nU R
                  (       a7  U R/                  UUU[1        U[        5      (       a  UR2                  OUU
5      nO=U R                   R4                  S:X  a!  [7        X R8                  UR(                  5      nOUnU R
                  (       aO  UbL  UR                  5       u  nnnUU4nUc  [$        R,                  " UUR(                  S9nU R;                  U5      nOS nU R=                  X`R                   R>                  5      nU R=                  XpR                   R>                  5      nU(       a  SOS nU
(       a  SOS nU
(       a  U R
                  (       a  SOS nS nS nU RA                  U5      n[C        U RD                  5       H  u  n n!UU    n"UU    n#U(       a  UU4-   nU!" UUUUUUU"U#UU	U
UUS9n$U$S   nU$S   nU R
                  (       a  Ub  U$U
(       a  SOS   nU
(       d  Md  UU$S   4-   nU R
                  (       d  M  UU$S   4-   nM     U RG                  U5      nU RA                  U5      nU(       a  UU4-   nU(       d  [I        S UUUUU4 5       5      $ [K        UUUUUS9$ )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer+   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsr   rw   r  r,   )rI  r  rH  rJ  rK  r  r-  r   r   re      c              3   .   #    U H  nUc  M  Uv   M     g 7fr   r,   ).0r   s     r<   	<genexpr>&LongT5Stack.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_statepast_key_valuesr   
attentionscross_attentions)&r   rJ  rK  output_hidden_statesuse_return_dictr   r  sizer8  r  r7  r   r   r  is_encoder_decoderr   r   get_seq_lengthr5   rU   rA   r   r   _update_causal_maskr   r<  r  rj   r#   invert_attention_maskget_head_maskr  r   	enumerater  r   rM   r   )%r   r  rc   r  r  r+  	head_maskcross_attn_head_maskr2  rJ  rK  r5  r  r-  err_msg_prefixinput_shaper   rL  past_key_values_lengthmask_seq_lengthrU  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsrG  r  r   rQ   layer_modulerI  r  layer_outputss%                                        r<   r   LongT5Stack.forward^  s     "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	 $$0p2pp0 --i8M!,
J??_4;;11&9,.,.&YO&2nO #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D*D4zAO"ZZ
OML`L`aN??22o/BCC  44$!K [[//7:3NNNTaThThiK(K ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y++2H2HI	#112FH^H^_"6BD0d&7DOOrRV(,%]3(4OA|'lO)=a)@&#$58H$H!(%/- /+E.#"3'-M& *!,M
 *!,M#8#D0=CTaZ[0\-  !/=3C2E!E???+?=QRCSBU+U(M  5P --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r>   rc   r    input_tensorr-  r2  rK  c           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2ry   flex_attentionr   Fsdpa)r+  rB  is_trainingr   r+   )sequence_lengthtarget_lengthr)   r-  r   )cudaxpunpu)r   _attn_implementationr  r   r5   r{   r!   r9  is_compileabler   _ignore_causal_mask_sdpar7  r)   r2   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrA   ro   r  r  _unmask_unattended)r   rc   rO  r-  r2  rK  past_seen_tokensusing_compilable_cacher)   rU  rV  rU  	min_dtypes                r<   r:  LongT5Stack._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr>   rU  rV  r)   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr,  )
fill_valuer)   rA   r   )diagonalrw   r+   r   )r$   r5   r  r  r  rA   triurU   rC   expandr  r2   r_   masked_fill)rc   rU  rV  r)   r-  r   r  rU  rb  mask_lengthpadding_masks              r<   r^  ALongT5Stack._prepare_4d_causal_attention_mask_with_cache_positionT  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r>   )r  r#   r   r  r   r  r   rc  r   )NNNNNNNNNNNNNr  )r   r   r   r   r   r&  r   r   r5   r{   r
   rB  r:  r_  r   r)   r^  r   r   r   s   @r<   r  r  @  s    4+
 "#!!o
p #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r>   r  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c            &       J  ^  \ rS rSrS/rSS/rS\4U 4S jjrS rS r	S	 r
S
 rS rS r\                S S\\R"                     S\\R$                     S\\R"                     S\\R&                     S\\R$                     S\\R$                     S\\R(                     S\\\\R$                           S\\   S\\R(                     S\\R(                     S\\   S\\   S\\   S\\   S\\R"                     S\\\R$                     \4   4"S jj5       rSrU =r$ )!r  i  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        SUl        [        X R                  5      U l        [        R                  " U5      nSUl	        SUl        UR                  Ul        [        X0R                  5      U l        U R#                  5         g )NFT)r   r   r   r   r  r   r  copydeepcopyr   rJ  tie_encoder_decoderr  encodernum_decoder_layersr  decoderr!  r   r   encoder_configdecoder_configr   s       r<   r   LongT5Model.__init__  s     ll6#4#4fnnEv.$)!#( -2*">;;?v.$(!-2*$*$=$=!">;;? 	r>   c                     U R                   $ r   r  r   s    r<   get_input_embeddings LongT5Model.get_input_embeddings      {{r>   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r   r  ru  r&  rw  r$  s     r<   r&   LongT5Model.set_input_embeddings  +    $)).9)).9r>   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g r   r   r  _tie_or_clone_weightsru  r  r  rw  r~  s    r<   _tie_weightsLongT5Model._tie_weights  P    ;;**&&t||'@'@$++N&&t||'@'@$++N +r>   c                     U R                   $ r   ru  r~  s    r<   get_encoderLongT5Model.get_encoder      ||r>   c                     U R                   $ r   rw  r~  s    r<   get_decoderLongT5Model.get_decoder  r  r>   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     gz
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
Nitemsru  r  	attentionr  r   heads_to_pruner  r  s       r<   _prune_headsLongT5Model._prune_heads  <    
 +002LELLu%//;;EB 3r>   r  rc   r  r  r>  decoder_head_maskr?  encoder_outputsr2  r+  decoder_inputs_embedsrJ  rK  r5  r  r-  r&   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUbR  UcO  U R                   R                  U R                   R                  :X  a!  [
        R                  " [        [        5        UnUc  U R                  UUU
UUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUU	UUUUUUUUUS9nU(       d  UU-   $ [        UR                  UR                   UR"                  UR$                  UR&                  UR                  UR"                  UR$                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, LongT5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

>>> # Let's try a very long encoder input.
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1

>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```Nr  rc   r+  r>  rK  r5  r  r   r   re   r1  r   r3  r  rc   r+  r2  r  r  r>  r?  rJ  rK  r5  r  r-  )r1  r2  decoder_hidden_statesdecoder_attentionsr4  encoder_last_hidden_stater  encoder_attentions)r   rJ  r6  r  rv  warningswarn#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningru  r   r   r	  rw  r   r1  r2  r   r3  r4  )r   r  rc   r  r  r>  r  r?  r  r2  r+  r  rJ  rK  r5  r  r-  r   decoder_outputss                      r<   r   LongT5Model.forward  s   b "+!6IDKK<Q<Q	%0%<k$++B]B]  %6%>{{%%)G)GG5}E$-! ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1'!5/!5#) ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r>   )rw  ru  r  )NNNNNNNNNNNNNNNN)r   r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r   r  r&  r  r  r  r  r   r   r5   
LongTensorFloatTensor
BoolTensorr{   rM   r
   rB  r   r   r   r   r   r   s   @r<   r  r    s    	R*& 89VW| &:
O
C  156:8<=A159=7;EI+/048<$(,0/3&*59#J
E,,-J
 !!2!23J
 $E$4$45	J

 !))9)9 :J
 E--.J
 $E$5$56J
 'u||4J
 "%e.?.?(@"ABJ
 "%J
  -J
  (5J
 D>J
 $D>J
 'tnJ
  d^!J
" !!1!12#J
$ 
uU&&');;	<%J
 J
r>   r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc            (         ^  \ rS rSrS/r/ SQrS\4U 4S jjrS rS r	S r
S	 rS
 r\                 S S\\R                      S\\R"                     S\\R                      S\\R$                     S\\R"                     S\\R"                     S\\R&                     S\\\\R&                           S\\   S\\R"                     S\\R"                     S\\R                      S\\   S\\   S\\   S\\   S\\R                      S\\\R"                     \4   4$S jj5       rS\R&                  4S jrSrU =r$ )!r  iY  rn  )ro  rp  zlm_head.weightr   c                 L  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        SUl        [        X R                  5      U l        [        R                  " U5      nSUl
        SUl        UR                  Ul        [        X0R                  5      U l        [        R$                  " UR                  UR                  SS9U l        U R)                  5         g )NFTr   )r   r   r   	model_dimr   r   r  r  rr  rs  r   rJ  rt  r  ru  rv  r  rw  r   r  r!  rx  s       r<   r   'LongT5ForConditionalGeneration.__init__d  s     ll6#4#4fnnEv.$)!#( -2*">;;?v.$(!-2*$*$=$=!">;;?yy1B1BO 	r>   c                     U R                   $ r   r}  r~  s    r<   r  3LongT5ForConditionalGeneration.get_input_embeddings{  r  r>   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r   r  r$  s     r<   r&  3LongT5ForConditionalGeneration.set_input_embeddings~  r  r>   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g r   r  r~  s    r<   r  +LongT5ForConditionalGeneration._tie_weights  r  r>   c                     U R                   $ r   r  r~  s    r<   r  *LongT5ForConditionalGeneration.get_encoder  r  r>   c                     U R                   $ r   r  r~  s    r<   r  *LongT5ForConditionalGeneration.get_decoder  r  r>   r  rc   r  r  r>  r  r?  r  r2  r+  r  labelsrJ  rK  r5  r  r-  r&   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUbR  UcO  U R                   R                  U R                   R                  :X  a!  [
        R                  " [        [        5        UnUc  U R                  UUU
UUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nUb  Uc  Uc  U R                  U5      nU R                  UUUU	UUUUUUUUUS9nUS   nU R                   R                  (       a  UU R                   S-  -  nU R#                  U5      nSnUb[  [%        S	S
9nUR'                  UR(                  5      nU" UR+                  SUR-                  S5      5      UR+                  S5      5      nU(       d  U4USS -   U-   nUb  U4U-   $ U$ [/        UUUR0                  UR2                  UR4                  UR6                  UR8                  UR2                  UR4                  S9	$ )ar  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
>>> model = LongT5ForConditionalGeneration.from_pretrained(
...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
... )

>>> # Let's try a very long input.
>>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
>>> input_ids = inputs.input_ids

>>> outputs = model.generate(input_ids)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
abstractthe aim of this article is to provide an overview of the literature on the role of dog
```Nr  r   r   re   r  r  r  r	  )ignore_indexr+   )	losslogitsr2  r  r  r4  r  r  r  )r   rJ  r6  r  rv  r  r  6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr  ru  r   r   r	  r  rw  r  r  r  r   r_   rA   r8  r7  r   r2  r   r3  r4  r1  )r   r  rc   r  r  r>  r  r?  r  r2  r+  r  r  rJ  rK  r5  r  r-  r   r  sequence_output	lm_logitsr  loss_fctoutputs                            r<   r   &LongT5ForConditionalGeneration.forward  so   j "+!6IDKK<Q<Q	%0%<k$++B]B]  %6%>{{%%)G)GG5}E$-! ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1'!5/!5#) ' 
  *!,;;** .1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD \OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r>   c                 $    U R                  U5      $ r   )r  )r   r  s     r<   %prepare_decoder_input_ids_from_labelsDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels7  s      ((r>   )rw  ru  r  r  r  )NNNNNNNNNNNNNNNNN)r   r   r   r   r  r  r   r   r  r&  r  r  r  r   r   r5   r  r  r  r{   rM   r
   rB  r   r   r   r  r   r   r   s   @r<   r  r  Y  s    	R*& j| .:
O
  156:8<=A159=7;@D+/59=A-1$(,0/3&*59%f
E,,-f
 !!2!23f
 $E$4$45	f

 !))9)9 :f
 E--.f
 $E$5$56f
 'u||4f
 "%ell(;"<=f
 "%f
   1 12f
  ((9(9:f
 ))*f
 D>f
 $D>f
  'tn!f
" d^#f
$ !!1!12%f
& 
uU&&'8	9'f
 f
P)ELL ) )r>   r  c                   >  ^  \ rS rSrS/rS/rS\4U 4S jjrS rS r	S r
S	 rS
 r\       SS\\R                      S\\R"                     S\\R"                     S\\R"                     S\\   S\\   S\\   S\\\R"                     \4   4S jj5       rSrU =r$ )r  i;  ro  rw  r   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        [        X R                  5      U l        U R                  5         g )NF)r   r   r   r   r  r   r  rr  rs  rJ  rt  r  ru  r!  )r   r   ry  r   s      r<   r   LongT5EncoderModel.__init__@  sf     ll6#4#4fnnEv.#( -2*">;;? 	r>   c                     U R                   $ r   r}  r~  s    r<   r  'LongT5EncoderModel.get_input_embeddingsL  r  r>   c                 F    Xl         U R                  R                  U5        g r   )r  ru  r&  r$  s     r<   r&  'LongT5EncoderModel.set_input_embeddingsO  s    $)).9r>   c                     U R                   R                  (       a1  U R                  U R                  R                  U R
                  5        g g r   )r   r  r  ru  r  r  r~  s    r<   r  LongT5EncoderModel._tie_weightsS  s2    ;;**&&t||'@'@$++N +r>   c                     U R                   $ r   r  r~  s    r<   r  LongT5EncoderModel.get_encoderW  r  r>   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     gr  r  r  s       r<   r  LongT5EncoderModel._prune_headsZ  r  r>   r  rc   r>  r+  rK  r5  r  r&   c           
      f    Ub  UOU R                   R                  nU R                  UUUUUUUS9nU$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).

Example:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```r  )r   r6  ru  )	r   r  rc   r>  r+  rK  r5  r  r  s	            r<   r   LongT5EncoderModel.forwardb  sK    F &1%<k$++B]B],,)'/!5# ' 
 r>   )ru  r  )NNNNNNN)r   r   r   r   r  r  r   r   r  r&  r  r  r  r   r   r5   r  r  rB  r   rM   r   r   r   r   r   s   @r<   r  r  ;  s   78*4&
| 
:OC  156:1559,0/3&*.E,,-. !!2!23. E--.	.
   1 12. $D>. 'tn. d^. 
uU&&'8	9. .r>   r  )r  r  r  r  )r   )Zr  rr  r  r  typingr   r   r   r5   r   torch.nnr   activationsr	   cache_utilsr
   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   r   configuration_longt5r   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   
get_loggerr   r   r{   r   r=   rF   rS   r[   rb   rA   rj   rM   r   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionwarningr   r   r   r   ra  r  r  r  r  r  r  r  r  __HEAD_MASK_WARNING_MSGr  r  r  __all__r,   r>   r<   <module>r     s       ' '   % ! C C ) > 9  . Q   /  !!;J 
		H	%  3 3 W\WcWc  #%,, #3 #S #U\\ #4U\\ 4c 4 4Y\ 4ejeqeq 42!# !%,, !BU\\ Bc BV[VbVb B8ell 8s 8TYT`T` 8ejeqeq 8 .PLL.P58.P
5<<%&.Pb4U\\ 4VY 4^c^j^j 4	j<<	j,1LL	jJM	j
\\	j+bii +2	/"O
KKef")) ,ryy &BII &dbii dN299 DCRYY CN!ryy !HBII >bii D#		 #L\
, \
~ ^!O ^! ^!BJ' J\
  
' 
 
D 
Z)%:O Z)
Z)z U. U Up kk<  	 	
NN[\	s   =M
 
M-M-,M-