
"""PyTorch XGLM model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_xglm import XGLMConfig


logger = logging.get_logger(__name__)


class XGLMScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


class XGLMSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the parameter
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last dimension if embedding_dim is odd
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, position_ids: Optional[torch.Tensor] = None, past_key_values_length: int = 0):
        bsz, seq_len = position_ids.size()
        position_ids += self.offset

        # Expand embeddings if needed
        max_pos = 2 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
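

# NOTE: illustrative sketch, not part of the original module. The sinusoidal table built by
# `XGLMSinusoidalPositionalEmbedding.get_embedding` can be inspected on its own; the sizes below
# (8 positions, 6 dimensions, padding_idx=1) are arbitrary example values chosen for the sketch.
#
#     >>> table = XGLMSinusoidalPositionalEmbedding.get_embedding(num_embeddings=8, embedding_dim=6, padding_idx=1)
#     >>> table.shape                      # (num_embeddings, embedding_dim): sin half followed by cos half
#     torch.Size([8, 6])
#     >>> torch.all(table[1] == 0).item()  # the padding position is zeroed out
#     True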


class XGLMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that this layer's cross-attention cache is filled so it can be re-used in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # upcast to fp32 for the softmax if the weights are in fp16
        if attn_weights.dtype == torch.float16:
            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
        else:
            attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # reshape twice so that attn_weights keeps its gradient while a (bsz, num_heads, ...) view is returned
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class XGLMDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: XGLMConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = XGLMAttention(
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        if config.add_cross_attention:
            self.encoder_attn = XGLMAttention(
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
                layer_idx=layer_idx,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs
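

# NOTE: illustrative sketch, not part of the original module. A single decoder layer can be exercised
# in isolation with a small, hypothetical configuration; all sizes below are arbitrary example values.
#
#     >>> cfg = XGLMConfig(vocab_size=128, d_model=32, ffn_dim=64, num_layers=2, attention_heads=4)
#     >>> layer = XGLMDecoderLayer(cfg, layer_idx=0)
#     >>> hidden = torch.randn(1, 5, cfg.d_model)
#     >>> layer(hidden)[0].shape           # first element of the output tuple is the transformed hidden states
#     torch.Size([1, 5, 32])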
g)	XGLMPreTrainedModelix  r   modelTr   c                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Nr   )meanstd)r   init_stdr   r   rz   weightdatanormal_rs   zero_	Embeddingr   )r    moduler   s      r"   _init_weights!XGLMPreTrainedModel._init_weights  s    kk""fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r$    N)r+   r,   r-   r.   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r4   r   r$   r"   r   r   x  s"    &*#+,	?r$   r   c            "         ^  \ rS rSrSS\S\\R                     4U 4S jjjr\	              SS\\


@auto_docstring
class XGLMModel(XGLMPreTrainedModel):
    def __init__(self, config: XGLMConfig, embed_tokens: Optional[nn.Embedding] = None):
        r"""
        embed_tokens (`nn.Embedding`, *optional*):
            output embeddings
        """
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = XGLMScaledWordEmbedding(
                config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
            )

        self.embed_positions = XGLMSinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            config.pad_token_id,
        )
        self.layers = nn.ModuleList([XGLMDecoderLayer(config, layer_idx=i) for i in range(config.num_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        if position_ids is None:
            position_ids = torch.arange(
                past_key_values_length,
                input_shape[-1] + past_key_values_length,
                dtype=torch.long,
                device=input_ids.device if input_ids is not None else inputs_embeds.device,
            )
            position_ids = position_ids.unsqueeze(0)

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            )

        hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length).to(
            inputs_embeds.device
        )
        hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )
        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.layer_norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = XGLMModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits,
                labels,
                vocab_size=self.config.vocab_size,
                pad_token_id=self.config.pad_token_id,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


__all__ = ["XGLMForCausalLM", "XGLMModel", "XGLMPreTrainedModel"]