
    <hs                        S r SSKJrJr  SSKrSSKrSSKJr  SSKJr  SSK	J
r
Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJr  \" 5       (       a  SSKJr  SSKJ r   \RB                  " \"5      r#S\$S\$S\RJ                  4S jr&S\RJ                  S\RJ                  4S jr'S\RJ                  S\RJ                  S\RJ                  S\RJ                  4S jr( " S S\RR                  5      r* " S S\RR                  5      r+ " S  S!\5      r,\ " S" S#\5      5       r-\ " S$ S%\-5      5       r.\" S&S'9 " S( S)\-\5      5       r// S*Qr0g)+zPyTorch CodeGen model.    )OptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringis_torch_flex_attn_availablelogging   )CodeGenConfig)	BlockMask)make_flex_block_causal_masknum_posdimreturnc           	         SS[         R                  " SUS[         R                  S9U-  -  -  n[         R                  " S[         R                  " U [         R                  S9R	                  5       U5      R	                  5       n[         R
                  " [         R                  " U5      [         R                  " U5      4SS9$ )	N      ?i'  r      dtypezi , j -> i jr   r   )torcharangeint64einsumfloatcatsincos)r   r   inv_freqsinusoid_inps       d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/codegen/modeling_codegen.pycreate_sinusoidal_positionsr+   0   s    eQQekk JS PQRH<<WEKK0X0^0^0`bjkqqsL99eii-uyy/FGQOO    xc                     U S S 2S S 2S S 2S S S24   nU S S 2S S 2S S 2SS S24   n[         R                  " U* U4SS9n U R                  S5      $ )Nr   r   r   )r    stackflatten)r-   x1x2s      r*   rotate_every_twor5   7   sS    	
1aCaC<B	
1aADqD=	BbS"I2&A99R=r,   tensorr&   r'   c                     [         R                  " US S 2S S 2S S S 24   SS5      n[         R                  " US S 2S S 2S S S 24   SS5      nX-  [        U 5      U-  -   $ )Nr   r   )r    repeat_interleaver5   )r6   r&   r'   s      r*   apply_rotary_pos_embr9   ?   s\    

!
!#aD!m"4a
;C

!
!#aD!m"4a
;CL-f5;<<r,   c                     ^  \ rS rSrSU 4S jjrS rS r  SS jr       SS\\	R                     S\\   S\\	R                     S	\\	R                     S
\\	R                     S\\   S\\   S\\	R                     S\\\	R                   \\	R                      4   \\\	R                   \\	R                      \\	R                   S4   4      4   4S jjrSrU =r$ )CodeGenAttentionE   c                 f  > [         TU ]  5         UR                  n[        R                  " UR
                  5      U l        [        R                  " UR                  5      U l        X l	        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                   U l        U R                  U R                   -  U l        U R"                  U R                   -  U R                  :w  a&  [%        SU R                   SU R                    S35      e[&        R(                  " [&        R*                  " U R"                  [&        R,                  S95      R/                  [&        R0                  " 5       5      U l        [        R4                  " U R                  U R                  S-  SS	9U l        [        R4                  " U R                  U R                  SS	9U l        UR:                  U l        U R:                  =(       d    U R                  n[=        X45      U l        g )
NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r   r   F)bias) super__init__max_position_embeddingsr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout	layer_idxloggerwarning_once	__class____name__hidden_size	embed_dimnum_attention_headshead_dim
ValueErrorr    sqrtr6   float32toget_default_dtype
scale_attnLinearqkv_projout_proj
rotary_dimr+   embed_positions)selfconfigrG   max_positionspos_embd_dimrJ   s        r*   r@   CodeGenAttention.__init__F   s   66JJv'8'89ZZ(:(:;" !8!8 9 :, ,  ++#)#=#= $*B*BB==4333t~~EWX\XfXfWg h++/+C+C*DBH   **U\\$--u}}%UVYYZ_ZqZqZst		$..$..12D5Q		$..$..uM ++8$..:=Wr,   c                     UR                  UR                  S S X$-  U4-   5      nUR                  UR                  S S S-   UR                  SS  -   5      nU$ )Nr/   r0   )r/   )reshapeshape)r[   r-   n_headdim_headmp_numreshapeds         r*   _split_headsCodeGenAttention._split_headsd   s[    99QWWSb\V-=x,HHI##AGGCRL5$88>>"#;N$NOr,   c                    [        UR                  5      S:X  a$  UR                  SSSSS5      R                  5       nO][        UR                  5      S:X  a#  UR                  SSSS5      R                  5       nO![	        S[        UR                  5       35      eUR                  5       SS	 X#-  4-   nUR                  U5      $ )
z=
Merges attn_head_size dim and num_attn_heads dim into n_ctx
   r   r   r   r      z3Input tensor rank should be one of [4, 5], but is: Nr0   )lenrb   permute
contiguousrP   sizeview)r[   r6   rN   attn_head_size	new_shapes        r*   _merge_headsCodeGenAttention._merge_headsi   s     v||!^^Aq!Q2==?F!#^^Aq!Q/::<FRSVW]WcWcSdRefggKKM#2&*=*N)PP	{{9%%r,   c                    UR                  [        R                  5      nUR                  [        R                  5      n[        R                  " XR	                  SS5      5      nUb"  US S 2S S 2S S 2S UR
                  S   24   nXg-  nX`R                  -  n[        R                  " SS9" U5      nUR                  UR                  5      nU R                  U5      nUb  Xe-  n[        R                  " Xc5      nX4$ )Nr/   r0   r   )rS   r    rR   matmul	transposerb   rU   r   Softmaxr   rD   )	r[   querykeyvalueattention_mask	head_maskattn_weightscausal_maskattn_outputs	            r*   _attnCodeGenAttention._attnv   s     'ffU]]#||E==R+@A%(Aq/CIIbM/)ABK'L#oo5zzb),7#u{{3((6  '3Lll<7((r,   hidden_states
layer_pastr|   position_idsr}   	use_cacheoutput_attentionscache_positionr   .c	                 .   U R                  U5      n	Sn
U	R                  U	R                  S S U
S4-   5      nU R                  U R                  -  U
-  n[
        R                  " XSS9u  pnU R                  XR                  U R                  U
S9nU R                  XR                  U R                  U
S9nU R                  XR                  U R                  U
S9nUR                  SSSS5      nU R                  nUR                  UR                  :w  a"  UR                  UR                  5      nUU l	        UU   n[
        R                  " UUR                  S   S-  SS9u  nnU R                  b  US S 2S S 2S S 2S U R                  24   nUS S 2S S 2S S 2U R                  S 24   nUS S 2S S 2S S 2S U R                  24   nUS S 2S S 2S S 2U R                  S 24   n[        UUU5      n[        UUU5      n[
        R                  " UU/SS9n[
        R                  " UU/SS9nO[        UUU5      n[        UUU5      nUR                  SSSS5      nUR                  SSSS5      nUbI  UUU R                  US	.nUR                  UR                  UR                   5      XR"                  U5      u  pU R%                  XXU5      u  nnU R'                  UU R                  U R                  5      nU R)                  U5      nU R+                  U5      nUU4$ )
Nrk   r/   r   )re   r   r   r   r   )r&   r'   partial_rotation_sizer   )rW   ra   rb   rO   rN   r    splitrg   rm   rZ   devicerS   rY   r9   r%   updater   rG   r   rs   rX   rF   )r[   r   r   r|   r   r}   r   r   r   qkvre   	qkv_split	local_dimry   r{   rz   rZ   sincosr&   r'   k_rotk_passq_rotq_passcache_kwargsr   r~   s                              r*   forwardCodeGenAttention.forward   s    mmM*KK		#2&" =>	MMD$<$<<F	!KK	"Ec!!%)A)A4==Y_!`%=%=t}}U[\!!%)A)A4==Y_!`aAq)..!!\%8%88-001D1DEO#2D  .;;vv||B'71'<"ES??&1a!24??!223EAq$//"334F!Q#4T__#445E1aDOO$556F(S9E(S9E))UFO4CIIufo26E&sC5C(S9Ekk!Q1%aAq) !)-"0	L $**366-2E2E+F~~_klJC %)JJu5R[$\!\''T5M5Mt}}]mmK0((5L((r,   )rD   rM   rZ   rO   rG   rN   rX   rW   rF   rY   rU   N)NNNNNNFFN)rK   
__module____qualname____firstlineno__r@   rg   rs   r   r   r    FloatTensorr   
LongTensorboolr   tupleTensorr   __static_attributes____classcell__rJ   s   @r*   r;   r;   E   s<   X<
&$ )D '+6:3715$),159G) 1 12G) UOG) !!2!23	G)
 u//0G) E--.G) D>G) $D>G) !!1!12G) 
ellE%,,//0u||U5<<%8%c@Q:RRST	V
G) G)r,   r;   c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )
CodeGenMLP   c                    > [         TU ]  5         UR                  n[        R                  " X15      U l        [        R                  " X5      U l        [        UR                     U l	        [        R                  " UR                  5      U l        g r   )r?   r@   n_embdr   rV   fc_infc_outr   activation_functionactrB   rE   dropout)r[   intermediate_sizer\   rM   rJ   s       r*   r@   CodeGenMLP.__init__   s`    MM	YYy<
ii 1=&445zz&"4"45r,   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r   )r[   r   s     r*   r   CodeGenMLP.forward   s@    

=1/M2]3r,   )r   r   r   r   )rK   r   r   r   r@   r   r    r   r   r   r   r   s   @r*   r   r      s1    6Xe.?.?%@ UEVEV  r,   r   c                   f  ^  \ rS rSrSU 4S jjr       SS\\R                     S\\   S\\R                     S\\R                     S\\R                     S\\
   S	\\
   S
\\R                     S\\\R                     \\\R                  \\R                  S4   4      4   4S jjrSrU =r$ )CodeGenBlock   c                   > [         TU ]  5         UR                  b  UR                  OSUR                  -  n[        R
                  " UR                  UR                  S9U l        [        X5      U l	        [        X15      U l        g )Nrk   eps)r?   r@   n_innerr   r   	LayerNormlayer_norm_epsilonln_1r;   attnr   mlp)r[   r\   rG   	inner_dimrJ   s       r*   r@   CodeGenBlock.__init__   s_    &,nn&@FNNa&--FW	LLF4M4MN	$V7	i0r,   r   r   r|   r   r}   r   r   r   r   .c	                     Un	U R                  U5      nU R                  UUUUUUUUS9u  pU R                  U5      nX-   U	-   nX4$ )N)r   r   r|   r   r}   r   r   r   )r   r   r   )r[   r   r   r|   r   r}   r   r   r   residualattn_outputsr~   feed_forward_hidden_statess                r*   r   CodeGenBlock.forward   sl     !		-0%)YY'!)%/) &/ 	&
" &*XXm%<"$AHL**r,   )r   r   r   r   r   )rK   r   r   r   r@   r   r    r   r   r   r   r   r   r   r   r   r   r   s   @r*   r   r      s    1 '+6:3715$),159+ 1 12+ UO+ !!2!23	+
 u//0+ E--.+ D>+ $D>+ !!1!12+ 
uU\\"HU5<<uGXGXZ]G]A^3^-_$``	a+ +r,   r   c                   P   ^  \ rS rSr% \\S'   SrSrS/rSr	Sr
U 4S jrS rS	rU =r$ )
CodeGenPreTrainedModeli  r\   transformerTr   past_key_valuesc                 &   > [         TU ]  " U0 UD6  g r   )r?   r@   )r[   inputskwargsrJ   s      r*   r@   CodeGenPreTrainedModel.__init__$  s    &+F+r,   c                    [        U[        R                  45      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weights.        )meanstdNr   )
isinstancer   rV   weightdatanormal_r\   initializer_ranger>   zero_	Embeddingpadding_idxr   fill_)r[   modules     r*   _init_weights$CodeGenPreTrainedModel._init_weights'  s   fryyl++ MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r,    )rK   r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraphr@   r   r   r   r   s   @r*   r   r     s9    %&*#'("3!,* *r,   r   c                   r  ^  \ rS rSrU 4S jrS rS r\            SS\\	R                     S\\\\\\	R                        4      S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       r SS\\	R                  S4   S\	R                  S\	R                  S\S\4
S jjr\S\	R                  S\S\S\	R,                  S\	R                  S\4S j5       rSrU =r$ )CodeGenModeli8  c           
        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  U R                  5      U l        [
        R                  " UR                  5      U l
        [
        R                  " [        UR                  5       Vs/ sH  n[        XS9PM     sn5      U l        [
        R                   " U R                  UR"                  S9U l        ['        UR(                  UR*                  UR,                  -  5      U l        SU l        U R1                  5         g s  snf )N)rG   r   F)r?   r@   r   rM   
vocab_sizer   r   wterB   
embd_pdropdrop
ModuleListrangen_layerr   hr   r   ln_fminrY   n_ctxrN   gradient_checkpointing	post_init)r[   r\   irJ   s      r*   r@   CodeGenModel.__init__:  s      ++<< 1 14>>BJJv001	5QWQ_Q_K`aK`aV AK`abLLV5N5NO	f//A[A[1[\&+# 	  bs   /Ec                     U R                   $ r   r   )r[   s    r*   get_input_embeddings!CodeGenModel.get_input_embeddingsJ  s    xxr,   c                     Xl         g r   r   )r[   new_embeddingss     r*   set_input_embeddings!CodeGenModel.set_input_embeddingsM  s    !r,   	input_idsr   r|   token_type_idsr   r}   inputs_embedsr   r   output_hidden_statesreturn_dictr   r   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      n[        U[        S5      [        45      (       d  [        S5      eU(       a  Uc
  [        5       nUR                  S   nUc7  Ub  UR!                  5       OSn["        R$                  " XU-   UR&                  S9nUc  UR)                  S5      nU R+                  X7XU	5      nU R-                  X`R                   R.                  5      nUnUb(  UR1                  S	U5      nU R                  U5      nUU-   nU R3                  U5      nS	UUR5                  S	5      4nU	(       a  S
OSnU
(       a  S
OSn[7        U R8                  5       H:  u  nnU
(       a  UU4-   nU" UUUUUU   UU	US9nUS   nU	(       d  M1  UUS   4-   nM<     U R;                  U5      nUR1                  U5      nU
(       a  UU4-   nU(       d  [=        S UUUU4 5       5      $ [?        UUUUS9$ )au  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r   r   r/   r   )r   r|   r   r}   r   r   r   c              3   ,   #    U H  oc  M  Uv   M     g 7fr   r   ).0vs     r*   	<genexpr>'CodeGenModel.forward.<locals>.<genexpr>  s      cacs   	)last_hidden_stater   r   
attentions) r\   r   r  r   use_return_dictrP   r   trainingrH   rI   r   r   typer   r	   rb   get_seq_lengthr    r!   r   	unsqueeze_update_causal_maskget_head_maskr   rp   r   ro   	enumerater   r   r   r   )r[   r   r   r|   r  r   r}   r  r   r   r  r  r   r   
seq_lengthpast_seen_tokensr   r   token_type_embedsoutput_shapeall_self_attentionsall_hidden_statesr   blockoutputss                            r*   r   CodeGenModel.forwardP  s   . 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  HHY/M /DJ+>??abb0*nO"((+
!CRC^==?de"\\*:z<YbobvbvwN)33A6L..>L]
 &&y++2E2EF	%%+00Z@N $ 8),==M		-0J(:(:2(>?$5b4"6BD!$&&)HAu#$58H$H!**)#A,#"3-	G $AJM  &9WQZM&I## *& 		-0%**<8 1]4D D )?<MObc   '+++*	
 	
r,   r   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r   flex_attentionr   Fsdpa)r  past_key_values_lengthis_trainingr   r/   )sequence_lengthtarget_lengthr   r   
batch_size)cudaxpunpu)r\   _attn_implementationanyr   r    r   r   r  is_compileabler   _ignore_causal_mask_sdpar  r   rb   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r  finfor   _unmask_unattended)r[   r|   r  r   r   r   r  using_compilable_cacher   r&  r'  r   	min_dtypes                r*   r   CodeGenModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr,   r&  r'  r   r(  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nrk   )
fill_valuer   r   r   )diagonalr  r/   r   )r   r    r2  r   fullr   triur!   ra   expandclonerb   rS   masked_fill)r|   r&  r'  r   r   r(  r   r   r5  mask_lengthpadding_masks              r*   r1  BCodeGenModel._prepare_4d_causal_attention_mask_with_cache_position
  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r,   )r   rM   r   r   r   rY   r   r   )NNNNNNNNNNNN)F)rK   r   r   r   r@   r   r   r   r   r    r   r   r   r   r   r   r   r   r   r  staticmethodintr   r1  r   r   r   s   @r*   r   r   8  s    "  15NR6:59371559$(,0/3&*59r
E,,-r
 "%uU5<<5H/I(I"JKr
 !!2!23	r

 !!1!12r
 u//0r
 E--.r
   1 12r
 D>r
 $D>r
 'tnr
 d^r
 !!1!12r
 
u--	.r
 r
v #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r,   r   zM
    The CodeGen Model transformer with a language modeling head on top.
    )custom_introc                      ^  \ rS rSrS/rU 4S jr\             SS\\R                     S\\
\\\\R                        4      S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\
\\4   4S jj5       rSrU =r$ )CodeGenForCausalLMiC  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        U R                  5         g r   )
r?   r@   r   r   r   rV   r   r   lm_headr   )r[   r\   rJ   s     r*   r@   CodeGenForCausalLM.__init__K  sE     '/yy0A0AB 	r,   r   r   r|   r  r   r}   r  labelsr   r   r  r  r   r   c                 (   Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUUS9nUS   nU R                  U5      R	                  [
        R                  5      nSnUb`  UR	                  UR                  5      nU R                  " UU4SU R                   R                  0UD6nUR	                  UR                  5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)r   r|   r  r   r}   r  r   r   r  r  r   r   r   r   )losslogitsr   r   r  )r\   r  r   rH  rS   r    rR   r   loss_functionr   r   r   r   r   r  )r[   r   r   r|   r  r   r}   r  rJ  r   r   r  r  r   r   transformer_outputsr   	lm_logitsrL  outputs                       r*   r   CodeGenForCausalLM.forwardS  sD   8 &1%<k$++B]B]"..+))%'/!5#) / 
 ,A.
 LL/225==A	YYy//0F%%  ;;11 	D 77=../D\$7$;;F)-)9TGf$EvE%/??-;;*55
 	
r,   )rH  r   )NNNNNNNNNNNNN)rK   r   r   r   _tied_weights_keysr@   r   r   r    r   r   r   r   r   r   r   r   r   r   r   r   s   @r*   rF  rF  C  sz    ++  15NR6:59371559-1$(,0/3&*59J
E,,-J
 "%uU5<<5H/I(I"JKJ
 !!2!23	J

 !!1!12J
 u//0J
 E--.J
   1 12J
 ))*J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
  
u,,	-!J
 J
r,   rF  )rF  r   r   )1__doc__typingr   r   r    torch.utils.checkpointr   activationsr   cache_utilsr   r	   
generationr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   configuration_codegenr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerrK   rH   rC  r   r+   r5   r9   Moduler;   r   r   r   r   rF  __all__r   r,   r*   <module>re     sn    "    ! . ) > 9 O - 
 1  !!;J 
		H	%P P3 P5<< P  = =ELL =u|| =X]XdXd =W)ryy W)v (#+- #+L *_ * *: G) G GT 
V
/ V

V
r Kr,   