"""PyTorch BioGPT model."""

import math
from typing import Callable, Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, is_torch_flex_attn_available, logging
from .configuration_biogpt import BioGptConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class BioGptLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
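
    Example (illustrative sketch of the position indices computed in `forward`; the `offset` of 2 is
    added to these indices before the embedding lookup):

    ```python
    >>> import torch
    >>> attention_mask = torch.tensor([[0, 1, 1, 1]])  # one left-padded sequence
    >>> positions = (torch.cumsum(attention_mask, dim=1) * attention_mask - 1).long()
    >>> positions  # padding gets -1, real tokens count from 0
    tensor([[-1,  0,  1,  2]])
    ```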
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # BioGPT offsets the embedding ids by 2 to reserve room for the padding entries,
        # so the table is allocated with `num_embeddings + offset` rows.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        if position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_key_values_length:]

        return super().forward(position_ids + self.offset)


class BioGptScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embedding's forward by multiplying with embeddings scale.
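
    Example (illustrative sketch):

    ```python
    >>> import torch
    >>> emb = BioGptScaledWordEmbedding(num_embeddings=10, embedding_dim=4, padding_idx=0, embed_scale=2.0)
    >>> emb(torch.tensor([[1, 2, 3]])).shape  # same as nn.Embedding, values scaled by embed_scale
    torch.Size([1, 3, 4])
    ```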
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    # standard scaled dot-product attention with an additive mask
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    if head_mask is not None:
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BioGptConfig] = None,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided, this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        # determine input shapes
        bsz, tgt_len = hidden_states.shape[:-1]
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len

        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
        kv_input_shape = (bsz, src_len, -1, self.head_dim)

        # get query proj
        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
            value_states = value_states.view(*kv_input_shape).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to the cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # flag that this layer's cross-attention cache is filled so it can be re-used in later calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            output_attentions=output_attentions,
            head_mask=layer_head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
   S	\\   S
\\   S\\R                     S\\R                     S\\   S\\R                   \\\R                   \R                   4      4   4S jjrSrU =r$ )BioGptDecoderLayer   rt   ru   c           
      p  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l	        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [        R"                  " U R                  UR$                  5      U l        [        R"                  " UR$                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)ro   rp   rX   rq   rs   rt   ru   )r(   r)   hidden_sizero   rm   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrX   r
   
hidden_actactivation_fnactivation_dropoutrc   	LayerNormself_attn_layer_normr|   intermediate_sizefc1fc2final_layer_norm)r*   rt   ru   r+   s      r,   r)   BioGptDecoderLayer.__init__   s    ++(nn0077
 11#F$5$56"(";";$&LL$@!99T^^V-E-EF99V55t~~F "T^^ <r.   r   r/   r   r   r   	use_cacher1   r   rh   r   c	                 J   Un
U R                  U5      nU R                  " SUUUUUUUS.U	D6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUn
U R                  U5      nU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU4nU(       a  X4-  nU$ )ay  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        """
        residual = hidden_states

        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            position_ids=position_ids,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
    config: BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            elif attention_mask is None:
                attention_mask = make_flex_block_causal_mask(
                    torch.ones((input_tensor.shape[0], input_tensor.shape[1]), device=input_tensor.device)
                )
            return attention_mask

        if self.config._attn_implementation == "flash_attention_2":
            # flash attention takes the 2D padding mask as-is; an all-ones mask can be dropped entirely
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Unmask fully masked rows to avoid NaN propagation through SDPA backends
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
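
        Example (illustrative sketch; builds the mask for a fresh 4-token prefill with no padding):

        ```python
        >>> import torch
        >>> mask = BioGptPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position(
        ...     attention_mask=torch.ones(1, 4),
        ...     sequence_length=4,
        ...     target_length=4,
        ...     dtype=torch.float32,
        ...     cache_position=torch.arange(4),
        ...     batch_size=1,
        ... )
        >>> mask.shape
        torch.Size([1, 1, 4, 4])
        >>> (mask[0, 0] == 0).equal(torch.ones(4, 4).tril().bool())  # visible at or before the diagonal
        True
        ```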
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList(
            [BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        batch_size, seq_length = inputs_embeds.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = (
            past_key_values.self_attention_cache
            if isinstance(past_key_values, EncoderDecoderCache)
            else past_key_values
        )

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, self_attn_cache)

        if position_ids is None:
            # create position_ids from the attention mask, mirroring the embedding layer
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            position_ids = position_ids[:, past_key_values_length:]

        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                # LayerDrop: randomly skip whole layers during training
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                layer_head_mask=head_mask[idx] if head_mask is not None else None,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                position_ids=position_ids,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
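
    Example (a minimal generation sketch using the public `microsoft/biogpt` checkpoint):

    ```python
    >>> from transformers import AutoTokenizer, BioGptForCausalLM

    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
    >>> model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

    >>> inputs = tokenizer("COVID-19 is", return_tensors="pt")
    >>> output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
    >>> tokenizer.decode(output_ids[0], skip_special_tokens=True)  # doctest: +SKIP
    ```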
    """
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
            `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@auto_docstring
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep the active parts of the loss (positions covered by the attention mask)
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # find the last non-padding token in each row
                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                sequence_lengths = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value


__all__ = [
    "BioGptForCausalLM",
    "BioGptForTokenClassification",
    "BioGptForSequenceClassification",
    "BioGptModel",
    "BioGptPreTrainedModel",
]