
"""PyTorch BioGPT model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, is_torch_flex_attn_available, logger
from ..bart.modeling_bart import BartAttention, BartDecoderLayer, BartScaledWordEmbedding
from ..opt.modeling_opt import OPTLearnedPositionalEmbedding
from .configuration_biogpt import BioGptConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask


class BioGptLearnedPositionalEmbedding(OPTLearnedPositionalEmbedding):
    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        return super().forward(attention_mask, past_key_values_length, position_ids)


class BioGptScaledWordEmbedding(BartScaledWordEmbedding):
    pass


class BioGptAttention(BartAttention):
    pass


class BioGptDecoderLayer(BartDecoderLayer):
    def __init__(self, config: BioGptConfig, layer_idx: Optional[int] = None):
        super().__init__(config)
        self.embed_dim = config.hidden_size

        self.self_attn = BioGptAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
            is_causal=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)

        # BioGPT is decoder-only: the cross-attention modules inherited from Bart are unused.
        del self.encoder_attn
        del self.encoder_attn_layer_norm

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            position_ids=position_ids,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
    config: BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            elif attention_mask is None:
                attention_mask = make_flex_block_causal_mask(
                    torch.ones(
                        size=(input_tensor.shape[0], input_tensor.shape[1]),
                        device=input_tensor.device,
                    )
                )
            return attention_mask

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's
        # forward, so the mask can sometimes be skipped entirely.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows, for example when using left padding: unmasked rows are
            # needed for memory-efficient attention kernels.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList(
            [BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
            )
            use_cache = False

        # initialize `past_key_values`, converting a legacy cache tuple if one was passed
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        batch_size, seq_length = inputs_embeds.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = (
            past_key_values.self_attention_cache
            if isinstance(past_key_values, EncoderDecoderCache)
            else past_key_values
        )

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, self_attn_cache)

        if position_ids is None:
            # derive position ids from the attention mask, then cut positions if `past_key_values_length > 0`
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            position_ids = position_ids[:, past_key_values_length:]

        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                position_ids=position_ids,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@auto_docstring
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_length = -1
        else:
            if input_ids is not None:
                sequence_length = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                sequence_length = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_length]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value


__all__ = [
    "BioGptForCausalLM",
    "BioGptForTokenClassification",
    "BioGptForSequenceClassification",
    "BioGptModel",
    "BioGptPreTrainedModel",
]