
"""PyTorch MPT model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_mpt import MptConfig


logger = logging.get_logger(__name__)


def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=None):
    r"""
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper
    mentions, it relies on a translation invariance of softmax for quick implementation. This implementation has been
    copied from the alibi implementation of MPT source code that led to slightly different results than the Bloom
    alibi: https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    """
    alibi = torch.arange(1 - sequence_length, 1, dtype=torch.int32, device=device).view(1, 1, 1, sequence_length)
    num_heads_power_of_2 = 2 ** math.ceil(math.log2(num_heads))

    base = torch.arange(1, num_heads_power_of_2 + 1, dtype=torch.int64, device=device).float()
    base = base * (alibi_bias_max / num_heads_power_of_2)

    slopes = 1.0 / torch.pow(2, base)
    slopes = slopes.view(1, num_heads_power_of_2, 1, 1)

    if num_heads_power_of_2 != num_heads:
        slopes = torch.concat([slopes[:, 1::2, ...], slopes[:, ::2, ...]], dim=1)[:, :num_heads, ...]

    alibi = alibi * slopes
    return alibi.squeeze(0)
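
# A quick sanity check for the tensor built above -- a minimal sketch, assuming a head count
# that is not a power of two so the slope-interleaving branch is exercised:
#
#     alibi = build_mpt_alibi_tensor(num_heads=6, sequence_length=4)
#     assert alibi.shape == (6, 1, 4)
#     # Row i holds slope_i * [-3., -2., -1., 0.]: a per-head linear penalty on distance,
#     # added to the attention scores instead of using positional embeddings.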


class MptAttention(nn.Module):
    """Multi-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    """

    def __init__(self, config: MptConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.n_heads = config.n_heads
        self.max_seq_length = config.max_seq_len
        self.head_dim = self.hidden_size // self.n_heads
        self.softmax_scale = config.attn_config.softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.hidden_size / self.n_heads)

        self.attn_dropout_p = config.attn_config.attn_pdrop
        self.clip_qkv = config.attn_config.clip_qkv
        self.Wqkv = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_bias: torch.Tensor,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        mixed_qkv = self.Wqkv(hidden_states)
        if self.clip_qkv:
            mixed_qkv = mixed_qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)

        query_states, key_states, value_states = mixed_qkv.chunk(3, dim=2)
        query_states = query_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale

        query_length = seq_length if past_key_value is None else seq_length + past_key_value.get_seq_length()

        if position_bias is not None:
            if len(position_bias.shape) != 3:
                raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}")
            key_length = key_states.shape[-2]

            position_bias_query_index = max(0, position_bias.size(1) - query_length)
            position_bias_key_index = max(0, position_bias.size(2) - key_length)

            position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:]
            attention_scores = attention_scores + position_bias

        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min)

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training)

        context_states = torch.matmul(attn_weights, value_states)
        context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
        attn_output = self.out_proj(context_states)

        return attn_output, attn_weights


class MptMLP(nn.Module):
    def __init__(self, config: MptConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.up_proj = nn.Linear(hidden_size, 4 * hidden_size, bias=False)
        self.act = nn.GELU(approximate="none")
        self.down_proj = nn.Linear(4 * hidden_size, hidden_size, bias=False)
        self.hidden_dropout = config.attn_config.attn_pdrop

    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        hidden_states = self.act(self.up_proj(hidden_states))

        intermediate_output = self.down_proj(hidden_states)

        output = F.dropout(intermediate_output, p=self.hidden_dropout, training=self.training)
        output = output + residual

        return output


class MptBlock(GradientCheckpointingLayer):
    def __init__(self, config: MptConfig, layer_idx: Optional[int] = None):
        super().__init__()
        hidden_size = config.hidden_size

        self.norm_1 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_1.bias = None

        self.num_heads = config.n_heads
        self.attn = MptAttention(config, layer_idx)

        self.norm_2 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_2.bias = None

        self.ffn = MptMLP(config)

        self.dropout_rate = config.attn_config.attn_pdrop
        self.resid_attn_dropout = nn.Dropout(self.dropout_rate)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_bias: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Cache] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ):
        # hidden_states: [batch_size, seq_length, hidden_size]
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.norm_1(hidden_states)

        residual = hidden_states

        # Self attention.
        attn_outputs, attn_weights = self.attn(
            layernorm_output,
            position_bias=position_bias,
            attention_mask=attention_mask,
            past_key_value=layer_past,
            cache_position=cache_position,
        )

        hidden_states = self.resid_attn_dropout(attn_outputs) + residual

        layernorm_output = self.norm_2(hidden_states)

        # Get residual
        residual = hidden_states

        # MLP.
        output = self.ffn(layernorm_output, residual)
        outputs = (output,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class MptPreTrainedModel(PreTrainedModel):
    config: MptConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MptBlock"]
    _keys_to_ignore_on_load_missing = [r"lm_head.*."]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module: nn.Module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, LayerNorm):
            if module.bias is not None:
                module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _convert_to_mpt_cache(
        past_key_value: tuple[tuple[torch.Tensor, torch.Tensor]],
    ) -> tuple[tuple[torch.Tensor, torch.Tensor]]:
        """
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        """
        batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
        batch_size_times_num_heads = batch_size * num_heads
        return tuple(
            (
                layer_past[0].reshape(batch_size_times_num_heads, head_dim, seq_length),
                layer_past[1].reshape(batch_size_times_num_heads, seq_length, head_dim),
            )
            for layer_past in past_key_value
        )


@auto_docstring
class MptModel(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)

        self.hidden_size = config.hidden_size
        self.num_heads = config.n_heads

        # Embedding + LN Embedding
        self.wte = nn.Embedding(config.vocab_size, self.hidden_size)

        # Transformer blocks
        self.blocks = nn.ModuleList([MptBlock(config, layer_idx=i) for i in range(config.n_layers)])

        # Final Layer Norm
        self.norm_f = LayerNorm(self.hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_f.bias = None

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def build_mpt_alibi_tensor(self, num_heads, sequence_length, alibi_bias_max=8, device=None):
        return build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max, device)

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `DynamicCache` instead, e.g. "
                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)

        hidden_states = inputs_embeds

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        seq_length_with_past = seq_length + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        alibi = self.build_mpt_alibi_tensor(self.num_heads, self.config.max_seq_len, device=hidden_states.device)

        causal_mask = _prepare_4d_causal_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )
        causal_mask = causal_mask.bool()

        for block in self.blocks:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=causal_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                position_bias=alibi,
                cache_position=cache_position,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        # Add last hidden state
        hidden_states = self.norm_f(hidden_states)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring(
    custom_intro="""
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class MptForCausalLM(MptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_output_embeddings(self, new_embeddings: torch.Tensor):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
            `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            loss = self.loss_function(lm_logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row.
    If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess
    the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value
    in each row of the batch).
    """
)
class MptForSequenceClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = MptModel(config)
        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class MptForTokenClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = MptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[tuple[tuple[torch.Tensor, torch.Tensor], ...], Cache]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            batch_size, seq_length = labels.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class MptForQuestionAnswering(MptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, splitting adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "MptForCausalLM",
    "MptModel",
    "MptPreTrainedModel",
    "MptForSequenceClassification",
    "MptForTokenClassification",
    "MptForQuestionAnswering",
]