
    <h@                       S r SSKrSSKJrJr  SSKrSSKrSSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&  \$RN                  " \(5      r)S\RT                  S\+S\+4S jr, " S S\RZ                  5      r. " S S\R^                  5      r0 " S S\5      r1 " S S\5      r2 " S S\R^                  5      r3 " S S \R^                  5      r4\# " S! S"\!5      5       r5 " S# S$\55      r6 " S% S&\55      r7\# " S' S(\55      5       r8\#" S)S*9 " S+ S,\5\5      5       r9\#" S-S*9 " S. S/\55      5       r:\# " S0 S1\55      5       r; " S2 S3\55      r< " S4 S5\5\5      r=/ S6Qr>g)7zPyTorch MVP model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r   r   shifted_input_idss       \/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_rightr)   3   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                      ^  \ rS rSrSrS\S\4U 4S jjrSS\R                  S\S\R                  4U 4S	 jjjr	S
r
U =r$ )MvpLearnedPositionalEmbeddingD   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g N   )offsetsuper__init__)selfr.   r/   	__class__s      r(   r5   &MvpLearnedPositionalEmbedding.__init__I   s"     ++5}Er*   r   past_key_values_lengthposition_idsc                   > Uc]  UR                   SS u  pE[        R                  " X"U-   [        R                  U R                  R
                  S9R                  US5      nOUR                  S5      n[        TU ]%  X0R                  -   5      $ )z3`input_ids' shape is expected to be [bsz x seqlen].Nr2   )dtypedevicer!   r   )r#   torcharangelongweightr=   expand	unsqueezer4   forwardr3   )r6   r   r9   r:   bszseq_lenr7   s         r(   rD   %MvpLearnedPositionalEmbedding.forwardO   s     $??2A.LC <<&(HPUPZPZcgcncncucufS"o  (11!4Lw|kk9::r*   )r3   )r   N)__name__
__module____qualname____firstlineno____doc__intr5   r>   TensorrD   __static_attributes____classcell__r7   s   @r(   r,   r,   D   sH    Fs F3 F; ;s ;^c^j^j ; ;r*   r,   c                     ^  \ rS rSrSr    SS\S\S\\   S\\   S\\   S\\   4U 4S	 jjjr	       SS
\
R                  S\\
R                     S\\   S\\
R                     S\\
R                     S\\
R                     S\S\\
R                     S\\
R                  \\
R                     \\\
R                        4   4S jjrSrU =r$ )MvpAttention]   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        X`l	        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rY   )r4   r5   rU   rV   rW   head_dimr%   scalingrX   rZ   r   Lineark_projv_projq_projout_proj)r6   rU   rV   rW   rX   rY   rZ   r7   s          r(   r5   MvpAttention.__init__`   s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr*   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskattn_promptoutput_attentionscache_positionreturnc	                 \   USLn	UR                  5       u  pnU R                  U5      U R                  -  nUb]  [        U[        5      (       aF  UR
                  R                  U R                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       aQ  UbN  W(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUbN  U	(       d  UOSnWR'                  UUU R                  SU05      u  nnU	(       a  SUR
                  U R                  '   Ub  [(        R*                  " US   R-                  U
SSS5      U/SS9n[(        R*                  " US   R-                  U
SSS5      U/SS9nUbZ  [(        R.                  " U
SXS   R                  S5      5      R1                  UR2                  5      n[(        R*                  " UU/SS9nXR                   -  SU R"                  4nUR                  XU R                   U R"                  5      R%                  SS5      nUR4                  " U6 nUR4                  " U6 nUR4                  " U6 nUR                  S5      n[(        R6                  " UUR%                  SS5      5      nUR                  5       XR                   -  UU4:w  a.  [9        S	XR                   -  UU4 S
UR                  5        35      eUbz  UR                  5       U
SUU4:w  a#  [9        SU
SUU4 S
UR                  5        35      eUR                  XR                   UU5      U-   nUR                  XR                   -  UU5      n[:        R<                  R?                  USS9nUb  UR                  5       U R                   4:w  a*  [9        SU R                   4 S
UR                  5        35      eUR                  SSSS5      UR                  XR                   UU5      -  nUR                  XR                   -  UU5      nU(       a=  UR                  XR                   UU5      nUR                  XR                   -  UU5      nOSn[:        R<                  RA                  UU R@                  U RB                  S9n[(        R6                  " UU5      nUR                  5       XR                   -  XR"                  4:w  a5  [9        SXR                   XR"                  4 S
UR                  5        35      eUR                  XR                   XR"                  5      nUR%                  SS5      nUR5                  XU RD                  5      nU RG                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNr!   r   r2   rl   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )$sizerb   r^   
isinstancer   
is_updatedgetrZ   cross_attention_cacheself_attention_cachelayerskeysvaluesr`   ra   viewrV   r]   	transposeupdater>   catrB   zerostor=   reshapebmmr%   r   
functionalsoftmaxrW   rs   rU   rc   )r6   re   rf   rg   rh   ri   rj   rk   rl   is_cross_attentionrE   tgt_len_query_statesrv   curr_past_key_valuecurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                             r(   rD   MvpAttention.forward}   s    .T9',,.a {{=1DLL@%.*=>>+66::4>>J
%*8*N*N'*8*M*M'&4#-?)]."<,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn="KN$9$9#r2r$JJ#W]^_J 99k!n&;&;CR&Ll%[abcL)#kk#q'q>;N;Nq;QRUUVdVkVkl!&K+Hr!SNN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK0111r*   )rW   rU   r]   rX   r`   rZ   rV   rc   rb   r^   ra   )        FTN)NNNNNFN)rH   rI   rJ   rK   rL   rM   r   floatboolr5   r>   rN   r   tuplerD   rO   rP   rQ   s   @r(   rS   rS   ]   sQ   G $'%*#$(CC C %	C
 TNC tnC D>C C@ 48*.1526.2"'15|2|||2 #5<<0|2 !	|2
 !.|2 "%,,/|2 ell+|2  |2 !.|2 
u||Xell3XeELL>Q5RR	S|2 |2r*   rS   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\R                  S\\	   S	\
\R                  \\R                     4   4S
 jjrSrU =r$ )MvpEncoderLayer   configc                 h  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  S9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )N)rU   rV   rW   )r4   r5   d_modelrU   rS   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrW   r
   activation_functionactivation_fnactivation_dropoutr_   encoder_ffn_dimfc1fc2final_layer_normr6   r   r7   s     r(   r5   MvpEncoderLayer.__init__   s    %nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r*   re   rh   ri   self_attn_promptrk   rm   c                    UnU R                  UUUUUS9u  p[        R                  R                  XR                  U R                  S9nXa-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nXa-   nU R                  U5      nUR                  [        R                  :X  a  [        R                  " U5      R                  5       (       d)  [        R                   " U5      R                  5       (       aC  [        R"                  " UR                  5      R$                  S-
  n[        R&                  " X* US9nX4$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
        `(2, encoder_attention_heads, pro_len, head_dim)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)re   rh   ri   rj   rk   rq   i  )minmax)r   r   r   rW   rs   r   r   r   r   r   r   r<   r>   float16isinfanyisnanfinfor   clamp)	r6   re   rh   ri   r   rk   residualr   clamp_values	            r(   rD   MvpEncoderLayer.forward  su   * !&*nn')+(/ '5 '
# --m||VZVcVc-d 011-@ **488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m<%--/KK&**,,M0J0N0N0P0P++m&9&9:>>EK!KK<[YM**r*   )	r   r   rW   rU   r   r   r   r   r   )F)rH   rI   rJ   rK   r   r5   r>   FloatTensorr   r   r   rD   rO   rP   rQ   s   @r(   r   r      s    =y =, -2/+((/+ ))/+ **	/+
  ++/+ $D>/+ 
u  (5+<+<"==	>/+ /+r*   r   c                      ^  \ rS rSrSS\4U 4S jjjr           SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\	   S\\
   S\\
   S\\R                     S\\R                  \\\R                  \R                  4      4   4S jjrSrU =r$ )MvpDecoderLayeri?  r   c                   > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  SUS9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)rU   rV   rW   rX   rZ   )rW   rX   rZ   )r4   r5   r   rU   rS   decoder_attention_headsr   r   rW   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr_   decoder_ffn_dimr   r   r   )r6   r   rZ   r7   s      r(   r5   MvpDecoderLayer.__init__@  s   %nn44,,
 ~~#F$>$>?"(";";$&LL$@!(NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r*   re   rh   encoder_hidden_statesencoder_attention_maskri   cross_attn_layer_head_maskr   cross_attn_promptrg   rk   	use_cacherl   rm   c           
         UnU R                  UU	UUUU
US9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nSnUb`  UnU R                  UUUUUU	U
S9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU4nU
(       a  UX4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size `(decoder_attention_heads,)`.
    self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
        `(2, decoder_attention_heads, pro_len, head_dim)`.
    cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
        `(2, decoder_attention_heads, pro_len, head_dim)`.
    past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)re   rg   rh   ri   rj   rk   rl   rq   N)re   rf   rh   ri   rj   rg   rk   )r   r   r   rW   rs   r   r   r   r   r   r   r   r   )r6   re   rh   r   r   ri   r   r   r   rg   rk   r   rl   r   self_attn_weightscross_attn_weightsoutputss                    r(   rD   MvpDecoderLayer.forward\  s   J ! ,0>>'))+(/) ,: ,
( --m||VZVcVc-d 011-@ " ,$H040A0A+!65 :--"3 1B 1-M MM11-<<Z^ZgZg1hM$4M 88GM !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m< ")>>Gr*   )r   r   rW   rU   r   r   r   r   r   r   r   N)NNNNNNNNFTN)rH   rI   rJ   rK   r   r5   r>   rN   r   r   r   r   r   rD   rO   rP   rQ   s   @r(   r   r   ?  sS   =y = => 268<9=26=A3748*.,1$(15U||U !.U  (5	U
 !) 6U "%,,/U %-U\\$:U #5<<0U $ELL1U !U $D>U D>U !.U 
u  (51B1BEDUDU1U+V"WW	XU Ur*   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jr
SrU =r$ )MvpClassificationHeadi  z-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " US9U l        [        R                  " X#5      U l        g )Nrr   )r4   r5   r   r_   denseDropoutrW   rc   )r6   r   r   r   r   r7   s        r(   r5   MvpClassificationHead.__init__  s@     	YYy4
zzN3		)9r*   re   rm   c                     U R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r   )rW   r   r>   tanhrc   )r6   re   s     r(   rD   MvpClassificationHead.forward  sN    ]3

=1

=1]3m4r*   )r   rW   rc   )rH   rI   rJ   rK   rL   rM   r   r5   r>   rN   rD   rO   rP   rQ   s   @r(   r   r     sQ    7
:
: 
: 	
:
 
:U\\ ell  r*   r   c                   l   ^  \ rS rSrSrU 4S jrS\R                  S\\R                     4S jr	Sr
U =r$ )	MvpPrompti  z)Layer-wise prompt for encoder or decoder.c           	      :  > [         TU ]  5         UR                  U l        X l        X0l        UR
                  U-  U l        [        R                  " UR                  S9U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " [        R                  " UR
                  UR                  5      [        R                  " 5       [        R                  " UR                  US-  UR
                  -  5      5      U l        g )Nr   r2   )r4   r5   prompt_length
num_layersrV   r   r]   r   r   rW   	Embeddingprompt_embedding
Sequentialr_   prompt_mid_dimGELUprompt_trans)r6   r   r   rV   r7   s       r(   r5   MvpPrompt.__init__  s    #11$")3zzFNN3 "V-A-A6>> RMMIIfnnf&;&;<GGIIIf++Z!^fnn-LM
r*   
prompt_idsrm   c                 *   U R                  U R                  U5      5      nUR                  U R                  U R                  S-  U R
                  U R                  5      nU R                  U5      nUR                  / SQ5      R                  S5      nU$ )Nr2   )r   r2   r   r	   )
r   r   r}   r   r   rV   r]   rW   permutesplit)r6   r   prompts      r(   rD   MvpPrompt.forward  sw    ""4#8#8#DET//11DdnnVZVcVcdf%-33A6r*   )rW   r]   rV   r   r   r   r   )rH   rI   rJ   rK   rL   r5   r>   rN   r   rD   rO   rP   rQ   s   @r(   r   r     s0    3
%,, 53F  r*   r   c                   >    \ rS rSr% \\S'   SrSrS r\	S 5       r
Srg)	MvpPreTrainedModeli  r   modelTc                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Nr   )meanstd)r   init_stdru   r   r_   rA   datanormal_rY   zero_r   padding_idx)r6   moduler   s      r(   _init_weights MvpPreTrainedModel._init_weights  s    kk""fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r*   c                     U R                   R                  n[        R                  " / SQSSSSU//U R                  S9nUR                  U5      US.nU$ )N)r      
      r2   r         r2   r=   )rh   r   )r   r   r>   tensorr=   ne)r6   	pad_tokenr   dummy_inputss       r(   r  MvpPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r*    N)rH   rI   rJ   rK   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   propertyr  rO   r
  r*   r(   r   r     s-    &*#	?  r*   r   c                   "  ^  \ rS rSrSr SS\S\\R                     S\\	   4U 4S jjjr
       SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\\	   S\\\4   4S jjrSrU =r$ )
MvpEncoderi  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MvpEncoderLayer`].

Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
r   embed_tokens
use_promptc                 N  > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSU l        Ub  X l        O0[        R                   " UR"                  X@R                  5      U l        [%        UR                  U5      U l        [        R(                  " [+        UR,                  5       Vs/ sH  n[/        U5      PM     sn5      U l        [        R2                  " U5      U l        X0l        U(       a7  UR8                  U l        [;        UUR,                  UR<                  5      U l        SU l         U RC                  5         g s  snf )N      ?F)"r4   r5   rW   encoder_layerdrop	layerdropr   r   r   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler  r   r   
vocab_sizer,   embed_positions
ModuleListrangeencoder_layersr   rz   r   layernorm_embeddingr  r   r   r   r   gradient_checkpointing	post_init)r6   r   r  r  rU   r   r7   s         r(   r5   MvpEncoder.__init__  sD    	 ~~11NN	!..$*$B$B!393I3I499Y/s# , "V->->	K[K[ \D<** 
 mmeFLaLaFb$cFb_V%<Fb$cd#%<<	#: $!'!5!5D$-%%..%D! ',# %ds   F"r   rh   	head_maskinputs_embedsrk   output_hidden_statesreturn_dictrm   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb$  UnUR
                  n	UR                  SU	S   5      nO.Ub   UR                  5       SS n	USS2SS2S4   nO[	        S5      eUc  U R                  U5      U R                  -  nU R                  U5      n
XJ-   nU R                  U5      n[        R                  R                  XR                  U R                  S9nU R                   (       aJ  ["        R$                  " U R&                  5      R)                  U R*                  5      nU R-                  U5      nUb  [/        X$R0                  5      nU(       a  SOSnU(       a  SOSnUb`  UR                  5       S   [3        U R4                  5      :w  a6  [	        S[3        U R4                  5       S	UR                  5       S    S
35      e[7        U R4                  5       H  u  nnU(       a  X4-   nSnU R                  (       a(  ["        R8                  " / 5      nUU R:                  :  a  SnU(       a  SnO-U" UUUb  UU   OSU R                   (       a  WU   OSUS9nUS   nU(       d  M  UUS   4-   nM     U(       a  X4-   nU(       d  [=        S XU4 5       5      $ [?        XUS9$ )a  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzDYou cannot specify both input_ids and inputs_embeds at the same timer!   z5You have to specify either input_ids or inputs_embedsrq   r
  r   z&The head_mask should be specified for  layers, but it is for .FT)NN)ri   r   rk   r   c              3   ,   #    U H  oc  M  Uv   M     g 7fr   r
  .0vs     r(   	<genexpr>%MvpEncoder.forward.<locals>.<genexpr>  s     e$Sq$Ss   	last_hidden_statere   
attentions) r   rk   r(  use_return_dictr%   r#   r}   rt   r  r  r  r"  r   r   rW   rs   r  r>   r?   r   r   r=   r   r   r<   lenrz   	enumeraterandr  r   r   )r6   r   rh   r&  r'  rk   r(  r)  inputinput_shape	embed_posre   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputss                        r(   rD   MvpEncoder.forward3  s   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  ]%>cdd"E++K!r;r?;I&',,.s3K!!Q(+ETUU  --i84;K;KKM((/	%100?--m||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ %7H[H[\N30d  ~~"s4;;'78 <S=M<N O!(+,A/ 
 #,DKK"8C#!/2B!BG}}&+jjn#&7"G , -!"7@7LYs^RV?C&6s&;TX&7! !.a 0  !/=3C2E!E1 #94  +.>>Ne]N$Seee+Vd
 	
r*   )rW   r  r  r  r#  r  r"  rz   r  r   r   r   r  NF)NNNNNNN)rH   rI   rJ   rK   rL   r   r   r   r   r   r5   r>   
LongTensorrN   r   r   r   r   rD   rO   rP   rQ   s   @r(   r  r    s     lq$$/7/E$ZbcgZh$ $P 1515,059,0/3&*@
E,,-@
 !.@
 ELL)	@

   1 12@
 $D>@
 'tn@
 d^@
 
uo%	&@
 @
r*   r  c                     ^  \ rS rSrSr SS\S\\R                     S\\	   4U 4S jjjr
             SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\\R                        S\\R                     S\\	   S\\	   S\\	   S\\	   S\\R                     S\\\4   4S jjrSrU =r$ )
MvpDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
r   r  r  c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        Ub  X l        O;[        R                   " UR"                  UR                  U R                  5      U l        [%        UR                  UR                  5      U l        [        R(                  " [+        UR,                  5       Vs/ sH  n[/        XS9PM     sn5      U l        [        R2                  " UR                  5      U l        X0l        U(       a]  UR8                  U l        [;        UUR,                  UR<                  5      U l        [;        UUR,                  UR<                  5      U l         SU l!        U RE                  5         g s  snf )Nr  )rZ   F)#r4   r5   rW   decoder_layerdropr  r   r   r  max_target_positionsr  r  r  r   r  r  r   r   r  r,   r  r  r   decoder_layersr   rz   r   r"  r  r   r   r   r   r   r#  r$  )r6   r   r  r  ir7   s        r(   r5   MvpDecoder.__init__  su    	 ~~11!..$*$B$B!8>8N8N499V^^4TW# , "V->->PTP`P` aD<**NN 
 mmSXY_YnYnSo$pSoa_V%ISo$pq#%<<#? $!'!5!5D$-%%..%D!
 &/%%..&D" ',#' %qs   G#r   rh   r   r   r&  cross_attn_head_maskpast_key_valuesr'  r   rk   r(  r)  rl   rm   c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  Ub  [        S5      eUb$  UnUR                  nUR                  SUS   5      nO.Ub   UR                  5       SS nUSS2SS2S4   nO[        S5      eUc  U R                  U5      U R                  -  nU R                  (       a/  U R                  (       a  U	(       a  [        R                  S5        Sn	SnU	(       aB  [        U[         5      (       d-  [        R                  S5        Sn["        R$                  " U5      nUb  UR'                  5       OS	n[)        X/UU5      nUb  Ub  [+        XHR,                  US   S
9nU R/                  UU5      nUU-   nU R1                  U5      n[2        R4                  R7                  UU R6                  U R                  S9nU R8                  (       a[  [:        R<                  " U R>                  5      RA                  U RB                  5      nU RE                  U5      nU RG                  U5      nU(       a  SOSnU
(       a  SOSnU
(       a  Ub  SOSn[I        XV/SS/5       Hn  u  nnUc  M  UR                  5       S	   [K        U RL                  5      :w  d  M7  [        SU S[K        U RL                  5       SUR                  5       S	    S35      e   [O        U RL                  5       H  u  nnU(       a  UU4-  nU R                  (       a(  [:        RP                  " / 5      nUU RR                  :  a  ML  U" UUUUUb  UU   OSUb  UU   OSU R8                  (       a  WU   OSU R8                  (       a  WU   OSUU
U	US9nUS	   nU
(       d  M  UUS   4-  nUc  M  UUS   4-  nM     U(       a  UU4-  nU(       a  URU                  5       nU(       d  [W        S UUUUU4 5       5      $ [Y        UUUUUS9$ )ak  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
        cross-attention on hidden heads. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
        shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
        shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer!   zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.Tr   )r   rq   r
  r&  rO  zThe `z` should be specified for r+  r,  )	r   ri   r   r   r   rg   rk   r   rl   r   r2   c              3   .   #    U H  nUc  M  Uv   M     g 7fr   r
  r.  s     r(   r1  %MvpDecoder.forward.<locals>.<genexpr>  s      rA rs   	)r4  rP  re   r5  cross_attentions)-r   rk   r(  r   r6  r%   r#   r}   rt   r  r  r#  rs   loggerwarning_onceru   r   r   from_legacy_cacheget_seq_lengthr   r   r<   r  r"  r   r   rW   r  r>   r?   r   r   r=   r   r   zipr7  rz   r8  r9  r  to_legacy_cacher   r   ) r6   r   rh   r   r   r&  rO  rP  r'  r   rk   r(  r)  rl   r:  r;  return_legacy_cacher9   	positionsre   r   r   r   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer?  decoder_layerrB  rC  s                                    r(   rD   MvpDecoder.forward  sn   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E#//K!r;r?;I&',,.s3K!!Q(+Edee  --i84;K;KKM&&4==##p "	#Z??\
 #'1CCOTOETE`!?!?!Afg:8N

 !,1G1S%?&(;(;[QS_&"
 ((0FG	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ $ 6 6z B #7BD0d&7<Q<]rdh %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#!m%55!}}&+jjn#&7)%'=3<3H3dI]Ii,@,Eos;???"23"7PT=A__#4S#9RV."3#-M *!,M  =#3"55(4(]1-=,??(9 #9>  -!11-==?O ':K^]qr  
 9+++%1
 	
r*   )r   rW   r  r  r  r#  r  r"  rz   rK  r   r   r   r  rE  )NNNNNNNNNNNNN)rH   rI   rJ   rK   rL   r   r   r   r   r   r5   r>   rF  rN   r   listr   r   r   rD   rO   rP   rQ   s   @r(   rH  rH    s    lq&&/7/E&ZbcgZh& &T 1515=A=A,07;=A59$(,0/3&*15R
E,,-R
 !.R
  ((9(9:	R

 !))9)9 :R
 ELL)R
 'u||4R
 "$u'8'8"9:R
   1 12R
 D>R
 $D>R
 'tnR
 d^R
 !.R
 
u??	@R
 R
r*   rH  c            &       >  ^  \ rS rSrS/rSS/rS\4U 4S jjrS rS r	S	 r
S
 rS r\                SS\\R                      S\\R"                     S\\R                      S\\R                      S\\R"                     S\\R"                     S\\R"                     S\\\R&                        S\\\R&                        S\\R&                     S\\R&                     S\\   S\\   S\\   S\\   S\\R"                     S\\\4   4"S jj5       rSrU =r$ ) MvpModeli  final_logits_biasencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                 t  > [         TU ]  U5        UR                  UR                  p2UR                  U l        [
        R                  " X1R                  U5      U l        [        XR                  UR                  5      U l
        [        XR                  UR                  5      U l        U R                  5         g r   )r4   r5   r   r  r  r   r   r   sharedr  encoderrH  decoderr$  )r6   r   r   r  r7   s       r(   r5   MvpModel.__init__  s     "("5"5v7H7HZ ++ll:~~{K!&++v7H7HI!&++v7H7HI 	r*   c                     U R                   $ r   )rk  r6   s    r(   get_input_embeddingsMvpModel.get_input_embeddings  s    {{r*   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g r   )rk  rl  r  rm  r6   values     r(   set_input_embeddingsMvpModel.set_input_embeddings  s'    $(KK!$(KK!r*   c                     U R                   $ r   )rl  rp  s    r(   get_encoderMvpModel.get_encoder      ||r*   c                     U R                   $ r   rm  rp  s    r(   get_decoderMvpModel.get_decoder  r{  r*   c                 4   U R                   (       d   S5       eU R                  S5        U R                  R                  R                  S5        U R                  R                  R                  S5        U R                  R
                  R                  S5        g )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r  requires_grad_rl  r   rm  r   rp  s    r(   set_lightweight_tuningMvpModel.set_lightweight_tuning  sj    j jjE"%%44T:%%44T:&&55d;r*   r   rh   decoder_input_idsdecoder_attention_maskr&  decoder_head_maskrO  encoder_outputsrP  r'  decoder_inputs_embedsr   rk   r(  r)  rl   rm   c                 T   UcE  UcB  Uc  [        S5      e[        XR                  R                  U R                  R                  5      nUb  UOU R                  R
                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUc  U R                  UUUU
UUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  UUUS   UUUU	UUUUUUS9nU(       d  UU-   $ [        UR                  UR                   UR"                  UR$                  UR&                  UR                  UR"                  UR$                  S	9$ )
az  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   rh   r&  r'  rk   r(  r)  r   r   r2   r3  )r   rh   r   r   r&  rO  rP  r'  r   rk   r(  r)  rl   )r4  rP  decoder_hidden_statesdecoder_attentionsrT  encoder_last_hidden_stater   encoder_attentions)r%   r)   r   r   r   rk   r(  r   r6  rl  ru   r   r7  rm  r   r4  rP  re   r5  rT  )r6   r   rh   r  r  r&  r  rO  r  rP  r'  r  r   rk   r(  r)  rl   decoder_outputss                     r(   rD   MvpModel.forward  s   f $)>)F  U  !3;;33T[[5W5W! 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]""ll#-#+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,'1"1!"4#1'!5+//!5#) ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r*   )rm  rl  rk  r  NNNNNNNNNNNNNNNN)rH   rI   rJ   rK   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r5   rq  rv  ry  r~  r  r   r   r>   rF  rN   rd  r   r   r   r   r   rD   rO   rP   rQ   s   @r(   rf  rf    s   *=)>&79VWy 0
<  15158<=A,0487;=A=A59=A$(,0/3&*15#t
E,,-t
 !.t
 $E$4$45	t

 !))9)9 :t
 ELL)t
 $ELL1t
 'u||4t
 "$u'8'8"9:t
 "$u'8'8"9:t
   1 12t
  ((9(9:t
 D>t
 $D>t
 'tnt
  d^!t
" !.#t
$ 
u((	)%t
 t
r*   rf  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc            (         ^  \ rS rSr/ SQrS\4U 4S jjrS rS r S#S\	S	\
\	   S
\S\R                  4U 4S jjjrS\	SS4S jrS r\                 S$S\
\R&                     S\
\R(                     S\
\R&                     S\
\R&                     S\
\R(                     S\
\R(                     S\
\R(                     S\
\\R,                        S\
\\R,                        S\
\R,                     S\
\R,                     S\
\R&                     S\
\   S\
\   S\
\   S\
\   S\
\R(                     S\\\4   4$S  jj5       rS\R(                  4S! jrS"rU =r$ )%MvpForConditionalGenerationi]  )rh  ri  lm_head.weightr   c                 v  > [         TU ]  U5        [        U5      U l        U R	                  S[
        R                  " SU R                  R                  R                  45      5        [        R                  " UR                  U R                  R                  R                  SS9U l        U R                  5         g )Nrg  r   Fr\   )r4   r5   rf  r   register_bufferr>   r   rk  r.   r   r_   r   lm_headr$  r   s     r(   r5   $MvpForConditionalGeneration.__init__e  s     f%
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r*   c                 6    U R                   R                  5       $ r   )r   ry  rp  s    r(   ry  'MvpForConditionalGeneration.get_encodern      zz%%''r*   c                 6    U R                   R                  5       $ r   )r   r~  rp  s    r(   r~  'MvpForConditionalGeneration.get_decoderq  r  r*   Nnew_num_tokenspad_to_multiple_ofmean_resizingrm   c                 J   > [         TU ]  XU5      nU R                  U5        U$ r   )r4   resize_token_embeddings_resize_final_logits_bias)r6   r  r  r  new_embeddingsr7   s        r(   r  3MvpForConditionalGeneration.resize_token_embeddingst  s+     8]jk&&~6r*   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nr!   r   r  ro   rg  )rg  r#   r>   r   r=   r   r  )r6   r  old_num_tokensnew_bias
extra_biass        r(   r  5MvpForConditionalGeneration._resize_final_logits_bias{  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r*   c                 n    U R                   R                  5         U R                  R                  S5        g rE  r   r  r  r  rp  s    r(   r  2MvpForConditionalGeneration.set_lightweight_tuning  $    

))+##E*r*   r   rh   r  r  r&  r  rO  r  rP  r'  r  labelsr   rk   r(  r)  rl   c                    Ub  UOU R                   R                  nUbX  U(       a  [        R                  S5        SnUc7  Uc4  [	        XR                   R
                  U R                   R                  5      nU R                  UUUUUUUUU	U
UUUUUUS9nU R                  US   5      U R                  -   nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  S9	$ )	aE  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example of summarization:

Fine-tuning a model
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

>>> inputs = tokenizer(
...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
...     return_tensors="pt",
... )
>>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```

Inference after the model fine-tuned
```python
>>> with torch.no_grad():
...     generated_ids = model.generate(**inputs)

>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rh   r  r  r  r&  r  rO  rP  r'  r  r   rk   r(  r)  rl   r   r!   r   	losslogitsrP  r  r  rT  r  r   r  )r   r6  rU  warningr)   r   r   r   r  rg  r   r}   r  r   rP  r  r  rT  r  r   r  )r6   r   rh   r  r  r&  r  rO  r  rP  r'  r  r  r   rk   r(  r)  rl   r   	lm_logitsmasked_lm_lossloss_fctoutputs                          r(   rD   #MvpForConditionalGeneration.forward  s   d &1%<k$++B]B]klI (-B-J$6KK44dkk6X6X%! **)/+#9/!5+'"7/!5#)!  
$ LL,t/E/EE	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r*   c                 j    [        XR                  R                  U R                  R                  5      $ r   )r)   r   r   r   )r6   r  s     r(   %prepare_decoder_input_ids_from_labelsAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels  s#    !&++*B*BDKKDfDfggr*   r  r   )NT)NNNNNNNNNNNNNNNNN)rH   rI   rJ   rK   r  r   r5   ry  r~  rM   r   r   r   r   r  r  r  r   r>   rF  rN   rd  r   r   r   r   rD   r  rO   rP   rQ   s   @r(   r  r  ]  sD    jy (( dh!7?}\`	 < < <+  15158<=A,0487;=A=A59=A-1$(,0/3&*15%C
E,,-C
 !.C
 $E$4$45	C

 !))9)9 :C
 ELL)C
 $ELL1C
 'u||4C
 "$u'8'8"9:C
 "$u'8'8"9:C
   1 12C
  ((9(9:C
 ))*C
 D>C
 $D>C
  'tn!C
" d^#C
$ !.%C
& 
uo%	&'C
 C
JhELL h hr*   r  z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $         ^  \ rS rSrSS/rS\4U 4S jjrS r\               SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\4   4 S jj5       rSrU =r$ )MvpForSequenceClassificationi  rh  ri  r   c                    > [         TU ]  " U40 UD6  [        U5      U l        [	        UR
                  UR
                  UR                  UR                  5      U l        U R                  5         g r   )
r4   r5   rf  r   r   r   
num_labelsclassifier_dropoutclassification_headr$  )r6   r   kwargsr7   s      r(   r5   %MvpForSequenceClassification.__init__  sZ    *6*f%
#8NNNN%%	$
  	r*   c                 n    U R                   R                  5         U R                  R                  S5        g rE  )r   r  r  r  rp  s    r(   r  3MvpForSequenceClassification.set_lightweight_tuning(  s&    

))+  //6r*   r   rh   r  r  r&  r  rO  r  r'  r  r  r   rk   r(  r)  rm   c                    Ub  UOU R                   R                  nUb  SnUc%  U	b"  [        SU R                  R                   35      eU R                  UUUUUUUUU	U
UUUUS9nUS   nUR                  U R                   R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      5      S:  a  [        S5      eUUSS24   R                  UR!                  S5      SUR!                  S5      5      SS2SSS24   nU R#                  U5      nSnUGb  U R                   R$                  c  U R                   R&                  S:X  a  S	U R                   l        OyU R                   R&                  S:  aN  UR(                  [        R*                  :X  d  UR(                  [        R,                  :X  a  S
U R                   l        OSU R                   l        U R                   R$                  S	:X  aT  [/        5       nU R                   R&                  S:X  a&  U" UR1                  5       UR1                  5       5      nOU" UU5      nOU R                   R$                  S
:X  aG  [3        5       nU" UR                  SU R                   R&                  5      UR                  S5      5      nO-U R                   R$                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  UR<                  UR>                  UR@                  URB                  URD                  S9	$ )a
  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Example of single-label classification:

Fine-tuning a model on `num_labels` classes
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForSequenceClassification

>>> num_labels = 2  # for example, this is a binary classification task
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

>>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor(1)  # the real label for inputs

>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```

Inference after the model fine-tuned
```python
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax()
```
NFz8Passing input embeddings is currently not supported for rh   r  r  r&  r  rO  r  r'  r  r   rk   r(  r)  r   r   z7All examples must have the same number of <eos> tokens.r!   
regressionsingle_label_classificationmulti_label_classificationr  )#r   r6  NotImplementedErrorr7   rH   r   eqeos_token_idr   r=   r7  r>   unique_consecutivesumr%   r}   rt   r  problem_typer  r<   r@   rM   r   squeezer   r   r   rP  r  r  rT  r  r   r  )r6   r   rh   r  r  r&  r  rO  r  r'  r  r  r   rk   r(  r)  r   re   eos_masksentence_representationr  r  r  r  s                           r(   rD   $MvpForSequenceClassification.forward,  s   Z &1%<k$++B]B]I!:%J4>>KbKbJcd  **)/#9/!5+'"7/!5#  
   
<< 8 89<<]=Q=QRu''Q89A=VWW"/!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*AB{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r*   )r  r   )NNNNNNNNNNNNNNN)rH   rI   rJ   rK   r  r   r5   r  r   r   r>   rF  rN   rd  r   r   r   r   r   rD   rO   rP   rQ   s   @r(   r  r    s    89VWy 7  15158<=A,0487;=A59=A-1$(,0/3&*!T
E,,-T
 !.T
 $E$4$45	T

 !))9)9 :T
 ELL)T
 $ELL1T
 'u||4T
 "$u'8'8"9:T
   1 12T
  ((9(9:T
 ))*T
 D>T
 $D>T
 'tnT
  d^!T
" 
u55	6#T
 T
r*   r  c            &         ^  \ rS rSrSS/rU 4S jrS r\                SS\\	R                     S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\	R                     S\\	R                     S\\\	R                        S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\\4   4"S jj5       rSrU =r$ )MvpForQuestionAnsweringi  rh  ri  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r1   )
r4   r5   r  rf  r   r   r_   hidden_size
qa_outputsr$  r   s     r(   r5    MvpForQuestionAnswering.__init__  s[      ++f%
))F$6$68I8IJ 	r*   c                 n    U R                   R                  5         U R                  R                  S5        g rE  )r   r  r  r  rp  s    r(   r  .MvpForQuestionAnswering.set_lightweight_tuning  s$    

))+&&u-r*   r   rh   r  r  r&  r  rO  r  start_positionsend_positionsr'  r  r   rk   r(  r)  rm   c                    Ub  UOU R                   R                  nU	b  U
b  SnU R                  UUUUUUUUUUUUUUS9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnU	b  U
b  [        U	R                  5       5      S:  a  U	R                  S5      n	[        U
R                  5       5      S:  a  U
R                  S5      n
UR                  S5      nU	R                  SU5      n	U
R                  SU5      n
[        US9nU" UU	5      nU" UU
5      nUU-   S	-  nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  UR                  UR                  UR                   UR"                  UR$                  S
9
$ )a
  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

Fine-tuning a model for extrative question answering, and our model also supports generative question answering
using `BartForConditionalGeneration`
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

>>> inputs = tokenizer(
...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
...     return_tensors="pt",
... )
>>> target_start_index = torch.tensor([18])
>>> target_end_index = torch.tensor([19])

>>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
>>> loss.backward()
```

Inference after the model fine-tuned
```python
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> answer_start_index = outputs.start_logits.argmax()
>>> answer_end_index = outputs.end_logits.argmax()

>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
>>> predict_answer = tokenizer.decode(predict_answer_tokens)
```
NFr  r   r   r!   ro   )ignore_indexr2   )
r  start_logits
end_logitsrP  r  r  rT  r  r   r  )r   r6  r   r  r   r  
contiguousr7  rt   r   r   r   rP  r  r  rT  r  r   r  )r6   r   rh   r  r  r&  r  rO  r  r  r  r'  r  r   rk   r(  r)  r   sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                               r(   rD   MvpForQuestionAnswering.forward  s   f &1%<k$++B]B]&=+DI**)/#9/!5+'"7/!5#  
" "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J F 0:/EZMF*Q6Q2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r*   )r   r  r  r  )rH   rI   rJ   rK   r  r5   r  r   r   r>   rN   rF  rd  r   r   r   r   r   rD   rO   rP   rQ   s   @r(   r  r    s   79VW
.  -1158<=A,0487;=A6:4859=A$(,0/3&*#Q
ELL)Q
 !.Q
 $E$4$45	Q

 !))9)9 :Q
 ELL)Q
 $ELL1Q
 'u||4Q
 "$u'8'8"9:Q
 "%"2"23Q
   0 01Q
   1 12Q
  ((9(9:Q
 D>Q
 $D>Q
  'tn!Q
" d^#Q
$ 
u99	:%Q
 Q
r*   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MvpDecoderWrapperin  z
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
c                 D   > [         TU ]  U5        [        U5      U l        g r   )r4   r5   rH  rm  r   s     r(   r5   MvpDecoderWrapper.__init__t  s     !&)r*   c                 &    U R                   " U0 UD6$ r   r}  )r6   argsr  s      r(   rD   MvpDecoderWrapper.forwardx  s    ||T,V,,r*   r}  )	rH   rI   rJ   rK   rL   r5   rD   rO   rP   rQ   s   @r(   r  r  n  s    
*- -r*   r  c            "         ^  \ rS rSrS/rU 4S jrS rS rS rS r	S r
\              SS	\\R                     S
\\R                     S\\R                      S\\R                      S\\R                     S\\R                     S\\\R                         S\\R                      S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jj5       rSrU =r$ )MvpForCausalLMi|  r  c                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr\   )rX   is_encoder_decoderr4   r5   r  r   r   r_   r  r  r  r$  r   s     r(   r5   MvpForCausalLM.__init__  sX     $)! &v.
yy!3!3V5F5FUS 	r*   c                 B    U R                   R                  R                  $ r   r   rm  r  rp  s    r(   rq  #MvpForCausalLM.get_input_embeddings  s    zz!!...r*   c                 8    XR                   R                  l        g r   r  rt  s     r(   rv  #MvpForCausalLM.set_input_embeddings  s    */

'r*   c                 $    XR                   l        g r   r   rm  )r6   rm  s     r(   set_decoderMvpForCausalLM.set_decoder  s    $

r*   c                 .    U R                   R                  $ r   r   rp  s    r(   r~  MvpForCausalLM.get_decoder  s    zz!!!r*   c                 n    U R                   R                  5         U R                  R                  S5        g rE  r  rp  s    r(   r  %MvpForCausalLM.set_lightweight_tuning  r  r*   r   rh   r   r   r&  rO  rP  r'  r  r   rk   r(  r)  rl   rm   c                 L   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  R                  UUUUUUUUU
UUUS9nU R                  US   5      nSnU	bF  [        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a9  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MvpForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> list(logits.shape)
[1, 8, 50267]
```N)r   rh   r   r   r&  rO  rP  r'  r   rk   r(  r)  r   r!   r   )r  r  rP  re   r5  rT  )r   rk   r(  r6  r   rm  r  r   r}   r  r   rP  re   r5  rT  )r6   r   rh   r   r   r&  rO  rP  r'  r  r   rk   r(  r)  rl   r   r  r  r  r  s                       r(   rD   MvpForCausalLM.forward  sF   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9!5+'/!5# % 
 gaj)')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r*   r  )NNNNNNNNNNNNNN)rH   rI   rJ   rK   r  r5   rq  rv  r  r~  r  r   r   r>   rF  rN   r   rd  r   r   r   r   rD   rO   rP   rQ   s   @r(   r  r  |  s   *+	/0%"+  1515=A>B,07;=A59-1$(,0/3&*15T
E,,-T
 !.T
  ((9(9:	T

 !)):): ;T
 ELL)T
 'u||4T
 "$u'8'8"9:T
   1 12T
 ))*T
 D>T
 $D>T
 'tnT
 d^T
 !.T
  
u77	8!T
 T
r*   r  )r  r  r  r  rf  r   )?rL   r  typingr   r   r>   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_mvpr   
get_loggerrH   rU  rN   rM   r)   r   r,   ModulerS   r   r   r   r   r   r  rH  rf  r  r  r  r  r  __all__r
  r*   r(   <module>r     s     "    A A ! 5 ) :   . , ( 
		H	%%,, c [^ ";BLL ;2\2299 \2~@+0 @+Fr0 rlBII 0		 2   6q
# q
hD
# D
N \
! \
 \
~ 
mh"4o mh
mh` i
#5 i
i
X e
0 e
 e
R-* -s
' s
lr*   