
"""PyTorch PLBART model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_attention_mask_for_sdpa,
)
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available
from ..bart.modeling_bart import (
    BartClassificationHead,
    BartDecoder,
    BartEncoder,
    BartForCausalLM,
    BartScaledWordEmbedding,
)
from ..bigbird_pegasus.modeling_bigbird_pegasus import BigBirdPegasusForSequenceClassification
from ..mbart.modeling_mbart import shift_tokens_right
from .configuration_plbart import PLBartConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask


class PLBartScaledWordEmbedding(BartScaledWordEmbedding):
    pass


@auto_docstring
class PLBartPreTrainedModel(PreTrainedModel):
    config: PLBartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PLBartDecoderLayer", "PLBartEncoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _update_full_mask(
        self,
        attention_mask: Union[torch.Tensor, None],
        inputs_embeds: torch.Tensor,
    ) -> Union[torch.Tensor, None]:
        # Expand the 2D padding mask into the shape expected by the configured attention backend.
        if attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(attention_mask, torch.Tensor):
                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        return attention_mask

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            elif attention_mask is None:
                attention_mask = make_flex_block_causal_mask(
                    torch.ones(size=(input_tensor.shape[0], input_tensor.shape[1]), device=input_tensor.device)
                )
            return attention_mask

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When possible, rely on SDPA's `is_causal` argument instead of materializing a 4D mask.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # If a 2D attention mask (or none) was provided, build the full 4D causal mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows, as required by the SDPA memory-efficient attention path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # The mask already comes in an inverted 4D form, so nothing has to be done.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def _update_cross_attn_mask(
        self,
        encoder_hidden_states: Union[torch.Tensor, None],
        encoder_attention_mask: Union[torch.Tensor, None],
        input_shape: torch.Size,
        inputs_embeds: torch.Tensor,
    ) -> Union[torch.Tensor, None]:
        # Expand the encoder padding mask for cross-attention, matching the configured attention backend.
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask,
                    inputs_embeds.dtype,
                    tgt_len=input_shape[-1],
                )
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(encoder_attention_mask, torch.Tensor):
                    encoder_attention_mask = make_flex_block_causal_mask(
                        encoder_attention_mask,
                        query_length=input_shape[-1],
                        is_causal=False,
                    )
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        return encoder_attention_mask


class PLBartEncoder(BartEncoder):
    pass


class PLBartDecoder(BartDecoder):
    pass


@auto_docstring
class PLBartModel(PLBartPreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: PLBartConfig):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.shared = PLBartScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale)

        self.encoder = PLBartEncoder(config, self.shared)
        self.decoder = PLBartDecoder(config, self.shared)

        self.init_weights()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.LongTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
        r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
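
Example (an illustrative sketch of a plain forward pass; the `uclanlp/plbart-base` checkpoint is used here
only for illustration):

```python
>>> from transformers import AutoTokenizer, PLBartModel

>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
>>> model = PLBartModel.from_pretrained("uclanlp/plbart-base")

>>> inputs = tokenizer("def maximum(a, b): return max(a, b)", return_tensors="pt")
>>> outputs = model(**inputs)  # decoder_input_ids are created internally by shifting input_ids to the right

>>> last_hidden_state = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
```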
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Different from other models: PLBart automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    """
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: PLBartConfig):
        super().__init__(config)
        self.model = PLBartModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        self.init_weights()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.LongTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
        r"""
  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example Mask-filling:

```python
>>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

>>> # en_XX is the language symbol id <LID> for English
>>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)

>>> tokenizer.decode(predictions).split()
['first', 'same', 'highest', 'result', 'number']
```
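
Example of code-to-text generation (an illustrative sketch, not an official reference: the
`uclanlp/plbart-python-en_XX` checkpoint, the language codes and the `lang_code_to_id` lookup below are
assumptions and may need to be adapted to the checkpoint actually used):

```python
>>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX")

>>> code = "def maximum(a, b): return max(a, b)"
>>> inputs = tokenizer(code, return_tensors="pt")

>>> # start decoding with the target-language id token (en_XX)
>>> generated_ids = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"], max_length=32)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```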
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        lm_logits = self.lm_head(outputs[0])
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id)


class PLBartClassificationHead(BartClassificationHead):
    pass


class PLBartForSequenceClassification(BigBirdPegasusForSequenceClassification):
    def forward(**super_kwargs):
        r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
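
Example (an illustrative sketch; `uclanlp/plbart-base` ships no classification head, so the head below is
randomly initialized and only meant to show the call pattern):

```python
>>> from transformers import AutoTokenizer, PLBartForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
>>> model = PLBartForSequenceClassification.from_pretrained("uclanlp/plbart-base", num_labels=2)

>>> inputs = tokenizer("def add(a, b): return a + b", return_tensors="pt")
>>> logits = model(**inputs).logits
>>> predicted_class_id = int(logits.argmax(dim=-1))
```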
        """
        super().forward(**super_kwargs)


class PLBartForCausalLM(BartForCausalLM):
    @auto_docstring
    def forward(**super_kwargs):
        r"""
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, PLBartForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
>>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base", add_cross_attention=False)
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```
        """
        super().forward(**super_kwargs)


__all__ = [
    "PLBartForCausalLM",
    "PLBartForConditionalGeneration",
    "PLBartForSequenceClassification",
    "PLBartModel",
    "PLBartPreTrainedModel",
]