
"""PyTorch OPT model."""

from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    is_torch_flex_attn_available,
    logging,
)
from .configuration_opt import OPTConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class OPTLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # OPT offsets the embedding ids by 2 and adjusts num_embeddings accordingly.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        if position_ids is None:
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_key_values_length:]

        return super().forward(position_ids + self.offset)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class OPTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config: OPTConfig,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.dropout = config.attention_dropout
        self.enable_bias = config.enable_bias
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.head_dim = self.embed_dim // self.num_heads
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.scaling = self.head_dim**-0.5

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()

        # the scaling is applied to the query states here, so `scaling=1.0` is passed to the attention interface
        query_states = self.q_proj(hidden_states) * self.scaling
        query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)
        key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            # save all key/value states to the cache to be re-used for fast auto-regressive generation
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
            )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=1.0,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class OPTDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: OPTConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.embed_dim = config.hidden_size

        self.self_attn = OPTAttention(config=config, layer_idx=layer_idx)

        self.do_layer_norm_before = config.do_layer_norm_before
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]

        self.self_attn_layer_norm = nn.LayerNorm(
            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
        )
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        residual = hidden_states

        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            position_ids=position_ids,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Fully Connected
        hidden_states_shape = hidden_states.shape
        hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
        residual = hidden_states

        # 125m, 1.7B, ..., 175B applies layer norm BEFORE the feed-forward block
        if self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)

        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        hidden_states = (residual + hidden_states).view(hidden_states_shape)

        # 350m applies layer norm AFTER the feed-forward block
        if not self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class OPTPreTrainedModel(PreTrainedModel):
    config: OPTConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["OPTDecoderLayer"]
    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


class OPTDecoder(OPTPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    """

    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
        self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
        else:
            self.project_out = None

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
        else:
            self.project_in = None

        # `config._remove_final_layer_norm` only exists to keep backward compatibility with checkpoints
        # fine-tuned before transformers v4.20.1
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm(
                config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
            )
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList([OPTDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output_attentions=True, SDPA falls back to eager, so the mask cannot be skipped.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided 2D attention mask is given, generate a causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, e.g. the relevant first rows when using left padding.
            # Required by the memory-efficient path of F.scaled_dot_product_attention.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`. For padding use -1.

                [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
                the complete sequence length.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if input_ids is not None:
            input_ids = input_ids.view(-1, input_ids.shape[-1])

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if attention_mask is None:
            seq_length = past_seen_tokens + inputs_embeds.shape[1]
            attention_mask = torch.ones(inputs_embeds.shape[0], seq_length, device=inputs_embeds.device)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        if position_ids is None:
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_seen_tokens:]

        pos_embeds = self.embed_positions(attention_mask, past_seen_tokens, position_ids=position_ids)

        if self.project_in is not None:
            inputs_embeds = self.project_in(inputs_embeds)

        hidden_states = inputs_embeds + pos_embeds.to(inputs_embeds.device)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
            if attn_mask is not None and attn_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {attn_mask.size()[0]}."
                )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)

        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


@auto_docstring
class OPTModel(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.decoder = OPTDecoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.decoder.embed_tokens = value

    def get_decoder(self):
        return self.decoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        decoder_outputs = self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return BaseModelOutputWithPast(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
        )


class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = OPTModel(config)

        # the lm_head weight is automatically tied to the embed tokens weight
        self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        logits = self.lm_head(outputs[0]).contiguous()

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss = self.loss_function(
                logits,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class OPTForSequenceClassification(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = OPTModel(config)
        self.score = nn.Linear(config.word_embed_proj_dim, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value


@auto_docstring
class OPTForQuestionAnswering(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.model = OPTModel(config)
        self.qa_outputs = nn.Linear(config.word_embed_proj_dim, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, QuestionAnsweringModelOutput]:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index).to(start_logits.device)
            end_positions = end_positions.clamp(0, ignored_index).to(end_logits.device)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value


__all__ = [
    "OPTForCausalLM",
    "OPTModel",
    "OPTPreTrainedModel",
    "OPTForSequenceClassification",
    "OPTForQuestionAnswering",
]