
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""

import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Union

import torch
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, get_torch_version, logging
from .configuration_bert import BertConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with the TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Setting the token_type_ids to the registered buffer in the constructor, where it is all zeros, helps
        # users trace the model without passing token_type_ids
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BertSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = self.query(hidden_states)
        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
            1, 2
        )

        is_cross_attention = encoder_hidden_states is not None
        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_layer = curr_past_key_value.layers[self.layer_idx].keys
            value_layer = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_layer = self.key(current_states)
            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
                1, 2
            )
            value_layer = self.value(current_states)
            value_layer = value_layer.view(
                batch_size, -1, self.num_attention_heads, self.attention_head_size
            ).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_value.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if cache_position is not None:
                position_ids_l = cache_position.view(-1, 1)
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-1] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, attention_probs


class BertSdpaSelfAttention(BertSelfAttention):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
        self.dropout_prob = config.attention_probs_dropout_prob
        self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")

    # Adapted from BertSelfAttention
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
            logger.warning_once(
                "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not "
                "support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling "
                "back to the manual attention implementation, but specifying the manual implementation will be "
                "required from Transformers version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                past_key_value,
                output_attentions,
                cache_position,
            )

        bsz, tgt_len, _ = hidden_states.size()

        query_layer = (
            self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )

        # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the
        # attention mask needs to be such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None
        current_states = encoder_hidden_states if is_cross_attention else hidden_states

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_layer = curr_past_key_value.layers[self.layer_idx].keys
            value_layer = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_layer = (
                self.key(current_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
            )
            value_layer = (
                self.value(current_states)
                .view(bsz, -1, self.num_attention_heads, self.attention_head_size)
                .transpose(1, 2)
            )

            if past_key_value is not None:
                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_value.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # SDPA with a custom attn_mask and non-contiguous inputs is broken on some torch versions before 2.2.0,
        # so we need to call `.contiguous()` here.
        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
            query_layer = query_layer.contiguous()
            key_layer = key_layer.contiguous()
            value_layer = value_layer.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` statement instead of an
        # inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph options.
        is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)

        return attn_output, None


class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


BERT_SELF_ATTENTION_CLASSES = {
    "eager": BertSelfAttention,
    "sdpa": BertSdpaSelfAttention,
}


class BertAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = BERT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
        )
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BertAttention(config, layer_idx=layer_idx)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = BertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


class BertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


@auto_docstring
class BertPreTrainedModel(PreTrainedModel):
    config: BertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version, which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, BertLMPredictionHead):
            module.bias.data.zero_()


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`BertForPreTraining`].
    """
)
class BertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    seq_relationship_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """
)
class BertModel(BertPreTrainedModel):
    _no_split_modules = ["BertEmbeddings", "BertLayer"]

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        use_sdpa_attention_masks = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        # Expand the attention mask
        if use_sdpa_attention_masks and attention_mask.dim() == 2:
            # Expand the attention mask for SDPA.
            # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
            if self.config.is_decoder:
                extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                    attention_mask,
                    input_shape,
                    embedding_output,
                    past_key_values_length,
                )
            else:
                extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
        else:
            # We can provide a self-attention mask of dimensions
            # [batch_size, from_seq_length, to_seq_length]
            # ourselves, in which case we just need to make it broadcastable to all heads.
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                # Expand the attention mask for SDPA.
                # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
                encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """
)
class BertForPreTraining(BertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BertForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **loss_kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@auto_docstring
class BertForMaskedLM(BertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # append a dummy PAD token (and extend the mask) so the MLM head has a position to fill
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    @classmethod
    def can_generate(cls) -> bool:
        """
Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
`prepare_inputs_for_generation` method.
        """
        return False
rJ  T  r   r2  zT
    Bert Model with a `next sentence prediction (classification)` head on top.
    """
)
class BertForNextSentencePrediction(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
    (see `input_ids` docstring). Indices should be in `[0, 1]`:

    - 0 indicates sequence B is a continuation of sequence A,
    - 1 indicates sequence B is a random sequence.

Example:

```python
>>> from transformers import AutoTokenizer, BertForNextSentencePrediction
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
```
r  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.Nr  r$   rx   r=   r:  )warningswarnFutureWarningpopr^   r  r  r  r	   r   r   r   ry  )r   r   r   rz   rw   r   r   r  r   rr  rs  kwargsr=  r  seq_relationship_scoresr  r  r/  s                     r1   r   %BertForNextSentencePrediction.forwardO  s   T !F*MM%
 ZZ 56F%0%<k$++B]B]))))%'/!5#  

  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_-/'!"+=F7I7U')F2a[aa*#*!//))	
 	
r   r  
NNNNNNNNNN)r   r   r   r   r~   r!   r   rZ   r   r   r   r   r   r   r   r   r   s   @r1   rJ  rJ  @  s     -11515/3,004)-,0/3&*Q
ELL)Q
 !.Q
 !.	Q

 u||,Q
 ELL)Q
  -Q
 &Q
 $D>Q
 'tnQ
 d^Q
 
uU\\"$??	@Q
 Q

@auto_docstring(
    custom_intro="""
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # infer the problem type once from num_labels and the label dtype
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
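
# Usage sketch (illustrative, not upstream code): the loss above is selected from
# `config.problem_type` — MSE for regression (num_labels == 1), cross-entropy for
# single-label classification, BCE-with-logits for multi-label. A fresh, randomly
# initialised three-class head, reusing the `tokenizer` assumed in the earlier sketch:
#
#     config = BertConfig(num_labels=3)
#     model = BertForSequenceClassification(config)
#     inputs = tokenizer("a surprisingly touching film", return_tensors="pt")
#     outputs = model(**inputs, labels=torch.tensor([2]))
#     outputs.loss.backward()  # integer labels -> single_label_classification branch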

@auto_docstring
class BertForMultipleChoice(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # flatten (batch, num_choices, seq_len) -> (batch * num_choices, seq_len) before encoding
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
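
# Usage sketch (illustrative): the head scores every candidate with a single logit, so the
# caller supplies inputs shaped (batch_size, num_choices, seq_len) and reads back logits
# shaped (batch_size, num_choices). The prompt/choices below are made up for the example.
#
#     prompt = "The cat sat on"
#     choices = ["the mat.", "the moon."]
#     enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
#     inputs = {k: v.unsqueeze(0) for k, v in enc.items()}  # add the num_choices axis
#     logits = BertForMultipleChoice(BertConfig())(**inputs).logits  # shape (1, 2)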

@auto_docstring
class BertForTokenClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring
class BertForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

__all__ = [
    "BertForMaskedLM",
    "BertForMultipleChoice",
    "BertForNextSentencePrediction",
    "BertForPreTraining",
    "BertForQuestionAnswering",
    "BertForSequenceClassification",
    "BertForTokenClassification",
    "BertLayer",
    "BertLMHeadModel",
    "BertModel",
    "BertPreTrainedModel",
    "load_tf_weights_in_bert",
]