
    <hR                        S r SSKrSSKJrJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJrJrJrJrJrJrJr  SS	KJr  SS
KJrJr  SSKJr  \R8                  " \5      r " S S\R>                  5      r  " S S\R>                  5      r! " S S\RD                  5      r# " S S\R>                  5      r$ " S S\R>                  5      r% " S S\R>                  5      r& " S S\R>                  5      r' " S S\R>                  5      r( " S S\R>                  5      r) " S S \R>                  5      r* " S! S"\R>                  5      r+ " S# S$\R>                  5      r,\ " S% S&\5      5       r-\ " S' S(\-5      5       r.\ " S) S*\-5      5       r/\" S+S,9 " S- S.\-5      5       r0\ " S/ S0\-5      5       r1\ " S1 S2\-5      5       r2\ " S3 S4\-5      5       r3/ S5Qr4g)6zPyTorch SqueezeBert model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )SqueezeBertConfigc                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )SqueezeBertEmbeddings-   zGConstruct the embeddings from word, position and token_type embeddings.c                 v  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                   5      U l        U R%                  S[&        R(                  " UR                  5      R+                  S5      SS9  g )N)padding_idxepsposition_ids)r   F)
persistent)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormhidden_sizelayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     l/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/squeezebert/modeling_squeezebert.pyr"   SqueezeBertEmbeddings.__init__0   s    !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`" f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
    c                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  U5      nU R                  U5      nXG-   U-   n	U R                  U	5      n	U R                  U	5      n	U	$ )Nr   r   dtypedevice)sizer   r3   zeroslongr@   r'   r)   r+   r,   r1   )
r7   	input_idstoken_type_idsr   inputs_embedsinput_shape
seq_lengthr)   r+   
embeddingss
             r:   forwardSqueezeBertEmbeddings.forward@   s     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M"66|D $ : :> J"8;PP
^^J/
\\*-
r<   )r,   r1   r)   r+   r'   )NNNN	__name__
__module____qualname____firstlineno____doc__r"   rJ   __static_attributes____classcell__r9   s   @r:   r   r   -   s    Q
  r<   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MatMulWrapperY   z
Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
c                 "   > [         TU ]  5         g N)r!   r"   )r7   r9   s    r:   r"   MatMulWrapper.__init___   s    r<   c                 .    [         R                  " X5      $ )a  

:param inputs: two torch tensors :return: matmul of these tensors

Here are the typical dimensions found in BERT (the B is optional) mat1.shape: [B, <optional extra dims>, M, K]
mat2.shape: [B, <optional extra dims>, K, N] output shape: [B, <optional extra dims>, M, N]
)r3   matmul)r7   mat1mat2s      r:   rJ   MatMulWrapper.forwardb   s     ||D''r<    rL   rT   s   @r:   rV   rV   Y   s    
( (r<   rV   c                   (    \ rS rSrSrSS jrS rSrg)SqueezeBertLayerNormm   z
This is a nn.LayerNorm subclass that accepts NCW data layout and performs normalization in the C dimension.

N = batch C = channels W = sequence length
c                 @    [         R                  R                  XUS9  g )N)normalized_shaper   )r   r,   r"   )r7   r-   r   s      r:   r"   SqueezeBertLayerNorm.__init__t   s    
dcJr<   c                     UR                  SSS5      n[        R                  R                  X5      nUR                  SSS5      $ )Nr      r   )permuter   r,   rJ   )r7   xs     r:   rJ   SqueezeBertLayerNorm.forwardw   s;    IIaALL  )yyAq!!r<   r`   N)g-q=)rM   rN   rO   rP   rQ   r"   rJ   rR   r`   r<   r:   rb   rb   m   s    K"r<   rb   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ConvDropoutLayerNorm}   z0
ConvDropoutLayerNorm: Conv, Dropout, LayerNorm
c                    > [         TU ]  5         [        R                  " XSUS9U l        [        U5      U l        [        R                  " U5      U l        g Nr   in_channelsout_channelskernel_sizegroups)	r!   r"   r   Conv1dconv1drb   	layernormr/   r1   )r7   cincoutru   dropout_probr9   s        r:   r"   ConvDropoutLayerNorm.__init__   s@    iiCPQZ`a-d3zz,/r<   c                 t    U R                  U5      nU R                  U5      nX2-   nU R                  U5      nU$ rY   rw   r1   rx   )r7   hidden_statesinput_tensorrj   s       r:   rJ   ConvDropoutLayerNorm.forward   s8    KK&LLONN1r<   r~   rL   rT   s   @r:   rm   rm   }   s    0 r<   rm   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ConvActivation   z"
ConvActivation: Conv, Activation
c                 t   > [         TU ]  5         [        R                  " XSUS9U l        [
        U   U l        g rp   )r!   r"   r   rv   rw   r
   act)r7   ry   rz   ru   r   r9   s        r:   r"   ConvActivation.__init__   s/    iiCPQZ`a#;r<   c                 F    U R                  U5      nU R                  U5      $ rY   )rw   r   )r7   rj   outputs      r:   rJ   ConvActivation.forward   s    Qxxr<   )r   rw   rL   rT   s   @r:   r   r      s    
   r<   r   c                   D   ^  \ rS rSrSU 4S jjrS rS rS rS rSr	U =r
$ )	SqueezeBertSelfAttention   c                 n  > [         TU ]  5         X!R                  -  S:w  a  [        SU SUR                   S35      eUR                  U l        [	        X!R                  -  5      U l        U R                  U R
                  -  U l        [        R                  " X"SUS9U l	        [        R                  " X"SUS9U l
        [        R                  " X"SUS9U l        [        R                  " UR                  5      U l        [        R                  " SS9U l        [#        5       U l        [#        5       U l        g	)
z
config = used for some things; ignored for others (work in progress...) cin = input channels = output channels
groups = number of groups to use in conv1d layers
r   zcin (z6) is not a multiple of the number of attention heads ()r   rq   r   dimN)r!   r"   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rv   querykeyvaluer/   attention_probs_dropout_probr1   SoftmaxsoftmaxrV   	matmul_qk
matmul_qkv)r7   r8   ry   q_groupsk_groupsv_groupsr9   s         r:   r"   !SqueezeBertSelfAttention.__init__   s    
 	+++q0uRSYSmSmRnnop  $*#=#= #&s-G-G'G#H !558P8PPYY3aX`a
99AV^_YY3aX`a
zz&"E"EFzzb)&'/r<   c                     UR                  5       S   U R                  U R                  UR                  5       S   4nUR                  " U6 nUR	                  SSSS5      $ )zg
- input: [N, C, W]
- output: [N, C1, W, C2] where C1 is the head index, and C2 is one head's contents
r   r   r   r	   rh   )rA   r   r   viewri   r7   rj   new_x_shapes      r:   transpose_for_scores-SqueezeBertSelfAttention.transpose_for_scores   s[    
 vvx{D$<$<d>V>VXYX^X^X`acXdeFFK yyAq!$$r<   c                     UR                  5       S   U R                  U R                  UR                  5       S   4nUR                  " U6 nU$ )zg
- input: [N, C, W]
- output: [N, C1, C2, W] where C1 is the head index, and C2 is one head's contents
r   r   )rA   r   r   r   r   s      r:   transpose_key_for_scores1SqueezeBertSelfAttention.transpose_key_for_scores   sK    
 vvx{D$<$<d>V>VXYX^X^X`acXdeFFK r<   c                     UR                  SSSS5      R                  5       nUR                  5       S   U R                  UR                  5       S   4nUR                  " U6 nU$ )z-
- input: [N, C1, W, C2]
- output: [N, C, W]
r   r   r	   rh   )ri   
contiguousrA   r   r   r   s      r:   transpose_output)SqueezeBertSelfAttention.transpose_output   sZ    
 IIaAq!,,.vvx{D$6$6DFFK r<   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      n	U R                  Xx5      n
U
[        R                  " U R                  5      -  n
X-   n
U R                  U
5      nU R                  U5      nU R                  X5      nU R                  U5      nSU0nU(       a  XS'   U$ )z
expects hidden_states in [N, C, W] data layout.

The attention_mask data layout is [N, W], and it does not need to be transposed.
context_layerattention_score)r   r   r   r   r   r   mathsqrtr   r   r1   r   r   )r7   r   attention_maskoutput_attentionsmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerr   attention_probsr   results                 r:   rJ    SqueezeBertSelfAttention.forward   s     !JJ}5((=1 JJ}5//0AB11/B	//0AB ..@)DIId6N6N,OO): ,,7 ,,7E--m<!=1(7$%r<   )
r   r   r1   r   r   r   r   r   r   r   )r   r   r   )rM   rN   rO   rP   r"   r   r   r   rJ   rR   rS   rT   s   @r:   r   r      s!    *0%! !r<   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertModule   c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR                  n[	        XUR
                  UR                  UR                  S9U l        [        X#UR                  UR                  S9U l        [        X4UR                  UR                  S9U l        [        XEUR"                  UR                  S9U l        g)aP  
- hidden_size = input chans = output chans for Q, K, V (they are all the same ... for now) = output chans for
  the module
- intermediate_size = output chans for intermediate layer
- groups = number of groups for all layers in the BertModule. (eventually we could change the interface to
  allow different groups for different layers)
)r8   ry   r   r   r   )ry   rz   ru   r{   )ry   rz   ru   r   N)r!   r"   r-   intermediate_sizer   r   r   r   	attentionrm   post_attention_groupsr0   post_attentionr   intermediate_groups
hidden_actintermediateoutput_groupsr   )r7   r8   c0c1c2c3r9   s         r:   r"   SqueezeBertModule.__init__   s     	%%1FOOfoo`f`o`o
 3F$@$@vOiOi
 +r6C]C]cictctu*F$8$8vGaGa
r<   c                     U R                  XU5      nUS   nU R                  XQ5      nU R                  U5      nU R                  Xv5      nSU0n	U(       a  US   U	S'   U	$ )Nr   feature_mapr   )r   r   r   r   )
r7   r   r   r   attattention_outputpost_attention_outputintermediate_outputlayer_outputoutput_dicts
             r:   rJ   SqueezeBertModule.forward  su    nn]<MN/ $ 3 34D T"//0EF{{#6N$l3-01B-CK)*r<   )r   r   r   r   rM   rN   rO   rP   r"   rJ   rR   rS   rT   s   @r:   r   r      s    
4 r<   r   c                   <   ^  \ rS rSrU 4S jr     SS jrSrU =r$ )SqueezeBertEncoderi$  c                    >^ [         TU ]  5         TR                  TR                  :X  d   S5       e[        R
                  " U4S j[        TR                  5       5       5      U l        g )NzIf you want embedding_size != intermediate hidden_size, please insert a Conv1d layer to adjust the number of channels before the first SqueezeBertModule.c              3   8   >#    U H  n[        T5      v   M     g 7frY   )r   ).0_r8   s     r:   	<genexpr>.SqueezeBertEncoder.__init__.<locals>.<genexpr>.  s     #gGf!$5f$=$=Gfs   )	r!   r"   r%   r-   r   
ModuleListrangenum_hidden_layerslayersr6   s    `r:   r"   SqueezeBertEncoder.__init__%  sW    $$(:(:: 	
2	
: mm#guVMeMeGf#ggr<   c                     Uc  SnO#UR                  S 5      [        U5      :X  a  SnOSnUSL d   S5       eUR                  SSS5      nU(       a  SOS nU(       a  SOS n	U R                   H]  n
U(       a+  UR                  SSS5      nX4-  nUR                  SSS5      nU
R	                  XU5      nUS   nU(       d  MU  XS	   4-  n	M_     UR                  SSS5      nU(       a  X4-  nU(       d  [        S
 XU	4 5       5      $ [        XU	S9$ )NTFzAhead_mask is not yet supported in the SqueezeBert implementation.r   rh   r   r`   r   r   c              3   ,   #    U H  oc  M  Uv   M     g 7frY   r`   )r   vs     r:   r   -SqueezeBertEncoder.forward.<locals>.<genexpr>[  s     h$Vq$Vs   	)last_hidden_stater   
attentions)countlenri   r   rJ   tupler   )r7   r   r   	head_maskr   output_hidden_statesreturn_dicthead_mask_is_all_noneall_hidden_statesall_attentionslayerr   s               r:   rJ   SqueezeBertEncoder.forward0  s6    $(!__T"c)n4$(!$)!$,q.qq, &--aA6"6BD0d[[E# - 5 5aA >!%55! - 5 5aA > ==HYZL(7M  0A#B"DD ! &--aA6!11h]~$Vhhh+Yg
 	
r<   )r   )NNFFTr   rT   s   @r:   r   r   $  s$    	h ".
 .
r<   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertPooleria  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g rY   )r!   r"   r   Linearr-   denseTanh
activationr6   s     r:   r"   SqueezeBertPooler.__init__b  s9    YYv1163E3EF
'')r<   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r7   r   first_token_tensorpooled_outputs       r:   rJ   SqueezeBertPooler.forwardg  s6     +1a40

#566r<   )r   r   r   rT   s   @r:   r   r   a  s    $
 r<   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )"SqueezeBertPredictionHeadTransformip  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g )Nr   )r!   r"   r   r   r-   r   
isinstancer   strr
   transform_act_fnr,   r.   r6   s     r:   r"   +SqueezeBertPredictionHeadTransform.__init__q  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr<   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rY   )r   r
  r,   r7   r   s     r:   rJ   *SqueezeBertPredictionHeadTransform.forwardz  s4    

=1--m<}5r<   )r,   r   r
  r   rT   s   @r:   r  r  p  s    U r<   r  c                   8   ^  \ rS rSrU 4S jrSS jrS rSrU =r$ )SqueezeBertLMPredictionHeadi  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)bias)r!   r"   r  	transformr   r   r-   r$   decoder	Parameterr3   rB   r  r6   s     r:   r"   $SqueezeBertLMPredictionHead.__init__  sm    ;FC yy!3!3V5F5FUSLLV->->!?@	 !IIr<   c                 :    U R                   U R                  l         g rY   )r  r  r7   s    r:   _tie_weights(SqueezeBertLMPredictionHead._tie_weights  s     IIr<   c                 J    U R                  U5      nU R                  U5      nU$ rY   )r  r  r  s     r:   rJ   #SqueezeBertLMPredictionHead.forward  s$    }5]3r<   )r  r  r  )returnN)	rM   rN   rO   rP   r"   r  rJ   rR   rS   rT   s   @r:   r  r    s    && r<   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SqueezeBertOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g rY   )r!   r"   r  predictionsr6   s     r:   r"   SqueezeBertOnlyMLMHead.__init__  s    6v>r<   c                 (    U R                  U5      nU$ rY   r!  )r7   sequence_outputprediction_scoress      r:   rJ   SqueezeBertOnlyMLMHead.forward  s     ,,_=  r<   r$  r   rT   s   @r:   r  r    s    ?! !r<   r  c                   *    \ rS rSr% \\S'   SrS rSrg)SqueezeBertPreTrainedModeli  r8   transformerc                    [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        g[        U[        5      (       a%  UR                  R
                  R                  5         gg)zInitialize the weightsg        )meanstdNg      ?)r  r   r   rv   weightdatanormal_r8   initializer_ranger  zero_r#   r   r,   fill_r  )r7   modules     r:   _init_weights(SqueezeBertPreTrainedModel._init_weights  s<   fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) ;<<KK""$ =r<   r`   N)	rM   rN   rO   rP   r   __annotations__base_model_prefixr5  rR   r`   r<   r:   r)  r)    s    %%r<   r)  c                   D  ^  \ rS rSrU 4S jrS rS rS r\         SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )SqueezeBertModeli  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  5         g rY   )	r!   r"   r   rI   r   encoderr   pooler	post_initr6   s     r:   r"   SqueezeBertModel.__init__  s@     /7)&1'/ 	r<   c                 .    U R                   R                  $ rY   rI   r'   r  s    r:   get_input_embeddings%SqueezeBertModel.get_input_embeddings  s    ...r<   c                 $    XR                   l        g rY   rA  r7   new_embeddingss     r:   set_input_embeddings%SqueezeBertModel.set_input_embeddings  s    *8'r<   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr<  r   r   prune_heads)r7   heads_to_pruner   headss       r:   _prune_headsSqueezeBertModel._prune_heads  s<    
 +002LELLu%//;;EB 3r<   rD   r   rE   r   r   rF   r   r   r   r  c
           	      H   Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eUb  UR                  OUR                  nUc  [        R                  " XS9nUc$  [        R                  " U
[        R                  US9nU R                  X*5      nU R                  XPR                   R                  5      nU R                  XX6S9nU R!                  UUUUUU	S9nUS   nU R#                  U5      nU	(       d
  UU4US	S  -   $ [%        UUUR&                  UR(                  S
9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r@   r>   )rD   r   rE   rF   )r   r   r   r   r   r   r   r   )r   pooler_outputr   r   )r8   r   r   use_return_dictr   %warn_if_padding_and_no_attention_maskrA   r@   r3   onesrB   rC   get_extended_attention_maskget_head_maskr   rI   r<  r=  r   r   r   )r7   rD   r   rE   r   r   rF   r   r   r   rG   r@   extended_attention_maskembedding_outputencoder_outputsr%  r  s                    r:   rJ   SqueezeBertModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU%.%:!!@T@T!"ZZCN!"[[EJJvVN"&"B"B>"_ &&y++2O2OP	??> + 
 ,,*2/!5# ' 
 *!,O4#]3oab6III)-')77&11	
 	
r<   )rI   r<  r=  )	NNNNNNNNN)rM   rN   rO   rP   r"   rB  rG  rN  r   r   r3   TensorFloatTensorboolr   r   r   rJ   rR   rS   rT   s   @r:   r:  r:    s   /9C  -11515/3,059,0/3&*A
ELL)A
 !.A
 !.	A

 u||,A
 ELL)A
   1 12A
 $D>A
 'tnA
 d^A
 
u00	1A
 A
r<   r:  c                   f  ^  \ rS rSrSS/rU 4S jrS rS r\          SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )SqueezeBertForMaskedLMi  zcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rY   )r!   r"   r:  r*  r  clsr>  r6   s     r:   r"   SqueezeBertForMaskedLM.__init__  s5     +F3)&1 	r<   c                 B    U R                   R                  R                  $ rY   )ra  r!  r  r  s    r:   get_output_embeddings,SqueezeBertForMaskedLM.get_output_embeddings&  s    xx##+++r<   c                     XR                   R                  l        UR                  U R                   R                  l        g rY   )ra  r!  r  r  rE  s     r:   set_output_embeddings,SqueezeBertForMaskedLM.set_output_embeddings)  s*    '5$$2$7$7!r<   rD   r   rE   r   r   rF   labelsr   r   r   r  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Nr   rE   r   r   rF   r   r   r   r   r   rh   losslogitsr   r   )
r8   rR  r*  ra  r   r   r$   r   r   r   )r7   rD   r   rE   r   r   rF   ri  r   r   r   outputsr%  r&  masked_lm_lossloss_fctr   s                    r:   rJ   SqueezeBertForMaskedLM.forward-  s    ( &1%<k$++B]B]""))%'/!5# # 

 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r<   )ra  r*  
NNNNNNNNNN)rM   rN   rO   rP   _tied_weights_keysr"   rd  rg  r   r   r3   r[  r]  r   r   r   rJ   rR   rS   rT   s   @r:   r_  r_    s   :<Z[,8  -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
un$	%2
 2
r<   r_  z
    SqueezeBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )$SqueezeBertForSequenceClassificationic  c                 P  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  U R                  R                  5      U l        U R                  5         g rY   )r!   r"   
num_labelsr8   r:  r*  r   r/   r0   r1   r   r-   
classifierr>  r6   s     r:   r"   -SqueezeBertForSequenceClassification.__init__j  ss      +++F3zz&"<"<=))F$6$68N8NO 	r<   rD   r   rE   r   r   rF   ri  r   r   r   r  c                 R   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrk  r   
regressionsingle_label_classificationmulti_label_classificationr   rh   rl  )r8   rR  r*  r1   rz  problem_typery  r?   r3   rC   r   r   squeezer   r   r   r   r   r   )r7   rD   r   rE   r   r   rF   ri  r   r   r   ro  r  rn  rm  rq  r   s                    r:   rJ   ,SqueezeBertForSequenceClassification.forwardv  s   ( &1%<k$++B]B]""))%'/!5# # 

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r<   )rz  r8   r1   ry  r*  rs  )rM   rN   rO   rP   r"   r   r   r3   r[  r]  r   r   r   rJ   rR   rS   rT   s   @r:   rw  rw  c  s   
  -11515/3,004)-,0/3&*F
ELL)F
 !.F
 !.	F

 u||,F
 ELL)F
  -F
 &F
 $D>F
 'tnF
 d^F
 
u..	/F
 F
r<   rw  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )SqueezeBertForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r!   r"   r:  r*  r   r/   r0   r1   r   r-   rz  r>  r6   s     r:   r"   %SqueezeBertForMultipleChoice.__init__  sW     +F3zz&"<"<=))F$6$6: 	r<   rD   r   rE   r   r   rF   ri  r   r   r   r  c                 Z   U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
    *input_ids* above)
Nr   r   rk  rh   rl  )r8   rR  shaper   rA   r*  r1   rz  r   r   r   r   )r7   rD   r   rE   r   r   rF   ri  r   r   r   num_choicesro  r  rn  reshaped_logitsrm  rq  r   s                      r:   rJ   $SqueezeBertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ""))%'/!5# # 

  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r<   )rz  r1   r*  rs  )rM   rN   rO   rP   r"   r   r   r3   r[  r]  r   r   r   rJ   rR   rS   rT   s   @r:   r  r    s     -11515/3,004)-,0/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
u//	0X
 X
r<   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )!SqueezeBertForTokenClassificationi(  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rY   )r!   r"   ry  r:  r*  r   r/   r0   r1   r   r-   rz  r>  r6   s     r:   r"   *SqueezeBertForTokenClassification.__init__*  sj      +++F3zz&"<"<=))F$6$68I8IJ 	r<   rD   r   rE   r   r   rF   ri  r   r   r   r  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nrk  r   r   rh   rl  )r8   rR  r*  r1   rz  r   r   ry  r   r   r   )r7   rD   r   rE   r   r   rF   ri  r   r   r   ro  r%  rn  rm  rq  r   s                    r:   rJ   )SqueezeBertForTokenClassification.forward5  s    $ &1%<k$++B]B]""))%'/!5# # 

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r<   )rz  r1   ry  r*  rs  )rM   rN   rO   rP   r"   r   r   r3   r[  r]  r   r   r   rJ   rR   rS   rT   s   @r:   r  r  (  s    	  -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
r<   r  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )SqueezeBertForQuestionAnsweringik  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rY   )
r!   r"   ry  r:  r*  r   r   r-   
qa_outputsr>  r6   s     r:   r"   (SqueezeBertForQuestionAnswering.__init__m  sT      +++F3))F$6$68I8IJ 	r<   rD   r   rE   r   r   rF   start_positionsend_positionsr   r   r   r  c                 $   Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nrk  r   r   r   r   )ignore_indexrh   )rm  start_logits
end_logitsr   r   )r8   rR  r*  r  splitr  r   r   rA   clampr   r   r   r   )r7   rD   r   rE   r   r   rF   r  r  r   r   r   ro  r%  rn  r  r  
total_lossignored_indexrq  
start_lossend_lossr   s                          r:   rJ   'SqueezeBertForQuestionAnswering.forwardw  s    &1%<k$++B]B]""))%'/!5# # 

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r<   )ry  r  r*  )NNNNNNNNNNN)rM   rN   rO   rP   r"   r   r   r3   r[  r]  r   r   r   rJ   rR   rS   rT   s   @r:   r  r  k  s     -11515/3,0042604,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
 "%,,/>
  ->
 $D>>
 'tn>
 d^>
 
u22	3>
 >
r<   r  )r_  r  r  rw  r  r:  r   r)  )5rQ   r   typingr   r   r3   r   torch.nnr   r   r   activationsr
   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_squeezebertr   
get_loggerrM   loggerModuler   rV   r,   rb   rm   r   r   r   r   r   r  r  r  r)  r:  r_  rw  r  r  r  __all__r`   r<   r:   <module>r     s   !  "   A A !   . 9 
		H	%)BII )X(BII (("2<< " 299 ( RYY  Wryy Wt'		 'T:
 :
z		  "")) .!RYY ! % % %. [
1 [
 [
| F
7 F
 F
R T
+E T
T
n d
#= d
 d
N ?
(B ?
 ?
D J
&@ J
 J
Z	r<   