
    <h                        S r SSKrSSKJr  SSKJrJr  SSKrSSKrSSKJ	r	  SSK
JrJrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJr  SSKJr  SSKJrJrJ r   SSK!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(  \&RR                  " \*5      r+Sq,S r-S r.SIS jr/SIS jr0SIS jr1S r2 " S S\Rf                  Rh                  5      r5 " S S\Rf                  Rh                  5      r6 " S S5      r7SJS jr8S r9   SKS jr: " S  S!\	Rv                  5      r< " S" S#\	Rv                  5      r= " S$ S%\	Rv                  5      r> " S& S'\	Rv                  5      r? " S( S)\	Rv                  5      r@ " S* S+\	Rv                  5      rA " S, S-\5      rB " S. S/\	Rv                  5      rC " S0 S1\	Rv                  5      rD " S2 S3\	Rv                  5      rE " S4 S5\	Rv                  5      rF\" " S6 S7\5      5       rG\" " S8 S9\G5      5       rH\" " S: S;\G5      5       rI " S< S=\	Rv                  5      rJ\"" S>S?9 " S@ SA\G5      5       rK\" " SB SC\G5      5       rL\" " SD SE\G5      5       rM\" " SF SG\G5      5       rN/ SHQrOg)LzPyTorch MRA model.    N)Path)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_cuda_platformis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                     ^ [        [        5      R                  5       R                  R                  R                  S-  S-  mU4S jn U " / SQ5      n[	        SUSS9qg )Nkernelsmrac                 8   > U  Vs/ sH  nTU-  PM
     sn$ s  snf N )filesfile
src_folders     \/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/mra/modeling_mra.pyappend_root&load_cuda_kernels.<locals>.append_root4   s     .34ed
T!e444s   )zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verbose)r   __file__resolveparentr
   mra_cuda_kernel)r)   	src_filesr'   s     @r(   load_cuda_kernelsr2   0   sQ    h'')0077>>JURJ5 WXI=)TBO    c                 H   [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  S5      S:w  a  [        S5      eU R                  S5      S:w  a  [        S5      eU R                  S	S
9R                  R                  SS	5      nUR                  5       nUR                  5       nUR                  5       n[        R                  XAX#5      u  pVUR                  SS	5      SS2SS2SSS24   nXV4$ )z0
Computes maximum values for softmax stability.
   z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr0   	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r(   
sparse_maxrM   <   s    > !Q&IJJ
7<<>aBCC1#YZZ1#XYY###+22<<RDJ&&(JkkmG  "G!0!:!::P_!oH'11"b9!Qa-H%%r3   c                    [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  S   UR                  S   :w  a  [        S5      eU R                  u  p4XB-  n[        R
                  " UR                  S5      [        R                  UR                  S9nU R                  X5U5      n XSS2S4   X-  R                  5       SS24   n U $ )zF
Converts attention mask to a sparse mask for high resolution logits.
r6   z$mask must be a 2-dimensional tensor.r7   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r=   r>   r?   shapetorcharangelongrQ   reshape)maskrG   
block_size
batch_sizeseq_len	num_block	batch_idxs          r(   sparse_maskr]   X   s     499;1?@@
7<<>aBCCzz!}a((]^^**J%IW\\!_EJJw~~VI<<
z:D!T'"W%8$>$>$@!CDDKr3   c                 f   U R                  5       u  pEnUR                  5       u  pxnXS-  S:w  a  [        S5      eX-  S:w  a  [        S5      eU R                  XEU-  X65      R                  SS5      n UR                  XHU-  X65      R                  SS5      n[	        U R                  5       5      S:w  a  [        S5      e[	        UR                  5       5      S:w  a  [        S5      e[	        UR                  5       5      S	:w  a  [        S
5      eU R                  S5      S:w  a  [        S5      eUR                  S5      S:w  a  [        S5      eU R                  5       n UR                  5       nUR                  5       nUR                  5       n[        R                  XUR                  5       5      $ )z/
Performs Sampled Dense Matrix Multiplication.
r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r<   r9   r5   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r6   r7   r   r8   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r>   r?   rV   rB   r=   rC   rD   r0   mm_to_sparse)	dense_query	dense_keyrG   rX   rY   
query_sizer;   _key_sizes	            r(   ra   ra   o   s    #."2"2"4JC ~~'A!#opp!kll%%j
2JJ\ffgikmnK!!**.DjV``aceghI
;!#FGG
9>>!DEE
7<<>aBCCb IJJ~~aBGHH((*K$$&IkkmG  "G''NNr3   c                 B   UR                  5       u  pVnXd-  S:w  a  [        S5      eU R                  S5      U:w  a  [        S5      eU R                  S5      U:w  a  [        S5      eUR                  XVU-  XG5      R                  SS5      n[	        U R                  5       5      S	:w  a  [        S
5      e[	        UR                  5       5      S	:w  a  [        S5      e[	        UR                  5       5      S:w  a  [        S5      eUR                  S5      S:w  a  [        S5      eU R                  5       n UR                  5       nUR                  5       nUR                  5       n[        R                  XX#5      nUR                  SS5      R                  XSU-  U5      nU$ )zH
Performs matrix multiplication of a sparse matrix with a dense matrix.
r   r_   r6   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r<   r9   r5   ,sparse_query must be a 4-dimensional tensor.r`   r7   r8   z8The size of the third dimension of dense_key must be 32.)	r>   r?   rV   rB   r=   rC   rD   r0   sparse_dense_mm)	sparse_queryrG   rc   rH   rX   rY   rf   r;   dense_qk_prods	            r(   ri   ri      s    !* 0J#!kllz)lmmz)kll!!**.DjV``aceghI
<1$GHH
9>>!DEE
7<<>aBCC~~aBSTT**,LkkmG  "G$$&I#33L9fM!++B3;;JZdHdfijMr3   c                 X    X-  U-  [         R                  " XSS9-   R                  5       $ )Nfloorrounding_mode)rS   divrU   )rG   dim_1_blockdim_2_blocks      r(   transpose_indicesrs      s*    "k1EIIgbi4jjpprrr3   c                   H    \ rS rSr\S 5       r\S 5       r\SS j5       rSrg)MraSampledDenseMatMul   c                 N    [        XX45      nU R                  XU5        X@l        U$ r#   )ra   save_for_backwardrX   )ctxrb   rc   rG   rX   rF   s         r(   forwardMraSampledDenseMatMul.forward   s)    %kgRkg>#r3   c                     U R                   u  p#nU R                  nUR                  S5      U-  nUR                  S5      U-  n[        XFU5      n[	        UR                  SS5      XU5      n	[	        XX65      n
XS S 4$ Nr   r<   r9   )saved_tensorsrX   r>   rs   ri   rB   )ry   gradrb   rc   rG   rX   rH   rI   	indices_Tgrad_key
grad_querys              r(   backwardMraSampledDenseMatMul.backward   s    *-*;*;'^^
%**1-;!q)Z7%gN	"4>>"b#99S`a$TIO
T4//r3   c                 .    [         R                  XX#5      $ r#   )ru   apply)rb   rc   rG   rX   s       r(   operator_call#MraSampledDenseMatMul.operator_call   s    $**;7WWr3   r$   Nr8   	__name__
__module____qualname____firstlineno__staticmethodrz   r   r   __static_attributes__r$   r3   r(   ru   ru      s>      0 0 X Xr3   ru   c                   D    \ rS rSr\S 5       r\S 5       r\S 5       rSrg)MraSparseDenseMatMul   c                 N    [        XX45      nU R                  XU5        X@l        U$ r#   )ri   rx   rH   )ry   rj   rG   rc   rH   rF   s         r(   rz   MraSparseDenseMatMul.forward   s*    (	[lY?-r3   c                     U R                   u  p#nU R                  nUR                  S5      UR                  S5      -  n[        X5U5      n[	        UR                  SS5      XqU5      n[        XU5      n	U	S US 4$ r}   )r~   rH   r>   rs   ri   rB   ra   )
ry   r   rj   rG   rc   rH   rI   r   r   r   s
             r(   r   MraSparseDenseMatMul.backward   s~    +.+<+<(y--!q)\->->r-BB%gN	"<#9#9"b#A9Tab!$7;
44//r3   c                 .    [         R                  XX#5      $ r#   )r   r   )rj   rG   rc   rH   s       r(   r   "MraSparseDenseMatMul.operator_call   s    #)),\\r3   r$   Nr   r$   r3   r(   r   r      s>      0 0 ] ]r3   r   c                   $    \ rS rSr\S 5       rSrg)MraReduceSum   c                    U R                  5       u  pEpg[        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  5       u    pvnUR                  5       u  pEU R                  SS9R	                  XE-  U5      n [
        R                  " UR                  S5      [
        R                  UR                  S9n[
        R                  " XSS	9R                  5       US S 2S 4   U-  -   R	                  XE-  5      n	[
        R                  " XB-  U4U R                  U R                  S9n
U
R                  SX5      R	                  XBU5      nUR	                  XBU-  5      nU$ )
Nr5   rh   r6   r7   r:   r   rO   rm   rn   )r>   r=   r?   sumrV   rS   rT   rU   rQ   rp   zerosrP   	index_add)rj   rG   rH   rI   rY   r[   rX   re   r\   global_idxestempoutputs               r(   r   MraReduceSum.operator_call   sb   /;/@/@/B,
z|  "#q(KLLw||~!#FGG*//11! '
#''A'.66z7MzZLLa

7>>Z	IIgGDIIKiXY[_X_N`crNrr
'*(
) 	 {{):6l>P>PYeYlYl
 <>FFzdno
j,HIr3   r$   N)r   r   r   r   r   r   r   r$   r3   r(   r   r      s     r3   r   c                    U R                  5       u  pVnXb-  nSn	Ub  UR                  XXU5      R                  SS9n
U R                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  nUR                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  nUb/  UR                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  n	OU[        R                  " XX[        R
                  U R                  S9-  n
U R                  XXX'5      R                  SS9nUR                  XXX'5      R                  SS9nUb  UR                  XXX'5      R                  SS9n	[        R                  " XR                  SS5      5      [        R                  " U5      -  nUR                  SSS9R                  nUb0  US	U
SS2SSS24   U
SS2SS2S4   -  S
:  R                  5       -  -
  nXX4$ )z'
Compute low resolution approximation.
Nr<   r:   r9   ư>rO   T)r;   keepdims     @g      ?)r>   rV   r   rS   onesfloatrQ   meanmatmulrB   mathsqrtr@   rA   )querykeyrX   rW   valuerY   rZ   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r(   get_low_resolution_logitr     s    %*JJL!J-Ill:*MQQVXQYMM*VZZ_aZb1d
#d*
	 ++jZRVV[]V^1d
#d*
 jZZ^^ce^fAq$J'$.I !5::jSXS^S^glgsgs#ttMM*V[[`b[c	++jZRWW\^W_jZZ__df_gI <<	3D3DR3LMPTPYPYZbPcc#7#;#;T#;#R#Y#Y  3;q$z+B[QRTUW[Q[E\+\`c*c)j)j)l#ll 	  .JUUr3   c                    U R                   u  pVnUS:  a]  US-  n[        R                  " XfU R                  S9n	[        R                  " [        R
                  " X* S9US9n
X
SSS2SS24   S-  -   n US:  a:  U SS2SU2SS24   S-   U SS2SU2SS24'   U SS2SS2SU24   S-   U SS2SS2SU24'   [        R                  " U R                  US5      USSS	S
9nUR                  nUS:X  a@  UR                  R                  SS9R                  nXSS2SS4   :  R                  5       nX4$ US:X  a  SnX4$ [        U S35      e)zR
Compute the indices of the subset of components to be used in the approximation.
r   r6   rQ   )diagonalNg     @r<   TF)r;   largestsortedfullr:   sparsez# is not a valid approx_model value.)rR   rS   r   rQ   triltriutopkrV   rG   rA   minr   r?   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrY   total_blocks_per_rowre   offset	temp_maskdiagonal_mask
top_k_valsrG   	thresholdhigh_resolution_masks                  r(   get_block_idxesr   7  s    +?*D*D'Ja&*0A5JJ3RfRmRmn	

5::i'#JU[\3D!QJ6ORU6UU#a' $A%A$A1!DEK 	Q =!= =q@A !A'D(D'D!DEK 	Q#@$@#@@A $$Z4jbRV_dJ   Gf%%))b)188	 4!T4-8P PWWY (( 
	 # (( K=(KLMMr3   c	                    [         c$  [        R                  " U 5      R                  5       $ U R	                  5       u  ppX-  nX-  S:w  a  [        S5      eX-  nU R                  XU5      n UR                  XU5      nUR                  XU5      nUb*  XSS2SS2S4   -  n XSS2SS2S4   -  nX#SS2SS2S4   -  nUS:X  a  [        XXcU5      u  nnnnOAUS:X  a0  [        R                  " 5          [        XXc5      u  nnnnSSS5        O[        S5      e[        R                  " 5          WW-
  n[        UUUUU5      u  nnSSS5        [        R                  XWUS9[        R                  " U5      -  n[        UUX5      u  nnUU-
  nUb"  USS	[!        UU5      SS2SS2SS2S4   -
  -  -
  n[        R"                  " U5      n[$        R                  UUX.5      n[&        R                  UUX5      nUS:X  Gax  [        R"                  " WW-
  SW-  -
  5      WSS2SSS24   -  n[        R(                  " UW5      SS2SS2SSS24   R+                  S	S	US	5      R                  XU5      nUR-                  S
S9SS2SS2S4   R+                  S	S	U5      R                  X5      nUR+                  S	S	U5      R                  X5      U-
  n Ub  U U-  n [        R"                  " U U S:*  R/                  5       -  5      n!UU!SS2SS2S4   -  nUU!-  n[        R"                  " U * U S:  R/                  5       -  5      n"UU"SS2SS2S4   -  nUU"-  nUU-   USS2SS2S4   USS2SS2S4   -   S-   -  n#O$US:X  a  UUSS2SS2S4   S-   -  n#O[        S5      eUb  U#USS2SS2S4   -  n#U#R                  XX5      n#U#$ ! , (       d  f       GN= f! , (       d  f       GN= f)z(
Use Mra to approximate self-attention.
Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rX   r   r   r<   r:   r   z-config.approx_mode must be "full" or "sparse")r0   rS   
zeros_likerequires_grad_r>   r?   rV   r   no_grad	Exceptionr   ru   r   r   r   rM   r]   expr   r   r   repeatr   r   )$r   r   r   rW   r   r   rX   r   r   rY   num_headrZ   r   
meta_batchr   r   r   r   r   re   low_resolution_logit_normalizedrG   r   high_resolution_logitrK   rL   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r(   mra2_attentionr   ]  sq    &5577.3jjl+J'&Jq OPP-MM*x8E
++j8
4CMM*x8EQ4Z((At$$Q4Z((fUm
%V
Rk+G 
	 ]]_QiJRN +/KQ _
 @AA	*>A]*]'(7+(+)
%% 
 2??G
 @ 		( ",,A7L]!qH14DD 5q;tU\C]^_abdegk^kCl?l8m m 99%:;3AAgu  ".!;!;g'8" fII*-IICRfLffg!T1*%& 	 LL,i8AtQGVAq*a(WZ(3 	   ###+Aq$J7>>q!ZPXXYcm 	" 6<<Q:NVVWaknvv+d2N#ii.A:M9T9T9V(VW"9<OPQSTVZPZ<["[$=@S$S!$yy.NQ<N;U;U;W)WX#;>RSTVWY]S]>^#^ %?BV%V"14KK&q!Tz25NqRSUYz5ZZ]aa
 
	 04NqRSUYz4Z]a4abGHH%Q4Z(88!))*RMS _ 
s   1N?,O?
O
O c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )MraEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c           	      f  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  S-   UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      S-   5        [+        USS5      U l        U R#                  S[$        R.                  " U R0                  R3                  5       [$        R4                  U R0                  R6                  S	9S
S9  g )N)padding_idxr6   epsposition_ids)r   r<   position_embedding_typeabsolutetoken_type_idsrO   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrS   rT   expandgetattrr   r   r   r>   rU   rQ   selfconfig	__class__s     r(   r   MraEmbeddings.__init__  s?   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ilm-mn'.v7PR\']$KK))..0

4K\K\KcKcd 	 	
r3   c                 `   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  S:X  a  U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr<   r   r   r   rO   r   )r>   r   hasattrr   r  rS   r   rU   rQ   r   r   r   r   r   r  )r  	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r(   rz   MraEmbeddings.forward  s:    #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r3   )r   r  r   r   r   r   )NNNN	r   r   r   r   __doc__r   rz   r   __classcell__r	  s   @r(   r   r     s    Q
(   r3   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )MraSelfAttentioni
  c                 
  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      e[        S Ln[        5       (       a0  [        5       (       a!  [        5       (       a  U(       d   [        5         UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        ["        R$                  " UR                  U R                   5      U l        ["        R$                  " UR                  U R                   5      U l        ["        R$                  " UR                  U R                   5      U l        ["        R,                  " UR.                  5      U l        Ub  UOUR2                  U l        UR4                  S-  UR6                  -  U l        [;        U R8                  [        UR4                  S-  S-  5      5      U l        UR<                  U l        UR>                  U l        UR@                  U l         g ! [         a#  n[        R                  SU 35         S nAGNS nAff = f)	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r8   r6   )!r   r   r   num_attention_headsr  r?   r0   r   r   r   r2   r   loggerwarningrD   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr  r   r   block_per_rowr[   r   r   r   r   )r  r  r   kernel_loadeder	  s        r(   r   MraSelfAttention.__init__  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 (t3"$$)9););@R@T@T]jn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'>'J#PVPnPn 	$ !88B>&BVBVVT^^S&2P2PTV2V[\1\-]^!--,2,O,O)/5/U/U,+  n!hijhklmmns   !
I 
JI==Jc                 @   UR                   u  p4nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nSUS-  -   nUR                  5       R                  SU R                  S5      R                  X0R                  -  U5      R                  5       nSn	U R                  U	:  a  X0R                  XIU R                  -
  4n
[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9n[!        UR#                  5       UR#                  5       UR#                  5       UR#                  5       U R$                  U R&                  U R(                  U R*                  S	9nU R                  U	:  a  US S 2S S 2S S 2S U R                  24   nUR                  X0R                  X@R                  5      nUR-                  S
SSS5      R/                  5       nUR1                  5       S S U R2                  4-   nUR                  " U6 nU4nU$ )Nr<   r   r6         ?r   r8   r   r:   )r   r   r   r   r   r9   )rR   r   viewr  r!  rB   r   r   squeezer   rV   rD   rS   catr   rQ   r   r   r[   r   r   r   permuterC   r>   r"  )r  hidden_statesattention_maskrY   rZ   re   query_layer	key_layervalue_layergpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                 r(   rz   MraSelfAttention.forward.  s   !.!4!4
QJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ~77""$VAt//3WZ":"::GDSU	 	 ##m3!#;#;WVZVnVnFnnH))[%++hOaOa2b$ciklK		9ekk(K[K[.\"]cefI))[%++hOaOa2b$ciklK&OO  "NN(()-)J)J,0,P,P	
 ##m3)!Q3MT5M5M3M*MNM%--j:R:RT[]u]uv%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD "r3   )r"  r   r!  r  r   r   r   r  r[   r   r   r   r#   r   r   r   r   r   rz   r   r  r  s   @r(   r  r  
  s    !VF< <r3   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )MraSelfOutputin  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr   )r   r   r   r#  r   denser   r   r   r  r  r  s     r(   r   MraSelfOutput.__init__o  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r3   r/  input_tensorreturnc                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r#   r>  r  r   r  r/  r@  s      r(   rz   MraSelfOutput.forwardu  5    

=1]3}'CDr3   r   r>  r  
r   r   r   r   r   rS   Tensorrz   r   r  r  s   @r(   r;  r;  n  6    >U\\  RWR^R^  r3   r;  c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )MraAttentioni|  c                 |   > [         TU ]  5         [        XS9U l        [	        U5      U l        [        5       U l        g )N)r   )r   r   r  r  r;  r   setpruned_heads)r  r  r   r	  s      r(   r   MraAttention.__init__}  s0    $V]	#F+Er3   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r:   )r=   r   r  r  r!  rO  r   r   r   r   r   r>  r"  union)r  headsindexs      r(   prune_headsMraAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r3   c                 d    U R                  X5      nU R                  US   U5      nU4USS  -   nU$ Nr   r   )r  r   )r  r/  r0  self_outputsattention_outputr7  s         r(   rz   MraAttention.forward  s>    yy?;;|AF#%QR(88r3   )r   rO  r  r#   )	r   r   r   r   r   rU  rz   r   r  r  s   @r(   rL  rL  |  s    ";$ r3   rL  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r#   )r   r   r   r#  r   intermediate_sizer>  
isinstance
hidden_actstrr   intermediate_act_fnr  s     r(   r   MraIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r3   r/  rA  c                 J    U R                  U5      nU R                  U5      nU$ r#   r>  rc  r  r/  s     r(   rz   MraIntermediate.forward  s&    

=100?r3   rf  rH  r  s   @r(   r]  r]    s(    9U\\ ell  r3   r]  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )	MraOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r=  )r   r   r   r#  r_  r   r>  r   r   r   r  r  r  s     r(   r   MraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r3   r/  r@  rA  c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r#   rC  rD  s      r(   rz   MraOutput.forward  rF  r3   rG  rH  r  s   @r(   rj  rj    rJ  r3   rj  c                   8   ^  \ rS rSrU 4S jrSS jrS rSrU =r$ )MraLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        [        U5      U l        [        U5      U l
        g Nr   )r   r   chunk_size_feed_forwardseq_len_dimrL  	attentionadd_cross_attentionr]  intermediaterj  r   r  s     r(   r   MraLayer.__init__  sW    '-'E'E$%f-#)#=#= +F3'r3   c                     U R                  X5      nUS   nUSS  n[        U R                  U R                  U R                  U5      nU4U-   nU$ rX  )ru  r   feed_forward_chunkrs  rt  )r  r/  r0  self_attention_outputsrZ  r7  layer_outputs          r(   rz   MraLayer.forward  sa    !%!N1!4(,0##T%A%A4CSCSUe
  /G+r3   c                 J    U R                  U5      nU R                  X!5      nU$ r#   )rw  r   )r  rZ  intermediate_outputr|  s       r(   rz  MraLayer.feed_forward_chunk  s)    "//0@A{{#6Ir3   )rv  ru  rs  rw  r   rt  r#   )	r   r   r   r   r   rz   rz  r   r  r  s   @r(   rp  rp    s    ( r3   rp  c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )
MraEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r   r   r  r   
ModuleListrangenum_hidden_layersrp  layergradient_checkpointing)r  r  re   r	  s      r(   r   MraEncoder.__init__  sR    ]]eFD\D\>]#^>]HV$4>]#^_
&+# $_s   A%c                     U(       a  SOS n[        U R                  5       H  u  pxU(       a  Xa4-   nU" X5      n	U	S   nM      U(       a  Xa4-   nU(       d  [        S X4 5       5      $ [        UUS9$ )Nr$   r   c              3   ,   #    U H  oc  M  Uv   M     g 7fr#   r$   ).0vs     r(   	<genexpr>%MraEncoder.forward.<locals>.<genexpr>  s     X$Fq$Fs   	)last_hidden_stater/  )	enumerater  tupler   )
r  r/  r0  	head_maskoutput_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss
             r(   rz   MraEncoder.forward  s     #7BD(4OA#$58H$H!(GM)!,M  5   14D DX]$FXXX1++
 	
r3   )r  r  r  )NNFTr9  r  s   @r(   r  r    s     , "
 
r3   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r=  )r   r   r   r#  r   r>  r`  ra  rb  r   transform_act_fnr   r   r  s     r(   r   #MraPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr3   r/  rA  c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r#   )r>  r  r   rg  s     r(   rz   "MraPredictionHeadTransform.forward  s4    

=1--m<}5r3   )r   r>  r  rH  r  s   @r(   r  r    s)    UU\\ ell  r3   r  c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )MraLMPredictionHeadi  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)bias)r   r   r  	transformr   r#  r   r   decoder	ParameterrS   r   r  r  s     r(   r   MraLMPredictionHead.__init__  sm    3F; yy!3!3V5F5FUSLLV->->!?@	 !IIr3   c                 :    U R                   U R                  l         g r#   )r  r  r  s    r(   _tie_weights MraLMPredictionHead._tie_weights  s     IIr3   c                 J    U R                  U5      nU R                  U5      nU$ r#   )r  r  rg  s     r(   rz   MraLMPredictionHead.forward   s$    }5]3r3   )r  r  r  )	r   r   r   r   r   r  rz   r   r  r  s   @r(   r  r    s    && r3   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraOnlyMLMHeadi'  c                 B   > [         TU ]  5         [        U5      U l        g r#   )r   r   r  predictionsr  s     r(   r   MraOnlyMLMHead.__init__(  s    .v6r3   sequence_outputrA  c                 (    U R                  U5      nU$ r#   r  )r  r  prediction_scoress      r(   rz   MraOnlyMLMHead.forward,  s     ,,_=  r3   r  rH  r  s   @r(   r  r  '  s(    7!u|| ! ! !r3   r  c                   J    \ rS rSr% \\S'   SrSrS\R                  4S jr
Srg)	MraPreTrainedModeli1  r  r!   Tmodulec                 h   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        g[        U[        5      (       a%  UR                  R                  R                  5         gg)zInitialize the weightsg        )r   stdNr*  )r  initializer_ranger`  r   r#  weightdatanormal_r  zero_r   r   r   fill_r  )r  r  r  s      r(   _init_weights MraPreTrainedModel._init_weights8  s.   kk++fbii(( MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--KK""$MM$$S) 344KK""$ 5r3   r$   N)r   r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   Moduler  r   r$   r3   r(   r  r  1  s&     &*#%BII %r3   r  c                   8  ^  \ rS rSrU 4S jrS rS rS r\        SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )MraModeliL  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r#   )r   r   r  r   r  r  encoder	post_initr  s     r(   r   MraModel.__init__N  s9     '/!&) 	r3   c                 .    U R                   R                  $ r#   r  r   r  s    r(   get_input_embeddingsMraModel.get_input_embeddingsX  s    ...r3   c                 $    XR                   l        g r#   r  )r  r   s     r(   set_input_embeddingsMraModel.set_input_embeddings[  s    */'r3   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r  ru  rU  )r  heads_to_pruner  rS  s       r(   _prune_headsMraModel._prune_heads^  s<    
 +002LELLu%//;;EB 3r3   r  r0  r   r   r  r  r  r  rA  c	                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb"  U R	                  X5        UR                  5       n	O"Ub  UR                  5       S S n	O[        S5      eU	u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U	[        R                  US9nU R                  X)5      nU R!                  XPR                   R"                  5      nU R                  UUUUS9nU R%                  UUUUUS9nUS	   nU(       d	  U4US
S  -   $ ['        UUR(                  UR*                  UR,                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer<   z5You have to specify either input_ids or inputs_embedsr   r   rO   )r  r   r   r  )r0  r  r  r  r   r   )r  r/  
attentionscross_attentions)r  r  use_return_dictr?   %warn_if_padding_and_no_attention_maskr>   rQ   rS   r   r  r  r   r  r   rU   get_extended_attention_maskget_head_maskr  r  r   r/  r  r  )r  r  r0  r   r   r  r  r  r  r  rY   r  rQ   r  r  extended_attention_maskembedding_outputencoder_outputsr  s                      r(   rz   MraModel.forwardf  s    %9$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m &&y++2O2OP	??%)'	 + 
 ,,2!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r3   )r  r  r  )NNNNNNNN)r   r   r   r   r   r  r  r  r   r   rS   rI  boolr   r  r   rz   r   r  r  s   @r(   r  r  L  s    /0C  -11515/3,004/3&*J
ELL)J
 !.J
 !.	J

 u||,J
 ELL)J
  -J
 'tnJ
 d^J
 
u88	9J
 J
r3   r  c                   Z  ^  \ rS rSrSS/rU 4S jrS rS r\         SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )MraForMaskedLMi  zcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r#   )r   r   r  r!   r  clsr  r  s     r(   r   MraForMaskedLM.__init__  s4     F#!&) 	r3   c                 B    U R                   R                  R                  $ r#   )r  r  r  r  s    r(   get_output_embeddings$MraForMaskedLM.get_output_embeddings  s    xx##+++r3   c                     XR                   R                  l        UR                  U R                   R                  l        g r#   )r  r  r  r  )r  new_embeddingss     r(   set_output_embeddings$MraForMaskedLM.set_output_embeddings  s*    '5$$2$7$7!r3   r  r0  r   r   r  r  labelsr  r  rA  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                  U
R                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Nr0  r   r   r  r  r  r  r   r<   r   losslogitsr/  r  )
r  r  r!   r  r   r+  r   r   r/  r  )r  r  r0  r   r   r  r  r  r  r  r7  r  r  masked_lm_lossloss_fctr   s                   r(   rz   MraForMaskedLM.forward  s    & &1%<k$++B]B](())%'!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r3   )r  r!   	NNNNNNNNN)r   r   r   r   _tied_weights_keysr   r  r  r   r   rS   rI  r  r   r  r   rz   r   r  r  s   @r(   r  r    s   :<Z[,8  -11515/3,004)-/3&*0
ELL)0
 !.0
 !.	0

 u||,0
 ELL)0
  -0
 &0
 'tn0
 d^0
 
un$	%0
 0
r3   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MraClassificationHeadi  z-Head for sentence-level classification tasks.c                 8  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        Xl        g r#   )r   r   r   r#  r   r>  r   r  r  
num_labelsout_projr  r  s     r(   r   MraClassificationHead.__init__   se    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr3   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        U R                  R                     " U5      nU R                  U5      nU R                  U5      nU$ )Nr   )r  r>  r   r  ra  r  )r  featureskwargsxs       r(   rz   MraClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r3   )r  r>  r  r  r  r  s   @r(   r  r    s    7 r3   r  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   F  ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r#   )r   r   r  r  r!   r  
classifierr  r  s     r(   r   %MraForSequenceClassification.__init__  sA      ++F#/7 	r3   r  r0  r   r   r  r  r  r  r  rA  c
                 .   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                   U
R"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr<   r  )r  r  r!   r  problem_typer  rP   rS   rU   rD   r	   r,  r   r+  r   r   r/  r  )r  r  r0  r   r   r  r  r  r  r  r7  r  r  r  r  r   s                   r(   rz   $MraForSequenceClassification.forward"  s   & &1%<k$++B]B](())%'!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r3   )r  r!   r  r  )r   r   r   r   r   r   r   rS   rI  r  r   r  r   rz   r   r  r  s   @r(   r  r    s      -11515/3,004)-/3&*A
ELL)A
 !.A
 !.	A

 u||,A
 ELL)A
  -A
 &A
 'tnA
 d^A
 
u..	/A
 A
r3   r  c                   F  ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForMultipleChoiceig  c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  S5      U l        U R                  5         g rr  )
r   r   r  r!   r   r#  r   pre_classifierr  r  r  s     r(   r   MraForMultipleChoice.__init__i  s_     F# ii(:(:F<N<NO))F$6$6: 	r3   r  r0  r   r   r  r  r  r  r  rA  c
                    U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   n
Ub!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nUSS2S4   nU R                  U5      n[        R                  " 5       " U5      nU R                  U5      nUR                  SU
5      nSnUb  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r<   r9   r  r   r  )r  r  rR   r+  r>   r!   r  r   ReLUr  r   r   r/  r  )r  r  r0  r   r   r  r  r  r  r  num_choicesr7  hidden_statepooled_outputr  reshaped_logitsr  r  r   s                      r(   rz   MraForMultipleChoice.forwards  s   V &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 (())%'!5#  	
 qz$QT*++M:	-0/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r3   )r  r!   r  r  )r   r   r   r   r   r   r   rS   rI  r  r   r  r   rz   r   r  r  s   @r(   r  r  g  s      -11515/3,004)-/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 'tnX
 d^X
 
u//	0X
 X
r3   r  c                   F  ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r#   )r   r   r  r  r!   r   r   r  r  r#  r   r  r  r  s     r(   r   "MraForTokenClassification.__init__  si      ++F#zz&"<"<=))F$6$68I8IJ 	r3   r  r0  r   r   r  r  r  r  r  rA  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nU R	                  U5      nSnUb  [        5       nUb  UR                  S5      S:H  nUR                  SU R                  5      n[        R                  " XR                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                  U
R                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r<   r   r  )r  r  r!   r  r  r   r+  r  rS   wheretensorignore_indextype_asr   r/  r  )r  r  r0  r   r   r  r  r  r  r  r7  r  r  r  r  active_lossactive_logitsactive_labelsr   s                      r(   rz   !MraForTokenClassification.forward  sf   " &1%<k$++B]B](())%'!5#  	
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r3   )r  r  r!   r  r  )r   r   r   r   r   r   r   rS   rI  r  r   r  r   rz   r   r  r  s   @r(   r$  r$    s    	  -11515/3,004)-/3&*9
ELL)9
 !.9
 !.	9

 u||,9
 ELL)9
  -9
 &9
 'tn9
 d^9
 
u++	,9
 9
r3   r$  c                   f  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForQuestionAnsweringi  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g )Nr6   )
r   r   r  r  r!   r   r#  r   
qa_outputsr  r  s     r(   r    MraForQuestionAnswering.__init__  s[      ++F#))F$6$68I8IJ 	r3   r  r0  r   r   r  r  start_positionsend_positionsr  r  rA  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  pUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU
(       d  X4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r<   r:   )r*  r6   )r  start_logits
end_logitsr/  r  )r  r  r!   r3  splitr,  r=   r>   clampr   r   r/  r  )r  r  r0  r   r   r  r  r5  r6  r  r  r7  r  r  r8  r9  
total_lossignored_indexr  
start_lossend_lossr   s                         r(   rz   MraForQuestionAnswering.forward'  s    &1%<k$++B]B](())%'!5#  	
 "!*1#)<<r<#: #++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r3   )r!   r  r3  )
NNNNNNNNNN)r   r   r   r   r   r   r   rS   rI  r  r   r  r   rz   r   r  r  s   @r(   r1  r1    s   
  -11515/3,0042604/3&*<
ELL)<
 !.<
 !.	<

 u||,<
 ELL)<
  -<
 "%,,/<
  -<
 'tn<
 d^<
 
u22	3<
 <
r3   r1  )r  r  r1  r  r$  rp  r  r  r   )NN)r8   r   r   )Pr  r   pathlibr   typingr   r   rS   torch.utils.checkpointr   torch.nnr   r   r	   torch.utils.cpp_extensionr
   activationsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_mrar   
get_loggerr   r  r0   r2   rM   r]   ra   ri   rs   autogradFunctionru   r   r   r   r   r   r  r   r  r;  rL  r]  rj  rp  r  r  r  r  r  r  r  r  r  r  r$  r1  __all__r$   r3   r(   <module>rQ     s      "    A A * ! 9  . l l k k ( 
		H	%	C&8.%OP%PsXENN33 X0]5>>22 ]. :%VP#)Z !"$%pf7BII 7t`ryy `HBII 299 Bbii  		 ) :!
 !
J $")) 0!RYY ! % % %2 d
! d
 d
N D
' D
 D
PBII * L
#5 L
L
^ d
- d
 d
N F
 2 F
 F
R J
0 J
 J
Z	r3   