
    <h              	          S r SSKrSSKJrJr  SSKrSSKrSSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJrJr  SSKJrJrJr  SSKJr  \R@                  " \!5      r"S>S\#S\#S\\#   S\#4S jjr$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RJ                  5      r( " S S\RJ                  5      r) " S S\RJ                  5      r* " S S\RJ                  5      r+ " S  S!\RJ                  5      r, " S" S#\RJ                  5      r- " S$ S%\RJ                  5      r. " S& S'\RJ                  5      r/ " S( S)\5      r0 " S* S+\RJ                  5      r1\ " S, S-\5      5       r2\ " S. S/\25      5       r3\" S0S19 " S2 S3\25      5       r4 " S4 S5\RJ                  5      r5 " S6 S7\RJ                  5      r6 " S8 S9\RJ                  5      r7\" S:S19 " S; S<\25      5       r8/ S=Qr9g)?zPyTorch MobileViT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    Uc  Un[        U[        XS-  -   5      U-  U-  5      nUSU -  :  a  X1-  n[        U5      $ )z
Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
original TensorFlow repo. It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
   g?)maxint)r   r   r   	new_values       h/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler"   ,   sO     	Is5Q;#677BWLMI3;	y>    c                      ^  \ rS rSr      SS\S\S\S\S\S\S\S	\S
\S\\\4   SS4U 4S jjjr	S\
R                  S\
R                  4S jrSrU =r$ )MobileViTConvLayer;   configin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 D  > [         TU ]  5         [        US-
  S-  5      U-  nX&-  S:w  a  [        SU SU S35      eX6-  S:w  a  [        SU SU S35      e[        R
                  " UUUUUUUUSS	9	U l        U	(       a  [        R                  " US
SSSS9U l        OS U l        U
(       an  [        U
[        5      (       a  [        U
   U l        g [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g S U l        g )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r(   r)   r*   r+   paddingr.   r,   r-   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr
   
activation
hidden_act)selfr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r3   	__class__s               r!   r;   MobileViTConvLayer.__init__<   s,    	{Q!+,x71$/}<STZS[[cdee A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#.."("8F--s33"():):";"("3"3"DOr#   featuresc                     U R                  U5      nU R                  b  U R                  U5      nU R                  b  U R                  U5      nU$ N)r>   r@   rC   )rE   rH   s     r!   forwardMobileViTConvLayer.forwardr   sK    ##H-)))(3H??&x0Hr#   )rC   r>   r@   )r   r   Fr   TT)__name__
__module____qualname____firstlineno__r   r   boolr   rB   r;   torchTensorrK   __static_attributes____classcell__rF   s   @r!   r%   r%   ;   s     "&+/4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4#l   r#   r%   c                      ^  \ rS rSrSr SS\S\S\S\S\SS	4U 4S
 jjjrS\R                  S\R                  4S jr
SrU =r$ )MobileViTInvertedResidual{   zQ
Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
r'   r(   r)   r+   r.   r   Nc           
      6  > [         TU ]  5         [        [        [	        X!R
                  -  5      5      S5      nUS;  a  [        SU S35      eUS:H  =(       a    X#:H  U l        [        XUSS9U l	        [        UUUSUUUS9U l
        [        UUUSS	S
9U l        g )N   )r   r   zInvalid stride .r   r(   r)   r*   r	   )r(   r)   r*   r+   r,   r.   Fr(   r)   r*   r0   )r:   r;   r"   r   roundexpand_ratior<   use_residualr%   
expand_1x1conv_3x3
reduce_1x1)rE   r'   r(   r)   r+   r.   expanded_channelsrF   s          r!   r;   "MobileViTInvertedResidual.__init__   s     	*3u[CVCV5V/W+XZ[\vha899#q[K{/J,:KYZ
 +)*$
 -)% 
r#   rH   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a  X!-   $ U$ rJ   )rb   rc   rd   ra   )rE   rH   residuals      r!   rK   !MobileViTInvertedResidual.forward   sG    ??8,==*??8,&*&7&7x"EXEr#   )rc   rb   rd   ra   r   )rM   rN   rO   rP   __doc__r   r   r;   rR   rS   rK   rT   rU   rV   s   @r!   rX   rX   {   sn    
 jk
%
47
GJ
TW
cf
	
 
BF F F Fr#   rX   c                      ^  \ rS rSr SS\S\S\S\S\SS4U 4S	 jjjrS
\R                  S\R                  4S jr	Sr
U =r$ )MobileViTMobileNetLayer   r'   r(   r)   r+   
num_stagesr   Nc                    > [         TU ]  5         [        R                  " 5       U l        [        U5       H4  n[        UUUUS:X  a  UOSS9nU R                  R                  U5        UnM6     g )Nr   r   )r(   r)   r+   )r:   r;   r   
ModuleListlayerrangerX   append)	rE   r'   r(   r)   r+   ro   irr   rF   s	           r!   r;    MobileViTMobileNetLayer.__init__   sc     	]]_
z"A-')!"avQ	E JJe$&K #r#   rH   c                 <    U R                    H  nU" U5      nM     U$ rJ   rr   )rE   rH   layer_modules      r!   rK   MobileViTMobileNetLayer.forward   s     JJL#H-H 'r#   rx   )r   r   rM   rN   rO   rP   r   r   r;   rR   rS   rK   rT   rU   rV   s   @r!   rm   rm      s`    op'%'47'GJ'TW'il'	' '    r#   rm   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTSelfAttention   r'   hidden_sizer   Nc                 r  > [         TU ]  5         X!R                  -  S:w  a  [        SU SUR                   S35      eUR                  U l        [	        X!R                  -  5      U l        U R                  U R
                  -  U l        [        R                  " X R                  UR                  S9U l
        [        R                  " X R                  UR                  S9U l        [        R                  " X R                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   zThe hidden size z4 is not a multiple of the number of attention heads r\   )r-   )r:   r;   num_attention_headsr<   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrE   r'   r   rF   s      r!   r;   MobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{5O5O'O#P !558P8PPYY{,>,>V__U
99[*<*<6??SYY{,>,>V__U
zz&"E"EFr#   hidden_statesc                    UR                   u  p#nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      n[        R                  " XVR                  SS5      5      nU[        R                  " U R                  5      -  n[        R                  R                  USS9n	U R                  U	5      n	[        R                  " X5      n
U
R!                  SSSS5      R#                  5       n
U
R%                  5       S S U R&                  4-   nU
R                  " U6 n
U
$ )Nr   r   dimr   r	   )shaper   viewr   r   	transposer   r   rR   matmulmathsqrtr   
functionalsoftmaxr   permute
contiguoussizer   )rE   r   
batch_size
seq_length_query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes               r!   rK   MobileViTSelfAttention.forward   s   $1$7$7!
JJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDr#   )r   r   r   r   r   r   r   r{   rV   s   @r!   r}   r}      sA    G GS GT G&"U\\ "ell " "r#   r}   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTSelfOutput   r'   r   r   Nc                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g rJ   r:   r;   r   r   denser   hidden_dropout_probr   r   s      r!   r;   MobileViTSelfOutput.__init__   s4    YY{8
zz&"<"<=r#   r   c                 J    U R                  U5      nU R                  U5      nU$ rJ   r   r   rE   r   s     r!   rK   MobileViTSelfOutput.forward  s$    

=1]3r#   r   r{   rV   s   @r!   r   r      s=    > >S >T >
U\\ ell  r#   r   c                      ^  \ rS rSrS\S\SS4U 4S jjrS\\   SS4S jrS	\	R                  S\	R                  4S
 jrSrU =r$ )MobileViTAttentioni  r'   r   r   Nc                    > [         TU ]  5         [        X5      U l        [	        X5      U l        [        5       U l        g rJ   )r:   r;   r}   	attentionr   outputsetpruned_headsr   s      r!   r;   MobileViTAttention.__init__  s0    /D)&>Er#   headsc                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)rE   r   indexs      r!   prune_headsMobileViTAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r#   r   c                 J    U R                  U5      nU R                  U5      nU$ rJ   )r   r   )rE   r   self_outputsattention_outputs       r!   rK   MobileViTAttention.forward   s%    ~~m4;;|4r#   )r   r   r   )rM   rN   rO   rP   r   r   r;   r   r   rR   rS   rK   rT   rU   rV   s   @r!   r   r     sT    " "S "T ";S ;d ;$ U\\  ell    r#   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTIntermediatei&  r'   r   intermediate_sizer   Nc                    > [         TU ]  5         [        R                  " X#5      U l        [        UR                  [        5      (       a  [        UR                     U l	        g UR                  U l	        g rJ   )
r:   r;   r   r   r   rA   rD   rB   r
   intermediate_act_fnrE   r'   r   r   rF   s       r!   r;   MobileViTIntermediate.__init__'  sR    YY{>
f''--'-f.?.?'@D$'-'8'8D$r#   r   c                 J    U R                  U5      nU R                  U5      nU$ rJ   r   r   r   s     r!   rK   MobileViTIntermediate.forward/  s&    

=100?r#   r   r{   rV   s   @r!   r   r   &  sF    9 9S 9UX 9]a 9U\\ ell  r#   r   c                      ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S	\R                  S\R                  4S
 jr	Sr
U =r$ )MobileViTOutputi5  r'   r   r   r   Nc                    > [         TU ]  5         [        R                  " X25      U l        [        R
                  " UR                  5      U l        g rJ   r   r   s       r!   r;   MobileViTOutput.__init__6  s5    YY0>
zz&"<"<=r#   r   input_tensorc                 R    U R                  U5      nU R                  U5      nX-   nU$ rJ   r   )rE   r   r   s      r!   rK   MobileViTOutput.forward;  s,    

=1]3%4r#   r   r{   rV   s   @r!   r   r   5  sT    > >S >UX >]a >
U\\  RWR^R^  r#   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformerLayeriB  r'   r   r   r   Nc                   > [         TU ]  5         [        X5      U l        [	        XU5      U l        [        XU5      U l        [        R                  " X!R                  S9U l        [        R                  " X!R                  S9U l        g )Nr6   )r:   r;   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r!   r;   "MobileViTTransformerLayer.__init__C  sg    +F@1&GXY%f;LM "[>S>S T!||K=R=RSr#   r   c                     U R                  U R                  U5      5      nX!-   nU R                  U5      nU R                  U5      nU R	                  X15      nU$ rJ   )r   r   r   r   r   )rE   r   r   layer_outputs       r!   rK   !MobileViTTransformerLayer.forwardK  sX    >>$*?*?*NO(8++M:((6{{<?r#   )r   r   r   r   r   r{   rV   s   @r!   r   r   B  sK    T TS TUX T]a TU\\ ell  r#   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformeriU  r'   r   ro   r   Nc           	         > [         TU ]  5         [        R                  " 5       U l        [        U5       H>  n[        UU[        X!R                  -  5      S9nU R                  R                  U5        M@     g )N)r   r   )
r:   r;   r   rq   rr   rs   r   r   	mlp_ratiort   )rE   r'   r   ro   r   transformer_layerrF   s         r!   r;   MobileViTTransformer.__init__V  sa    ]]_
z"A 9'"%k4D4D&D"E!
 JJ/0 #r#   r   c                 <    U R                    H  nU" U5      nM     U$ rJ   rx   )rE   r   ry   s      r!   rK   MobileViTTransformer.forwardb  s      JJL(7M 'r#   rx   r{   rV   s   @r!   r   r   U  sE    
1 
1S 
1c 
1VZ 
1U\\ ell  r#   r   c                     ^  \ rS rSrSr SS\S\S\S\S\S\S	\S
S4U 4S jjjrS\R                  S
\
\R                  \4   4S jrS\R                  S\S
\R                  4S jrS\R                  S
\R                  4S jrSrU =r$ )MobileViTLayerih  z;
MobileViT block: https://huggingface.co/papers/2110.02178
r'   r(   r)   r+   r   ro   r.   r   Nc           	        > [         TU ]  5         UR                  U l        UR                  U l        US:X  a(  [        UUUUS:X  a  UOSUS:  a  US-  OSS9U l        UnOS U l        [        UUUUR                  S9U l	        [        UUUSSSS9U l
        [        UUUS9U l        [        R                  " XQR                  S9U l        [        XUSS9U l        [        USU-  X!R                  S9U l        g )	Nr   r   )r(   r)   r+   r.   r]   F)r(   r)   r*   r/   r0   )r   ro   r   )r:   r;   
patch_sizepatch_widthpatch_heightrX   downsampling_layerr%   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rE   r'   r(   r)   r+   r   ro   r.   rF   s	           r!   r;   MobileViTLayer.__init__m  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 k7L7LM1+ST 
 )KkWnWn
r#   rH   c                 n   U R                   U R                  p2[        X#-  5      nUR                  u  pVpx[        R
                  R                  5       (       a$  [        [        R                  " Xs-  5      U-  5      O#[        [        R                  " Xs-  5      U-  5      n	[        R
                  R                  5       (       a$  [        [        R                  " X-  5      U-  5      O#[        [        R                  " X-  5      U-  5      n
SnX:w  d  X:w  a#  [        R                  R                  XU
4SSS9nSnX-  nX-  nX-  nUR                  XV-  U-  X<U5      nUR                  SS5      nUR                  XVX5      nUR                  SS5      nUR                  XT-  US5      nXx4UUUUUUS	.nUU4$ )
NFbilinearr   modealign_cornersTr   r   r	   r   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   r   rR   jit
is_tracingr   ceilr   r   r   r  reshaper   )rE   rH   r   r   
patch_arear   r  orig_height
orig_width
new_height	new_widthr  num_patch_widthnum_patch_heightr  patches	info_dicts                    r!   	unfoldingMobileViTLayer.unfolding  s   $($4$4d6G6G\34
8@5
k yy##%% ejj!;<|KLTYY{9:\IJ 	 yy##%% ejj!9:[HITYYz78;FG 	 "j&?}}00I6ZW\ 1 H K $2%5&8 ""!$44lU`
 ##Aq)//*P##Aq)//*"9;K &2$ &&!0"2
	 	!!r#   r  r  c                    U R                   U R                  pC[        X4-  5      nUS   nUS   nUS   nUS   n	US   n
UR                  5       R	                  XeUS5      nUR                  SS5      nUR                  Xg-  U	-  XU5      nUR                  SS	5      nUR                  XgX-  X-  5      nUS
   (       a"  [        R                  R                  XS   SSS9nU$ )Nr   r  r  r  r  r   r   r	   r   r  r  r   Fr   )
r   r   r   r   r   r   r  r   r   r  )rE   r  r  r   r   r  r   r  r  r  r  rH   s               r!   foldingMobileViTLayer.folding  s   $($4$4d6G6G\34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44oU`
 %%a+##"2"A?C`
 ]#}}005JV[ 1 H r#   c                    U R                   (       a  U R                  U5      nUnU R                  U5      nU R                  U5      nU R                  U5      u  p4U R	                  U5      nU R                  U5      nU R                  X45      nU R                  U5      nU R                  [        R                  " X!4SS95      nU$ Nr   r   )r   r   r   r  r   r   r  r   r   rR   cat)rE   rH   rh   r  r  s        r!   rK   MobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy()=1EFr#   )	r   r   r   r   r   r   r   r   r   rj   )rM   rN   rO   rP   rk   r   r   r;   rR   rS   tupledictr  r  rK   rT   rU   rV   s   @r!   r   r   h  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
 8
t1"%,, 1"5t9K3L 1"fu||   :   r#   r   c                   t   ^  \ rS rSrS\SS4U 4S jjr  SS\R                  S\S\S\	\
\4   4S	 jjrS
rU =r$ )MobileViTEncoderi  r'   r   Nc           
        > [         T
U ]  5         Xl        [        R                  " 5       U l        SU l        S=p#UR                  S:X  a  SnSnOUR                  S:X  a  SnSn[        UUR                  S   UR                  S   SSS9nU R
                  R                  U5        [        UUR                  S   UR                  S   SS	S9nU R
                  R                  U5        [        UUR                  S   UR                  S	   SUR                  S   SS
9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S	   UR                  S   SUR                  S   SUS9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S   UR                  S   SUR                  S   S	US9n	U R
                  R                  U	5        g )NFr[   T   r   r   )r(   r)   r+   ro   r   r	   )r(   r)   r+   r   ro      )r(   r)   r+   r   ro   r.      )r:   r;   r'   r   rq   rr   gradient_checkpointingoutput_striderm   neck_hidden_sizesrt   r   hidden_sizes)rE   r'   dilate_layer_4dilate_layer_5r.   layer_1layer_2layer_3layer_4layer_5rF   s             r!   r;   MobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r#   r   output_hidden_statesreturn_dictc                     U(       a  SOS n[        U R                  5       H  u  pVU" U5      nU(       d  M  XA4-   nM     U(       d  [        S X4 5       5      $ [        XS9$ )N c              3   ,   #    U H  oc  M  Uv   M     g 7frJ   r5  ).0vs     r!   	<genexpr>+MobileViTEncoder.forward.<locals>.<genexpr>k  s     X$Fq$Fs   	)last_hidden_stater   )	enumeraterr   r  r   )rE   r   r2  r3  all_hidden_statesru   ry   s          r!   rK   MobileViTEncoder.forward\  sc     #7BD(4OA(7M##$58H$H!	  5 X]$FXXX-oor#   )r'   r&  rr   )FT)rM   rN   rO   rP   r   r;   rR   rS   rQ   r   r  r   rK   rT   rU   rV   s   @r!   r!  r!    sg    H# H#4 H#Z &+ 	p||p #p 	p
 
u44	5p pr#   r!  c                   X    \ rS rSr% \\S'   SrSrSrS/r	S\
R                  SS	4S
 jrSrg	)MobileViTPreTrainedModelip  r'   	mobilevitpixel_valuesTr   moduler   Nc                 (   [        U[        R                  [        R                  [        R                  45      (       ak  UR
                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        gg)zInitialize the weightsg        )meanstdNg      ?)rA   r   r   r=   r?   weightdatanormal_r'   initializer_ranger-   zero_r   fill_)rE   rC  s     r!   _init_weights&MobileViTPreTrainedModel._init_weightsx  s    fryy"))R^^DEE MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r#   r5  )rM   rN   rO   rP   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   ModulerM  rT   r5  r#   r!   r@  r@  p  s9    #$O&*#)*
*BII 
*$ 
*r#   r@  c                      ^  \ rS rSrSS\S\4U 4S jjjrS r\   SS\	\
R                     S\	\   S\	\   S	\\\4   4S
 jj5       rSrU =r$ )MobileViTModeli  r'   expand_outputc                 F  > [         TU ]  U5        Xl        X l        [	        UUR
                  UR                  S   SSS9U l        [        U5      U l	        U R                  (       a+  [	        UUR                  S   UR                  S   SS9U l
        U R                  5         g	)
a%  
expand_output (`bool`, *optional*, defaults to `True`):
    Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
    1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
r   r	   r   )r(   r)   r*   r+   r%     r   r]   N)r:   r;   r'   rW  r%   num_channelsr(  	conv_stemr!  encoderconv_1x1_exp	post_init)rE   r'   rW  rF   s      r!   r;   MobileViTModel.__init__  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r#   c                    UR                  5        Hm  u  p#U R                  R                  U   n[        U[        5      (       d  M5  UR
                  R                   H  nUR                  R                  U5        M      Mo     g)zPrunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
N)itemsr\  rr   rA   r   r   r   r   )rE   heads_to_prunelayer_indexr   mobilevit_layerr   s         r!   _prune_headsMobileViTModel._prune_heads  sg     #1"6"6"8K"ll00=O/>::)8)D)D)J)J%%//;;EB *K #9r#   rB  r2  r3  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUUS9nU R                  (       a-  U R                  US   5      n[        R                  " USS/SS9nOUS   nS nU(       d  Ub  Xg4OU4nXSS  -   $ [        UUUR                  S	9$ )
Nz You have to specify pixel_valuesr2  r3  r   r   r   F)r   keepdimr   )r;  pooler_outputr   )r'   r2  use_return_dictr<   r[  r\  rW  r]  rR   rE  r   r   )	rE   rB  r2  r3  embedding_outputencoder_outputsr;  pooled_outputr   s	            r!   rK   MobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFAB///7/')77
 	
r#   )r'   r]  r[  r\  rW  )T)NNN)rM   rN   rO   rP   r   rQ   r;   re  r   r   rR   rS   r   r  r   rK   rT   rU   rV   s   @r!   rV  rV    s     t  >C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r#   rV  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\
   S\\R                     S	\\
   S\\\4   4
S
 jj5       rSrU =r$ )MobileViTForImageClassificationi  r'   r   Nc                 ~  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  SS9U l        UR                  S:  a.  [
        R                  " UR                  S   UR                  5      O[
        R                  " 5       U l        U R                  5         g )NT)inplacer   r   )r:   r;   
num_labelsrV  rA  r   r   classifier_dropout_probr   r   r(  Identity
classifierr^  rE   r'   rF   s     r!   r;   (MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r#   rB  r2  labelsr3  c                 P   Ub  UOU R                   R                  nU R                  XUS9nU(       a  UR                  OUS   nU R	                  U R                  U5      5      nSnUGb  U R                   R                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       n	U R                  S:X  a&  U	" UR                  5       UR                  5       5      nOU	" Xs5      nOU R                   R                  S:X  a=  [        5       n	U	" UR                  SU R                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [!        5       n	U	" Xs5      nU(       d  U4USS -   n
Ub  U4U
-   $ U
$ [#        UUUR$                  S	9$ )
ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrh  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr   )r'   rk  rA  rj  rx  r   problem_typeru  dtyperR   longr   r   squeezer   r   r   r   r   )rE   rB  r2  r{  r3  outputsrn  r  r  loss_fctr   s              r!   rK   'MobileViTForImageClassification.forward  s    &1%<k$++B]B]..fq.r1<--'!*m!<={{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r#   )rx  r   rA  ru  NNNN)rM   rN   rO   rP   r   r;   r   r   rR   rS   rQ   r   r  r   rK   rT   rU   rV   s   @r!   rr  rr    s     4   04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
r#   rr  c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTASPPPoolingi)  r'   r(   r)   r   Nc           
      |   > [         TU ]  5         [        R                  " SS9U l        [        UUUSSSSS9U l        g )Nr   )output_sizeTrelu)r(   r)   r*   r+   r/   r0   )r:   r;   r   AdaptiveAvgPool2dglobal_poolr%   r   )rE   r'   r(   r)   rF   s       r!   r;   MobileViTASPPPooling.__init__*  sB    //A>*#%"!
r#   rH   c                     UR                   SS  nU R                  U5      nU R                  U5      n[        R                  R                  XSSS9nU$ )Nr   r   Fr   )r   r  r   r   r   r  )rE   rH   spatial_sizes      r!   rK   MobileViTASPPPooling.forward9  sQ    ~~bc*##H-==*==,,Xzin,or#   )r   r  r{   rV   s   @r!   r  r  )  sF    
 
S 
PS 
X\ 
   r#   r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTASPPiA  z{
ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
r'   r   Nc                 n  > [         TU ]  5         UR                  S   nUR                  n[	        UR
                  5      S:w  a  [        S5      e[        R                  " 5       U l	        [        UUUSSS9nU R                  R                  U5        U R                  R                  UR
                   Vs/ sH  n[        UUUSUSS9PM     sn5        [        XU5      nU R                  R                  U5        [        USU-  USSS9U l        [        R                  " UR                   S	9U l        g s  snf )
Nr   r	   z"Expected 3 values for atrous_ratesr   r  r^   )r(   r)   r*   r.   r0   r%  )p)r:   r;   r(  aspp_out_channelsr   atrous_ratesr<   r   rq   convsr%   rt   extendr  projectr   aspp_dropout_probr   )rE   r'   r(   r)   in_projectionrate
pool_layerrF   s          r!   r;   MobileViTASPP.__init__F  s-   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
 0D # +!- !!#) 0
	
 *&|L


*%)L 0|YZkq
 zzF$<$<=)
s   4D2rH   c                     / nU R                    H  nUR                  U" U5      5        M     [        R                  " USS9nU R	                  U5      nU R                  U5      nU$ r  )r  rt   rR   r  r  r   )rE   rH   pyramidconvpooled_featuress        r!   rK   MobileViTASPP.forwardq  sW    JJDNN4>* ))G+,,w/,,7r#   )r  r   r  rM   rN   rO   rP   rk   r   r;   rR   rS   rK   rT   rU   rV   s   @r!   r  r  A  s<    )> )>4 )>V   r#   r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTDeepLabV3i|  zB
DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
r'   r   Nc           
         > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        UUR                  UR                  SSSSS9U l        g )Nr   FT)r(   r)   r*   r/   r0   r-   )r:   r;   r  asppr   	Dropout2drv  r   r%   r  ru  rx  ry  s     r!   r;   MobileViTDeepLabV3.__init__  s]    !&)	||F$B$BC,00**# 
r#   r   c                 r    U R                  US   5      nU R                  U5      nU R                  U5      nU$ )Nr   )r  r   rx  )rE   r   rH   s      r!   rK   MobileViTDeepLabV3.forward  s6    99]2./<<)??8,r#   )r  rx  r   r  rV   s   @r!   r  r  |  s;    
 
4 
 U\\ ell  r#   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\R                     S\\
   S	\\
   S\\\4   4
S
 jj5       rSrU =r$ ) MobileViTForSemanticSegmentationi  r'   r   Nc                    > [         TU ]  U5        UR                  U l        [        USS9U l        [        U5      U l        U R                  5         g )NF)rW  )r:   r;   ru  rV  rA  r  segmentation_headr^  ry  s     r!   r;   )MobileViTForSemanticSegmentation.__init__  sD      ++'eD!3F!; 	r#   rB  r{  r2  r3  c                 z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb%  U R                   R                  S:X  a  [	        S5      eU R                  USUS9nU(       a  UR                  OUS   nU R                  U5      nSnUbQ  [        R                  R                  XrR                  SS SSS	9n	[        U R                   R                  S
9n
U
" X5      nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [        UUU(       a  UR                  SS9$ SSS9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
>>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTrh  r   r   Fr   )ignore_indexr   )r  r  r   
attentions)r'   r2  rk  ru  r<   rA  r   r  r   r   r  r   r   semantic_loss_ignore_indexr   )rE   rB  r{  r2  r3  r  encoder_hidden_statesr  r  upsampled_logitsr  r   s               r!   rK   (MobileViTForSemanticSegmentation.forward  sm   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88\\"#.Zu  9   (T[[5[5[\H,5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r#   )rA  ru  r  r  )rM   rN   rO   rP   r   r;   r   r   rR   rS   rQ   r   r  r   rK   rT   rU   rV   s   @r!   r  r    s     4   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r#   r  )rr  r  rV  r@  )r[   N):rk   r   typingr   r   rR   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrM   loggerr   r"   rT  r%   rX   rm   r}   r   r   r   r   r   r   r   r!  r@  rV  rr  r  r  r  r  __all__r5  r#   r!   <module>r     s	  "   "    A A ! 9  . Q 7 7 4 
		H	%#  HSM UX = =@-F		 -F`bii .6RYY 6r	")) 	   >BII 
bii 
		 &299 &f/ fR\pryy \p~ * * *( R
- R
 R
j E
&> E
E
P299 08BII 8v 8 
U
'? U

U
pr#   