
    h              	          d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZ  ej:                  e      Zd>de de dee    de fdZ! G d dejD                        Z# G d dejD                        Z$ G d dejD                        Z% G d dejD                        Z& G d dejD                        Z' G d dejD                        Z( G d  d!ejD                        Z) G d" d#ejD                        Z* G d$ d%ejD                        Z+ G d& d'ejD                        Z, G d( d)e      Z- G d* d+ejD                        Z.e G d, d-e             Z/e G d. d/e/             Z0 ed01       G d2 d3e/             Z1 G d4 d5ejD                        Z2 G d6 d7ejD                        Z3 G d8 d9ejD                        Z4 ed:1       G d; d<e/             Z5g d=Z6y)?zPyTorch MobileViT model.    N)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
       g?)maxint)r   r   r   	new_values       o/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler    +   sS     	Is57Q;#677BWLMI3;W	y>    c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eeef   ddf fdZde	j                  de	j                  fdZ xZS )MobileViTConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r%   r&   r'   r(   paddingr+   r)   r*   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r0   	__class__s               r   r8   zMobileViTConvLayer.__init__;   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr!   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S N)r;   r=   r@   )rB   rD   s     r   forwardzMobileViTConvLayer.forwardq   sK    ##H-)))(3H??&x0Hr!   )r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   r?   r8   torchTensorrG   __classcell__rC   s   @r   r#   r#   :   s     "&+/4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4#l  r!   r#   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTInvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r$   r%   r&   r(   r+   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )N   )r   r   zInvalid stride .r   r%   r&   r'   r   )r%   r&   r'   r(   r)   r+   Fr%   r&   r'   r-   )r7   r8   r    r   roundexpand_ratior9   use_residualr#   
expand_1x1conv_3x3
reduce_1x1)rB   r$   r%   r&   r(   r+   expanded_channelsrC   s          r   r8   z"MobileViTInvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J,:KYZ
 +)*$
 -)% 
r!   rD   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S rF   )rZ   r[   r\   rY   )rB   rD   residuals      r   rG   z!MobileViTInvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr!   r   )rH   rI   rJ   __doc__r   r   r8   rL   rM   rG   rN   rO   s   @r   rQ   rQ   z   sc    
 jk
%
47
GJ
TW
cf
	
BF F Fr!   rQ   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTMobileNetLayerr$   r%   r&   r(   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r%   r&   r(   )r7   r8   r   
ModuleListlayerrangerQ   append)	rB   r$   r%   r&   r(   rd   irg   rC   s	           r   r8   z MobileViTMobileNetLayer.__init__   sc     	]]_
z"A-')!"avQ	E JJe$&K #r!   rD   c                 8    | j                   D ]
  } ||      } |S rF   rg   )rB   rD   layer_modules      r   rG   zMobileViTMobileNetLayer.forward   s     JJL#H-H 'r!   )r   r   
rH   rI   rJ   r   r   r8   rL   rM   rG   rN   rO   s   @r   rc   rc      sV    op'%'47'GJ'TW'il'	'   r!   rc   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfAttentionr$   hidden_sizer   Nc                    t         |           ||j                  z  dk7  rt        d| d|j                   d      |j                  | _        t	        ||j                  z        | _        | j                  | j
                  z  | _        t        j                  || j                  |j                        | _
        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  |j                        | _        y )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rT   )r*   )r7   r8   num_attention_headsr9   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrB   r$   rq   rC   s      r   r8   zMobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{V5O5O'O#P !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
zz&"E"EFr!   hidden_statesc                    |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  ||j                  dd            }|t        j                  | j                        z  }t        j                  j                  |d      }	| j                  |	      }	t        j                  |	|      }
|
j!                  dddd      j#                         }
|
j%                         d d | j&                  fz   } |
j                  | }
|
S )Nr   r   dimr   r   )shaperx   viewrs   rt   	transposery   r   rL   matmulmathsqrtr   
functionalsoftmaxr|   permute
contiguoussizeru   )rB   r~   
batch_size
seq_length_query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes               r   rG   zMobileViTSelfAttention.forward   s   $1$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDr!   rn   rO   s   @r   rp   rp      s<    G GS GT G&"U\\ "ell "r!   rp   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfOutputr$   rq   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rF   r7   r8   r   rv   denserz   hidden_dropout_probr|   r}   s      r   r8   zMobileViTSelfOutput.__init__   s6    YY{K8
zz&"<"<=r!   r~   c                 J    | j                  |      }| j                  |      }|S rF   r   r|   rB   r~   s     r   rG   zMobileViTSelfOutput.forward   s$    

=1]3r!   rn   rO   s   @r   r   r      s8    > >S >T >
U\\ ell r!   r   c                   z     e Zd Zdededdf fdZdee   ddfdZdej                  dej                  fd	Z
 xZS )
MobileViTAttentionr$   rq   r   Nc                     t         |           t        ||      | _        t	        ||      | _        t               | _        y rF   )r7   r8   rp   	attentionr   outputsetpruned_headsr}   s      r   r8   zMobileViTAttention.__init__  s4    /D)&+>Er!   headsc                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   r   rs   rt   r   r   rx   ry   r   r   r   ru   union)rB   r   indexs      r   prune_headszMobileViTAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r!   r~   c                 J    | j                  |      }| j                  |      }|S rF   )r   r   )rB   r~   self_outputsattention_outputs       r   rG   zMobileViTAttention.forward  s%    ~~m4;;|4r!   )rH   rI   rJ   r   r   r8   r   r   rL   rM   rG   rN   rO   s   @r   r   r     sO    " "S "T ";S ;d ;$ U\\  ell  r!   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTIntermediater$   rq   intermediate_sizer   Nc                     t         |           t        j                  ||      | _        t        |j                  t              rt        |j                     | _	        y |j                  | _	        y rF   )
r7   r8   r   rv   r   r>   rA   r?   r   intermediate_act_fnrB   r$   rq   r   rC   s       r   r8   zMobileViTIntermediate.__init__&  sR    YY{,=>
f''-'-f.?.?'@D$'-'8'8D$r!   r~   c                 J    | j                  |      }| j                  |      }|S rF   )r   r   r   s     r   rG   zMobileViTIntermediate.forward.  s&    

=100?r!   rn   rO   s   @r   r   r   %  sA    9 9S 9UX 9]a 9U\\ ell r!   r   c                        e Zd Zdedededdf fdZdej                  dej                  dej                  fd	Z xZ	S )
MobileViTOutputr$   rq   r   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rF   r   r   s       r   r8   zMobileViTOutput.__init__5  s7    YY0+>
zz&"<"<=r!   r~   input_tensorc                 T    | j                  |      }| j                  |      }||z   }|S rF   r   )rB   r~   r   s      r   rG   zMobileViTOutput.forward:  s.    

=1]3%4r!   rn   rO   s   @r   r   r   4  sO    > >S >UX >]a >
U\\  RWR^R^ r!   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerLayerr$   rq   r   r   Nc                 $   t         |           t        ||      | _        t	        |||      | _        t        |||      | _        t        j                  ||j                        | _        t        j                  ||j                        | _        y )Nr3   )r7   r8   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r   r8   z"MobileViTTransformerLayer.__init__B  sq    +FK@1&+GXY%fk;LM "[f>S>S T!||KV=R=RSr!   r~   c                     | j                  | j                  |            }||z   }| j                  |      }| j                  |      }| j	                  ||      }|S rF   )r   r   r   r   r   )rB   r~   r   layer_outputs       r   rG   z!MobileViTTransformerLayer.forwardJ  s\    >>$*?*?*NO(=8++M:((6{{<?r!   rn   rO   s   @r   r   r   A  sF    T TS TUX T]a TU\\ ell r!   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerr$   rq   rd   r   Nc           	          t         |           t        j                         | _        t        |      D ]A  }t        ||t        ||j                  z              }| j                  j                  |       C y )N)rq   r   )
r7   r8   r   rf   rg   rh   r   r   	mlp_ratiori   )rB   r$   rq   rd   r   transformer_layerrC   s         r   r8   zMobileViTTransformer.__init__U  sc    ]]_
z"A 9'"%kF4D4D&D"E!
 JJ/0 #r!   r~   c                 8    | j                   D ]
  } ||      } |S rF   rl   )rB   r~   rm   s      r   rG   zMobileViTTransformer.forwarda  s      JJL(7M 'r!   rn   rO   s   @r   r   r   T  s@    
1 
1S 
1c 
1VZ 
1U\\ ell r!   r   c                        e Zd ZdZ	 ddededededededed	d
f fdZdej                  d	e	ej                  e
f   fdZdej                  de
d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTLayerzC
    MobileViT block: https://huggingface.co/papers/2110.02178
    r$   r%   r&   r(   rq   rd   r+   r   Nc                    t         |           |j                  | _        |j                  | _        |dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                        | _	        t        |||ddd      | _
        t        |||      | _        t        j                  ||j                        | _        t        |||d      | _        t        |d|z  ||j                        | _        y )	Nr   r   )r%   r&   r(   r+   rU   F)r%   r&   r'   r,   r-   )rq   rd   r   )r7   r8   
patch_sizepatch_widthpatch_heightrQ   downsampling_layerr#   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rB   r$   r%   r&   r(   rq   rd   r+   rC   s	           r   r8   zMobileViTLayer.__init__l  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 kv7L7LM1+ST 
 )KkW]WnWn
r!   rD   c                 |   | j                   | j                  }}t        ||z        }|j                  \  }}}}t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }	t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }
d}|
|k7  s|	|k7  r't        j                  j                  ||	|
fdd      }d}|
|z  }|	|z  }||z  }|j                  ||z  |z  |||      }|j                  dd      }|j                  ||||      }|j                  dd      }|j                  ||z  |d      }||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r   r   r   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   r   rL   jit
is_tracingr   ceilr   r   r   r   reshaper   )rB   rD   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r   	unfoldingzMobileViTLayer.unfolding  s   $($4$4d6G6G\|34
8@5
Hk: yy##% ejj|!;<|KLTYY{\9:\IJ 	 yy##% ejjk!9:[HITYYzK78;FG 	 
"jK&?}}00
I6ZW\ 1 H K ${2%5&8 ""!$44lOU`
 ##Aq)//*hZP##Aq)//*z"9;K &z2$ &&!0"2
	 	!!r!   r   r   c                    | j                   | j                  }}t        ||z        }|d   }|d   }|d   }|d   }	|d   }
|j                         j	                  |||d      }|j                  dd      }|j                  ||z  |	z  |
||      }|j                  dd	      }|j                  |||	|z  |
|z        }|d
   r&t        j                  j                  ||d   dd      }|S )Nr   r   r   r   r   r   r   r   r   r   r   r   Fr   )
r   r   r   r   r   r   r   r   r   r   )rB   r   r   r   r   r   r   r   r   r   r   rD   s               r   foldingzMobileViTLayer.folding  s&   $($4$4d6G6G\|34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44o|U`
 %%a+##"2\"A?U`C`
 ]#}}00y5JV[ 1 H r!   c                    | j                   r| j                  |      }|}| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }| j                  t        j                  ||fd            }|S Nr   r   )r   r   r   r   r   r   r   r   r   rL   cat)rB   rD   r_   r   r   s        r   rG   zMobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy(H)=1EFr!   r`   )rH   rI   rJ   ra   r   r   r8   rL   rM   tupledictr   r   rG   rN   rO   s   @r   r   r   g  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
t1"%,, 1"5t9K3L 1"fu||   :  r!   r   c                   d     e Zd Zdeddf fdZ	 	 d	dej                  dededee	e
f   fdZ xZS )
MobileViTEncoderr$   r   Nc           	         t         
|           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        ||j                  d   |j                  d   dd      }| j
                  j                  |       t        ||j                  d   |j                  d   dd	      }| j
                  j                  |       t        ||j                  d   |j                  d	   d|j                  d   d
      }| j
                  j                  |       |r|dz  }t        ||j                  d	   |j                  d   d|j                  d   d|      }| j
                  j                  |       |r|dz  }t        ||j                  d   |j                  d   d|j                  d   d	|      }	| j
                  j                  |	       y )NFrS   T   r   r   )r%   r&   r(   rd   r   r   )r%   r&   r(   rq   rd      )r%   r&   r(   rq   rd   r+      )r7   r8   r$   r   rf   rg   gradient_checkpointingoutput_striderc   neck_hidden_sizesri   r   hidden_sizes)rB   r$   dilate_layer_4dilate_layer_5r+   layer_1layer_2layer_3layer_4layer_5rC   s             r   r8   zMobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r!   r~   output_hidden_statesreturn_dictc                     |rdnd }t        | j                        D ]  \  }} ||      }|s||fz   } |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wrF   r  ).0vs     r   	<genexpr>z+MobileViTEncoder.forward.<locals>.<genexpr>j  s     X$Fq!-$Fs   )last_hidden_stater~   )	enumeraterg   r   r
   )rB   r~   r  r  all_hidden_statesrj   rm   s          r   rG   zMobileViTEncoder.forward[  sj     #7BD(4OA|(7M#$58H$H!	  5 X]4E$FXXX-]noor!   )FT)rH   rI   rJ   r   r8   rL   rM   rK   r   r   r
   rG   rN   rO   s   @r   r   r     sa    H# H#4 H#Z &+ 	p||p #p 	p
 
u44	5pr!   r   c                   N    e Zd ZU eed<   dZdZdZdgZde	j                  ddfd	Zy)
MobileViTPreTrainedModelr$   	mobilevitpixel_valuesTr   moduler   Nc                    t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)r>   r   rv   r:   r<   weightdatanormal_r$   initializer_ranger*   zero_r   fill_)rB   r  s     r   _init_weightsz&MobileViTPreTrainedModel._init_weightsw  s    fryy"))R^^DE MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r!   )rH   rI   rJ   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler'  r  r!   r   r  r  o  s9    #$O&*#)*
*BII 
*$ 
*r!   r  c                        e Zd Zd
dedef fdZd Ze	 	 	 ddee	j                     dee   dee   deeef   fd	       Z xZS )MobileViTModelr$   expand_outputc                 L   t         |   |       || _        || _        t	        ||j
                  |j                  d   dd      | _        t        |      | _	        | j                  r.t	        ||j                  d   |j                  d   d      | _
        | j                          y	)
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r   r   )r%   r&   r'   r(   r     r   rU   N)r7   r8   r$   r0  r#   num_channelsr  	conv_stemr   encoderconv_1x1_exp	post_init)rB   r$   r0  rC   s      r   r8   zMobileViTModel.__init__  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r!   c                     |j                         D ]e  \  }}| j                  j                  |   }t        |t              s0|j
                  j                  D ]  }|j                  j                  |        g y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr5  rg   r>   r   r   r   r   )rB   heads_to_prunelayer_indexr   mobilevit_layerr   s         r   _prune_headszMobileViTModel._prune_heads  sf     #1"6"6"8K"ll00=O/>:)8)D)D)J)J%%//;;EB *K #9r!   r  r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r/| j                  |d         }t        j                  |ddgd      }n|d   }d }|s|||fn|f}||dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr  r  r   r   r   F)r   keepdimr   )r  pooler_outputr~   )r$   r  use_return_dictr9   r4  r5  r0  r6  rL   r  r   r~   )	rB   r  r  r  embedding_outputencoder_outputsr  pooled_outputr   s	            r   rG   zMobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r!   )T)NNN)rH   rI   rJ   r   rK   r8   r=  r   r   rL   rM   r   r   r   rG   rN   rO   s   @r   r/  r/    s}     t >C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r!   r/  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     dee	   deej                     dee	   de
eef   f
d	       Z xZS )MobileViTForImageClassificationr$   r   Nc                 |   t         |   |       |j                  | _        t        |      | _        t        j                  |j                  d      | _        |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                         | _        | j                          y )NT)inplacer   r   )r7   r8   
num_labelsr/  r  r   rz   classifier_dropout_probr|   rv   r  Identity
classifierr7  rB   r$   rC   s     r   r8   z(MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r!   r  r  labelsr  c                 `   ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  | j                  |            }d}|| j                  ||| j                         }|s|f|dd z   }	||f|	z   S |	S t        |||j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr?  r   r   )losslogitsr~   )	r$   rB  r  rA  rN  r|   loss_functionr   r~   )
rB   r  r  rP  r  outputsrE  rS  rR  r   s
             r   rG   z'MobileViTForImageClassification.forward  s     &1%<k$++B]B]..DXfq.r1<--'!*m!<=%%ffdkkBDY,F)-)9TGf$EvE3!//
 	
r!   NNNN)rH   rI   rJ   r   r8   r   r   rL   rM   rK   r   r   r   rG   rN   rO   s   @r   rH  rH    s     4   04/3)-&*!
u||,!
 'tn!
 &	!

 d^!
 
u::	;!
 !
r!   rH  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTASPPPoolingr$   r%   r&   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )output_sizeTrelu)r%   r&   r'   r(   r,   r-   )r7   r8   r   AdaptiveAvgPool2dglobal_poolr#   r   )rB   r$   r%   r&   rC   s       r   r8   zMobileViTASPPPooling.__init__  sB    //A>*#%"!
r!   rD   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr   r   Fr   )r   r]  r   r   r   r   )rB   rD   spatial_sizes      r   rG   zMobileViTASPPPooling.forward%  sS    ~~bc*##H-==*==,,XLzin,or!   rn   rO   s   @r   rX  rX    sA    
 
S 
PS 
X\ 
  r!   rX  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r$   r   Nc                 ~   t         |           |j                  d   }|j                  }t	        |j
                        dk7  rt        d      t        j                         | _	        t        |||dd      }| j                  j                  |       | j                  j                  |j
                  D cg c]  }t        |||d|d       c}       t        |||      }| j                  j                  |       t        |d|z  |dd      | _        t        j                  |j                   	      | _        y c c}w )
Nr   r   z"Expected 3 values for atrous_ratesr   r[  rV   )r%   r&   r'   r+   r-   r  )p)r7   r8   r  aspp_out_channelsr   atrous_ratesr9   r   rf   convsr#   ri   extendrX  projectrz   aspp_dropout_probr|   )rB   r$   r%   r&   in_projectionrate
pool_layerrC   s          r   r8   zMobileViTASPP.__init__2  s/   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
 0D # +!- !!#) 0
	
 *&+|L


*%)L 0|YZkq
 zzF$<$<=)
s   5D:rD   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S r   )rf  ri   rL   r   rh  r|   )rB   rD   pyramidconvpooled_featuress        r   rG   zMobileViTASPP.forward]  sW    JJDNN4>* ))G+,,w/,,7r!   
rH   rI   rJ   ra   r   r8   rL   rM   rG   rN   rO   s   @r   ra  ra  -  s7    )> )>4 )>V  r!   ra  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTDeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r$   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r%   r&   r'   r,   r-   r*   )r7   r8   ra  asppr   	Dropout2drL  r|   r#   rd  rK  rN  rO  s     r   r8   zMobileViTDeepLabV3.__init__m  s]    !&)	||F$B$BC,00**# 
r!   r~   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )ru  r|   rN  )rB   r~   rD   s      r   rG   zMobileViTDeepLabV3.forward}  s6    99]2./<<)??8,r!   rq  rO   s   @r   rs  rs  h  s6    
 
4 
 U\\ ell r!   rs  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     deej                     dee	   dee	   de
eef   f
d	       Z xZS ) MobileViTForSemanticSegmentationr$   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r0  )r7   r8   rK  r/  r  rs  segmentation_headr7  rO  s     r   r8   z)MobileViTForSemanticSegmentation.__init__  sD      ++'eD!3F!; 	r!   r  rP  r  r  c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}|Yt        j                  j                  ||j                  dd dd	      }	t        | j                   j                  
      }
 |
|	|      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t        |||r|j                  d      S dd      S )a{  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr?  r   r   Fr   )ignore_indexr   )rR  rS  r~   
attentions)r$   r  rB  rK  r9   r  r~   r{  r   r   r   r   r   semantic_loss_ignore_indexr   )rB   r  rP  r  r  rU  encoder_hidden_statesrS  rR  upsampled_logitsloss_fctr   s               r   rG   z(MobileViTForSemanticSegmentation.forward  sq   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r!   rV  )rH   rI   rJ   r   r8   r   r   rL   rM   rK   r   r   r   rG   rN   rO   s   @r   ry  ry    s     4   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r!   ry  )rH  ry  r/  r  )rS   N)7ra   r   typingr   r   rL   r   torch.nnr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrH   loggerr   r    r-  r#   rQ   rc   rp   r   r   r   r   r   r   r   r   r  r/  rH  rX  ra  rs  ry  __all__r  r!   r   <module>r     s  "   "   % ! 9  . Q 7 7 4 
		H	%#  HSM UX = =@-F		 -F`bii .6RYY 6r	")) 	   >BII 
bii 
		 &299 &f/ fR\pryy \p~ * * *( R
- R
 R
j 2
&> 2
2
j299 08BII 8v 8 
U
'? U

U
pr!   