
    Ph              	          d Z ddlmZmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej0                  e      Zd<dededee   defdZ ed       ed      fdedededefdZ G d dej>                        Z  G d dej>                        Z! G d dej>                        Z" G d dej>                        Z# G d  d!ej>                        Z$ G d" d#ej>                        Z% G d$ d%ej>                        Z& G d& d'e      Z' G d( d)ej>                        Z(e G d* d+e             Z)e G d, d-e)             Z* ed./       G d0 d1e)             Z+ G d2 d3ej>                        Z, G d4 d5ej>                        Z- G d6 d7ej>                        Z. ed8/       G d9 d:e)             Z/g d;Z0y)=zPyTorch MobileViTV2 model.    )OptionalUnionN)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging   )MobileViTV2Configvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
       g?)maxint)r   r   r   	new_values       n/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisibler   *   sS     	Is57Q;#677BWLMI3;W	y>    z-infinfmin_valmax_valc                 .    t        |t        ||             S N)r   minr   r    r!   s      r   clipr&   9   s    wGU+,,r   c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eeef   ddf fdZde	j                  de	j                  fdZ xZS )MobileViTV2ConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r*   r+   r,   r-   paddingr0   r.   r/   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r5   	__class__s               r   r=   zMobileViTV2ConvLayer.__init__?   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S r#   )r@   rB   rE   )rG   rI   s     r   forwardzMobileViTV2ConvLayer.forwardu   sK    ##H-)))(3H??&x0Hr   )r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   rD   r=   torchTensorrK   __classcell__rH   s   @r   r(   r(   >   s     "&+/4#!4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4#l  r   r(   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTV2InvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r)   r*   r+   r-   r0   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )N   )r   r   zInvalid stride .r   )r*   r+   r,   r   )r*   r+   r,   r-   r.   r0   Fr*   r+   r,   r2   )r<   r=   r   r   roundexpand_ratior>   use_residualr(   
expand_1x1conv_3x3
reduce_1x1)rG   r)   r*   r+   r-   r0   expanded_channelsrH   s          r   r=   z$MobileViTV2InvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J.:KYZ
 -)*$
 /)% 
r   rI   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S r#   )r]   r^   r_   r\   )rG   rI   residuals      r   rK   z#MobileViTV2InvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr   )r   rL   rM   rN   __doc__r   r   r=   rP   rQ   rK   rR   rS   s   @r   rU   rU      sc    
 lm
'
69
IL
VY
eh
	
BF F Fr   rU   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTV2MobileNetLayerr)   r*   r+   r-   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r*   r+   r-   )r<   r=   r   
ModuleListlayerrangerU   append)	rG   r)   r*   r+   r-   rg   irj   rH   s	           r   r=   z"MobileViTV2MobileNetLayer.__init__   sc     	]]_
z"A/')!"avQ	E JJe$&K #r   rI   c                 8    | j                   D ]
  } ||      } |S r#   rj   )rG   rI   layer_modules      r   rK   z!MobileViTV2MobileNetLayer.forward   s     JJL#H-H 'r   )r   r   
rL   rM   rN   r   r   r=   rP   rQ   rK   rR   rS   s   @r   rf   rf      sV    qr'''69'IL'VY'kn'	'   r   rf   c                   h     e Zd ZdZdededdf fdZdej                  dej                  fdZ	 xZ
S )	MobileViTV2LinearSelfAttentionay  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://huggingface.co/papers/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    r)   	embed_dimr   Nc           	          t         |           t        ||dd|z  z   dddd      | _        t	        j
                  |j                        | _        t        |||dddd      | _        || _        y )Nr   r   TF)r)   r*   r+   r/   r,   r1   r2   p)	r<   r=   r(   qkv_projr   Dropoutattn_dropoutout_projrt   )rG   r)   rt   rH   s      r   r=   z'MobileViTV2LinearSelfAttention.__init__   s{    ,!a)m,# 
 JJ)<)<=,!"# 
 #r   hidden_statesc                    | j                  |      }t        j                  |d| j                  | j                  gd      \  }}}t        j                  j
                  j                  |d      }| j                  |      }||z  }t        j                  |dd      }t        j                  j
                  j                  |      |j                  |      z  }| j                  |      }|S )Nr   )split_size_or_sectionsdimr   Tr   keepdim)rx   rP   splitrt   r   
functionalsoftmaxrz   sumrelu	expand_asr{   )	rG   r|   qkvquerykeyr   context_scorescontext_vectorouts	            r   rK   z&MobileViTV2LinearSelfAttention.forward   s    mmM*
 "KKQX\XfXfDgmnosE ,,44U4C**>: ~->r4H hh!!&&u-0H0H0OOmmC 
r   rc   rS   s   @r   rs   rs      s>    	#0 #S #T #2U\\ ell r   rs   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2FFNr)   rt   ffn_latent_dimffn_dropoutr   Nc           
          t         |           t        |||ddddd      | _        t	        j
                  |      | _        t        |||ddddd      | _        t	        j
                  |      | _        y )Nr   TF)r)   r*   r+   r,   r-   r/   r1   r2   )	r<   r=   r(   conv1r   ry   dropout1conv2dropout2)rG   r)   rt   r   r   rH   s        r   r=   zMobileViTV2FFN.__init__  s|     	)!'#	

 

;/)&"# 	

 

;/r   r|   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r#   )r   r   r   r   )rG   r|   s     r   rK   zMobileViTV2FFN.forward'  s@    

=1m4

=1m4r           rL   rM   rN   r   r   floatr=   rP   rQ   rK   rR   rS   s   @r   r   r     sY     !0!0 0 	0
 0 
0@U\\ ell r   r   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2TransformerLayerr)   rt   r   dropoutr   Nc                 P   t         |           t        j                  d||j                        | _        t        ||      | _        t        j                  |      | _	        t        j                  d||j                        | _
        t        ||||j                        | _        y )Nr   
num_groupsnum_channelsr8   rv   )r<   r=   r   	GroupNormlayer_norm_epslayernorm_beforers   	attentionry   r   layernorm_afterr   r   ffn)rG   r)   rt   r   r   rH   s        r   r=   z$MobileViTV2TransformerLayer.__init__0  s~     	 "	W]WlWl m7	J

W-!||qyV\VkVkl!&)^VEWEWXr   r|   c                     | j                  |      }| j                  |      }||z   }| j                  |      }| j                  |      }||z   }|S r#   )r   r   r   r   )rG   r|   layernorm_1_outattention_outputlayer_outputs        r   rK   z#MobileViTV2TransformerLayer.forward>  sY    //>>>/:(=8++M:xx-#m3r   r   r   rS   s   @r   r   r   /  s^     Y!Y Y 	Y
 Y 
Y	U\\ 	ell 	r   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2Transformerr)   n_layersd_modelr   Nc                 <   t         	|           |j                  }||z  g|z  }|D cg c]  }t        |dz  dz         }}t	        j
                         | _        t        |      D ].  }t        ||||         }| j                  j                  |       0 y c c}w )N   )rt   r   )
r<   r=   ffn_multiplierr   r   ri   rj   rk   r   rl   )
rG   r)   r   r   r   ffn_dimsd	block_idxtransformer_layerrH   s
            r   r=   zMobileViTV2Transformer.__init__K  s    .."W,-8 2::ACbB':]]_
xI ;'(9:M! JJ/0	 ) ;s   Br|   c                 8    | j                   D ]
  } ||      } |S r#   ro   )rG   r|   rp   s      r   rK   zMobileViTV2Transformer.forward\  s      JJL(7M 'r   rq   rS   s   @r   r   r   J  sA    10 1C 1# 1RV 1"U\\ ell r   r   c                       e Zd ZdZ	 	 	 ddededededededed	d
f fdZdej                  d	e	ej                  e	eef   f   fdZ
dej                  de	eef   d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTV2LayerzE
    MobileViTV2 layer: https://huggingface.co/papers/2206.02680
    r)   r*   r+   attn_unit_dimn_attn_blocksr0   r-   r   Nc                    t         	|           |j                  | _        |j                  | _        |}|dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                  |      | _	        t        |||ddd      | _
        t        |||      | _        t        j                  d||j                        | _        t        |||dd	d      | _        y )
Nr   r   )r*   r+   r-   r0   )r*   r+   r,   r.   F)r*   r+   r,   r1   r2   )r   r   r   T)r<   r=   
patch_sizepatch_widthpatch_heightrU   downsampling_layerr(   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)
rG   r)   r*   r+   r   r   r0   r-   cnn_out_dimrH   s
            r   r=   zMobileViTV2Layer.__init__g  s    	!,,"--#Q;&A')!)QvA*2Q,QA'D# 'K&*D# -#$//
 -#$# 
 2&-Zgh TZTiTij  4#$"  
r   feature_mapc                 "   |j                   \  }}}}t        j                  j                  || j                  | j
                  f| j                  | j
                  f      }|j                  ||| j                  | j
                  z  d      }|||ffS )N)r,   r-   r   )shaper   r   unfoldr   r   reshape)rG   r   
batch_sizer*   
img_height	img_widthpatchess          r   	unfoldingzMobileViTV2Layer.unfolding  s    9D9J9J6
KY--&&**D,<,<=%%t'7'78 ' 

 //*k4;L;LtO_O_;_acdY///r   r   output_sizec                     |j                   \  }}}}|j                  |||z  |      }t        j                  j	                  ||| j
                  | j                  f| j
                  | j                  f      }|S )N)r   r,   r-   )r   r   r   r   foldr   r   )rG   r   r   r   in_dimr   	n_patchesr   s           r   foldingzMobileViTV2Layer.folding  sz    4;MM1
FJ	//*fz.A9Mmm((#**D,<,<=%%t'7'78	 ) 
 r   rI   c                 6   | j                   r| j                  |      }| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }|S r#   )r   r   r   r   r   r   r   r   )rG   rI   r   r   s       r   rK   zMobileViTV2Layer.forward  s    ""..x8H ==*==*  $~~h7 ""7+..) <<5''1r   )r   r   r   )rL   rM   rN   rd   r   r   r=   rP   rQ   tupler   r   rK   rR   rS   s   @r   r   r   b  s     ;
!;
 ;
 	;

 ;
 ;
 ;
 ;
 
;
z	0U\\ 	0eELL%PSUXPX/<Y6Z 	0u|| %S/ ell   r   r   c                   d     e Zd Zdeddf fdZ	 	 d	dej                  dededee	e
f   fdZ xZS )
MobileViTV2Encoderr)   r   Nc           	         t         |           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        t        d|j                  z  dd      dd	      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }	t        d|j                  z  d
      }
t        |||dd      }| j
                  j                  |       t        |||dd      }| j
                  j                  |       t        |||t        |j                  d   |j                  z  d
      |j                  d         }| j
                  j                  |       |r|dz  }t        |||	t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       |r|dz  }t        ||	|
t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       y )NFrW   Tr   r       @   r%   r   r   r         i     )r*   r+   r-   rg   r   r   )r*   r+   r   r   )r*   r+   r   r   r0   )r<   r=   r)   r   ri   rj   gradient_checkpointingoutput_strider   r&   width_multiplierrf   rl   r   base_attn_unit_dimsr   )rG   r)   dilate_layer_4dilate_layer_5r0   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rH   s                   r   r=   zMobileViTV2Encoder.__init__  s|   ]]_
&+# +0/1$!N!N!!R'!N$rF333RLVWce
 %R&*A*A%A2N$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN+#$
 	

'"+#$
 	

'""#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"r   r|   output_hidden_statesreturn_dictc                     |rdnd }t        | j                        D ]  \  }} ||      }|s||fz   } |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wr#   r   ).0vs     r   	<genexpr>z-MobileViTV2Encoder.forward.<locals>.<genexpr>5  s     X$Fq!-$Fs   )last_hidden_stater|   )	enumeraterj   r   r
   )rG   r|   r   r   all_hidden_statesrm   rp   s          r   rK   zMobileViTV2Encoder.forward&  sj     #7BD(4OA|(7M#$58H$H!	  5 X]4E$FXXX-]noor   )FT)rL   rM   rN   r   r=   rP   rQ   rO   r   r   r
   rK   rR   rS   s   @r   r   r     sb    O#0 O#T O#h &+ 	p||p #p 	p
 
u44	5pr   r   c                   N    e Zd ZU eed<   dZdZdZdgZde	j                  ddfd	Zy)
MobileViTV2PreTrainedModelr)   mobilevitv2pixel_valuesTr   moduler   Nc                    t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsr   )meanstdNg      ?)rC   r   Linearr?   rA   weightdatanormal_r)   initializer_ranger/   zero_r   fill_)rG   r  s     r   _init_weightsz(MobileViTV2PreTrainedModel._init_weightsB  s    fryy"))R^^DE MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r   )rL   rM   rN   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler  r   r   r   r  r  :  s9    %$O&*#+,
*BII 
*$ 
*r   r  c                        e Zd Zd
dedef fdZd Ze	 	 	 ddee	j                     dee   dee   deeef   fd	       Z xZS )MobileViTV2Modelr)   expand_outputc           	         t         |   |       || _        || _        t	        t        d|j                  z  dd      dd      }t        ||j                  |ddd	d	
      | _	        t        |      | _        | j                          y)a  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model. If `True`, the model will output pooled features in addition to
            hidden states. If `False`, only the hidden states will be returned.
        r   r   r   r%   rW   r   r   r   Tr*   r+   r,   r-   r1   r2   N)r<   r=   r)   r  r   r&   r   r(   r   	conv_stemr   encoder	post_init)rG   r)   r  r   rH   s       r   r=   zMobileViTV2Model.__init__Q  s     	 *$rF333RLVWce
 .++$"
 *&1 	r   c                     |j                         D ]e  \  }}| j                  j                  |   }t        |t              s0|j
                  j                  D ]  }|j                  j                  |        g y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr  rj   rC   r   r   r   prune_heads)rG   heads_to_prunelayer_indexheadsmobilevitv2_layerr   s         r   _prune_headszMobileViTV2Model._prune_headsm  si     #1"6"6"8K $ 2 2; ?+-=>):)F)F)L)L%%//;;EB *M #9r   r  r   r   r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r |d   }t        j                  |ddgd      }n|d   }d }|s|||fn|f}||dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr   r   r   r   Fr   r   )r   pooler_outputr|   )r)   r   use_return_dictr>   r  r  r  rP   r	  r   r|   )	rG   r  r   r   embedding_outputencoder_outputsr   pooled_outputoutputs	            r   rK   zMobileViTV2Model.forwardw  s     %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  / 2 "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r   )T)NNN)rL   rM   rN   r   rO   r=   r(  r   r   rP   rQ   r   r   r   rK   rR   rS   s   @r   r  r  O  s~    0  8C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r   r  z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     dee	   deej                     dee	   de
eef   f
d	       Z xZS )!MobileViTV2ForImageClassificationr)   r   Nc                 L   t         |   |       |j                  | _        t        |      | _        t        d|j                  z  d      }|j                  dkD  r!t        j                  ||j                        nt        j                         | _
        | j                          y )Nr   rW   r   r   )in_featuresout_features)r<   r=   
num_labelsr  r  r   r   r   r  Identity
classifierr   )rG   r)   r+   rH   s      r   r=   z*MobileViTV2ForImageClassification.__init__  s      +++F3%cF,C,C&CQO   1$ II,V=N=NO 	 	r   r  r   labelsr   c                 B   ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  |      }d}|| j                  ||| j                         }|s|f|dd z   }	||f|	z   S |	S t        |||j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr*  r   r   )losslogitsr|   )r)   r-  r  r,  r:  loss_functionr   r|   )
rG   r  r   r;  r   outputsr0  r>  r=  r1  s
             r   rK   z)MobileViTV2ForImageClassification.forward  s     &1%<k$++B]B]""<FZhs"t1<--'!*/%%ffdkkBDY,F)-)9TGf$EvE3!//
 	
r   NNNN)rL   rM   rN   r   r=   r   r   rP   rQ   rO   r   r   r   rK   rR   rS   s   @r   r4  r4    s    0 T "  04/3)-&*!
u||,!
 'tn!
 &	!

 d^!
 
u::	;!
 !
r   r4  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2ASPPPoolingr)   r*   r+   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )r   Tr   r  )r<   r=   r   AdaptiveAvgPool2dglobal_poolr(   r   )rG   r)   r*   r+   rH   s       r   r=   zMobileViTV2ASPPPooling.__init__  sB    //A>,#%"!
r   rI   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr+  bilinearFsizemodealign_corners)r   rF  r   r   r   interpolate)rG   rI   spatial_sizes      r   rK   zMobileViTV2ASPPPooling.forward  sS    ~~bc*##H-==*==,,XLzin,or   rq   rS   s   @r   rC  rC    sB    
0 
s 
RU 
Z^ 
  r   rC  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2ASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r)   r   Nc                    t         |           t        d|j                  z  d      }|}|j                  }t        |j                        dk7  rt        d      t        j                         | _
        t        |||dd      }| j                  j                  |       | j                  j                  |j                  D cg c]  }t        |||d|d	       c}       t        |||      }| j                  j                  |       t        |d
|z  |dd      | _        t        j                   |j"                        | _        y c c}w )Nr   rW   r   r   z"Expected 3 values for atrous_ratesr   r   rY   )r*   r+   r,   r0   r2      rv   )r<   r=   r   r   aspp_out_channelslenatrous_ratesr>   r   ri   convsr(   rl   extendrC  projectry   aspp_dropout_probr   )	rG   r)   encoder_out_channelsr*   r+   in_projectionrate
pool_layerrH   s	           r   r=   zMobileViTV2ASPP.__init__  s=   -cF4K4K.KUVW*//v""#q(ABB]]_
,#%!
 	

-(

 #//
 0D % +!- !!#) 0
	
 ,FKN


*%+L 0|YZkq
 zzF$<$<=)
s   ErI   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S )Nr   r   )rV  rl   rP   catrX  r   )rG   rI   pyramidconvpooled_featuress        r   rK   zMobileViTV2ASPP.forward)  sW    JJDNN4>* ))G+,,w/,,7r   
rL   rM   rN   rd   r   r=   rP   rQ   rK   rR   rS   s   @r   rP  rP    s8    *>0 *>T *>X  r   rP  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2DeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r)   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r*   r+   r,   r1   r2   r/   )r<   r=   rP  asppr   	Dropout2dclassifier_dropout_probr   r(   rS  r8  r:  rG   r)   rH   s     r   r=   zMobileViTV2DeepLabV3.__init__:  s]    #F+	||F$B$BC.00**# 
r   r|   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )rg  r   r:  )rG   r|   rI   s      r   rK   zMobileViTV2DeepLabV3.forwardJ  s6    99]2./<<)??8,r   rc  rS   s   @r   re  re  5  s7    
0 
T 
 U\\ ell r   re  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     deej                     dee	   dee	   de
eef   f
d	       Z xZS )"MobileViTV2ForSemanticSegmentationr)   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r  )r<   r=   r8  r  r  re  segmentation_headr   rj  s     r   r=   z+MobileViTV2ForSemanticSegmentation.__init__W  sE      +++F%H!5f!= 	r   r  r;  r   r   c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}|Yt        j                  j                  ||j                  dd dd	      }	t        | j                   j                  
      }
 |
|	|      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t        |||r|j                  d      S dd      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr*  r+  rH  FrI  )ignore_indexr   )r=  r>  r|   
attentions)r)   r   r-  r8  r>   r  r|   ro  r   r   rM  r   r   semantic_loss_ignore_indexr   )rG   r  r;  r   r   r@  encoder_hidden_statesr>  r=  upsampled_logitsloss_fctr1  s               r   rK   z*MobileViTV2ForSemanticSegmentation.forwarda  ss   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO""!%# # 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r   rA  )rL   rM   rN   r   r=   r   r   rP   rQ   rO   r   r   r   rK   rR   rS   s   @r   rm  rm  Q  s    0 T   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r   rm  )r4  rm  r  r  )rW   N)1rd   typingr   r   rP   r   torch.nnr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   utilsr   r   configuration_mobilevitv2r   
get_loggerrL   loggerr   r   r   r&   r  r(   rU   rf   rs   r   r   r   r   r   r  r  r4  rC  rP  re  rm  __all__r   r   r   <module>r     s  " ! "   % ! 9  . , 8 
		H	%#  HSM UX  ).fe - - - -Y^ -
=299 =B-F")) -Fb		 .<RYY <~&RYY &R")) 6RYY 0o1 odcp cpL * * *( O
1 O
 O
d 4
(B 4
4
pRYY 09bii 9z299 8 
U
)C U

U
pr   