"""PyTorch ViTDet backbone."""

import collections.abc
import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BackboneOutput, BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.backbone_utils import BackboneMixin
from .configuration_vitdet import VitDetConfig


logger = logging.get_logger(__name__)


class VitDetEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.pretrain_image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        if config.use_absolute_position_embeddings:
            # Initialize absolute positional embedding with pretrain image size.
            num_positions = num_patches + 1
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size))
        else:
            self.position_embeddings = None

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width):
        """
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        """
        if has_cls_token:
            abs_pos_embeddings = abs_pos_embeddings[:, 1:]
        num_position = abs_pos_embeddings.shape[1]
        size = int(math.sqrt(num_position))
        if size * size != num_position:
            raise ValueError("Absolute position embeddings must be a square number.")

        if torch.jit.is_tracing() or size != height or size != width:
            new_abs_pos_embeddings = nn.functional.interpolate(
                abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )
            return new_abs_pos_embeddings.permute(0, 2, 3, 1)
        else:
            return abs_pos_embeddings.reshape(1, height, width, -1)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values)

        if self.position_embeddings is not None:
            # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
            embeddings = embeddings.permute(0, 2, 3, 1)
            # add position embeddings
            embeddings = embeddings + self.get_absolute_positions(
                self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2]
            )
            # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
            embeddings = embeddings.permute(0, 3, 1, 2)

        return embeddings


@torch.jit.script_if_tracing
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    if rel_pos.shape[0] != max_rel_dist:
        # Interpolate rel position embeddings.
        rel_pos_resized = nn.functional.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]


def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`tuple[int]`):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    queries_height, queries_width = q_size
    keys_height, keys_width = k_size
    relative_height = get_rel_pos(queries_height, keys_height, rel_pos_h)
    relative_width = get_rel_pos(queries_width, keys_width, rel_pos_w)

    batch_size, _, dim = queries.shape
    r_q = queries.reshape(batch_size, queries_height, queries_width, dim)
    relative_height = torch.einsum("bhwc,hkc->bhwk", r_q, relative_height)
    relative_weight = torch.einsum("bhwc,wkc->bhwk", r_q, relative_width)

    attn = (
        attn.view(batch_size, queries_height, queries_width, keys_height, keys_width)
        + relative_height[:, :, :, :, None]
        + relative_weight[:, :, :, None, :]
    ).view(batch_size, queries_height * queries_width, keys_height * keys_width)

    return attn


class VitDetAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config, input_size=None):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        """
        super().__init__()

        dim = config.hidden_size
        num_heads = config.num_attention_heads

        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, hidden_state, output_attentions=False):
        batch_size, height, width, _ = hidden_state.shape
        # qkv with shape (3, batch_size, num_heads, height * width, num_channels)
        qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # queries, keys and values have shape (batch_size * num_heads, height * width, num_channels)
        queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0)

        attention_scores = (queries * self.scale) @ keys.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            attention_scores = add_decomposed_relative_positions(
                attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attention_probs = attention_scores.softmax(dim=-1)

        hidden_state = attention_probs @ values
        hidden_state = hidden_state.view(batch_size, self.num_heads, height, width, -1)
        hidden_state = hidden_state.permute(0, 2, 3, 1, 4)
        hidden_state = hidden_state.reshape(batch_size, height, width, -1)
        hidden_state = self.proj(hidden_state)

        if output_attentions:
            # reshape attentions back to (batch_size, num_heads, seq_len, seq_len)
            attention_probs = attention_probs.reshape(
                batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1]
            )
            outputs = (hidden_state, attention_probs)
        else:
            outputs = (hidden_state,)

        return outputs


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class VitDetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class VitDetLayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class VitDetResBottleneckBlock(nn.Module):
    """
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    """

    def __init__(self, config, in_channels, out_channels, bottleneck_channels):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = VitDetLayerNorm(bottleneck_channels)
        self.act1 = ACT2FN[config.hidden_act]

        self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False)
        self.norm2 = VitDetLayerNorm(bottleneck_channels)
        self.act2 = ACT2FN[config.hidden_act]

        self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = VitDetLayerNorm(out_channels)

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class VitDetMlp(nn.Module):
    def __init__(self, config, in_features: int, hidden_features: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x


def window_partition(hidden_state, window_size):
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (padded_height, padded_width): padded height and width before partition
    """
    batch_size, height, width, num_channels = hidden_state.shape

    pad_height = (window_size - height % window_size) % window_size
    pad_width = (window_size - width % window_size) % window_size
    hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))
    padded_height, padded_width = height + pad_height, width + pad_width

    hidden_state = hidden_state.view(
        batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels
    )
    windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows, (padded_height, padded_width)
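

# Illustrative sketch (not part of the upstream module): `window_partition` and
# `window_unpartition` (defined next) form an exact round trip, padding up to a multiple
# of the window size and cropping back (hypothetical sizes):
#
#   x = torch.randn(1, 14, 14, 768)
#   windows, pad_hw = window_partition(x, window_size=4)  # (16, 4, 4, 768), pad_hw == (16, 16)
#   y = window_unpartition(windows, 4, pad_hw, (14, 14))
#   assert torch.equal(x, y)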


def window_unpartition(windows, window_size, pad_height_width, height_width):
    """
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`tuple[int]`):
            Padded height and width (padded_height, padded_width).
        height_width (`tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    """
    padded_height, padded_width = pad_height_width
    height, width = height_width
    batch_size = windows.shape[0] // (padded_height * padded_width // window_size // window_size)
    hidden_state = windows.view(
        batch_size, padded_height // window_size, padded_width // window_size, window_size, window_size, -1
    )
    hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous()
    hidden_state = hidden_state.view(batch_size, padded_height, padded_width, -1)

    # crop the padding back off
    hidden_state = hidden_state[:, :height, :width, :].contiguous()
    return hidden_state


class VitDetLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the original implementation."""

    def __init__(
        self, config: VitDetConfig, drop_path_rate: float = 0, window_size: int = 0, use_residual_block: bool = False
    ) -> None:
        super().__init__()

        dim = config.hidden_size
        image_size = config.image_size
        image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size)
        patch_size = config.patch_size
        patch_size = patch_size if isinstance(patch_size, (list, tuple)) else (patch_size, patch_size)
        input_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = VitDetAttention(
            config, input_size=input_size if window_size == 0 else (window_size, window_size)
        )

        self.drop_path = VitDetDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.mlp = VitDetMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if self.use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = VitDetResBottleneckBlock(
                config=config,
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        hidden_states = hidden_states.permute(0, 2, 3, 1)

        shortcut = hidden_states

        hidden_states = self.norm1(hidden_states)

        # Window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, pad_height_width = window_partition(hidden_states, self.window_size)

        self_attention_outputs = self.attention(
            hidden_states,
            output_attentions=output_attentions,
        )
        hidden_states = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]

        # Reverse window partition
        if self.window_size > 0:
            hidden_states = window_unpartition(hidden_states, self.window_size, pad_height_width, (height, width))

        # first residual connection
        hidden_states = shortcut + self.drop_path(hidden_states)

        hidden_states = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states)))

        hidden_states = hidden_states.permute(0, 3, 1, 2)

        if self.use_residual_block:
            hidden_states = self.residual(hidden_states)

        outputs = (hidden_states,) + outputs

        return outputs


class VitDetEncoder(nn.Module):
    def __init__(self, config: VitDetConfig) -> None:
        super().__init__()
        self.config = config
        depth = config.num_hidden_layers

        # stochastic depth decay rule
        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, depth, device="cpu")]

        layers = []
        for i in range(depth):
            layers.append(
                VitDetLayer(
                    config,
                    drop_path_rate=drop_path_rate[i],
                    window_size=config.window_size if i in config.window_block_indices else 0,
                    use_residual_block=i in config.residual_block_indices,
                )
            )

        self.layer = nn.ModuleList(layers)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


def caffe2_msra_fill(module: nn.Module) -> None:
    """
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0.

    Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html.

    Args:
        module (torch.nn.Module): module to initialize.
    """
    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
    if module.bias is not None:
        nn.init.constant_(module.bias, 0)


@auto_docstring
class VitDetPreTrainedModel(PreTrainedModel):
    config: VitDetConfig
    base_model_prefix = "vitdet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the input in `fp32` and cast it back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, VitDetEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
        elif isinstance(module, VitDetAttention) and self.config.use_relative_position_embeddings:
            module.rel_pos_h.data = nn.init.trunc_normal_(
                module.rel_pos_h.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
            module.rel_pos_w.data = nn.init.trunc_normal_(
                module.rel_pos_w.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
        elif isinstance(module, VitDetResBottleneckBlock):
            for layer in [module.conv1, module.conv2, module.conv3]:
                caffe2_msra_fill(layer)
            for layer in [module.norm1, module.norm2]:
                layer.weight.data.fill_(1.0)
                layer.bias.data.zero_()
            # zero init last norm layer.
            module.norm3.weight.data.zero_()
            module.norm3.bias.data.zero_()


@auto_docstring
class VitDetModel(VitDetPreTrainedModel):
    def __init__(self, config: VitDetConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
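

# Illustrative sketch (not part of the upstream module): VitDetBackbone below exposes the
# encoder's intermediate maps through the standard BackboneMixin interface; which stages
# are returned is driven by the config (hypothetical selection, assuming the usual
# transformers `out_features` backbone attribute and "stem"/"stageN" stage names):
#
#   config = VitDetConfig(out_features=["stage12"])  # last stage only
#   backbone = VitDetBackbone(config)
#   feature_maps = backbone(torch.randn(1, 3, 224, 224)).feature_maps  # tuple of length 1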


@auto_docstring(
    custom_intro="""
    ViTDet backbone, to be used with frameworks like Mask R-CNN.
    """
)
class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)
        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


__all__ = ["VitDetModel", "VitDetPreTrainedModel", "VitDetBackbone"]