
    <h              	          S r SSKrSSKrSSKrSSKJr  SSKJrJ	r	  SSK
r
SSKr
SSK
Jr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJrJrJr  SSKJr  SSK J!r!  \RD                  " \#5      r$\\" SS9 " S S\5      5       5       r%\\" SS9 " S S\5      5       5       r&\\" SS9 " S S\5      5       5       r'\\" SS9 " S S\5      5       5       r(S r)S r* " S S \RV                  5      r, " S! S"\RV                  5      r- " S# S$\RV                  5      r.SJS%\
R^                  S&\0S'\1S(\
R^                  4S) jjr2 " S* S+\RV                  5      r3 " S, S-\RV                  5      r4 " S. S/\RV                  5      r5 " S0 S1\RV                  5      r6 " S2 S3\RV                  5      r7 " S4 S5\RV                  5      r8 " S6 S7\RV                  5      r9 " S8 S9\5      r: " S: S;\RV                  5      r;\ " S< S=\5      5       r<\ " S> S?\<5      5       r=\" S@S9 " SA SB\<5      5       r>\" SCS9 " SD SE\<5      5       r?\" SFS9 " SG SH\<\5      5       r@/ SIQrAg)KzPyTorch Swin Transformer model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel) find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)BackboneMixin   )
SwinConfigzN
    Swin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
SwinEncoderOutput+   a  
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   tupler   r   __static_attributes__r       ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/swin/modeling_swin.pyr   r   +   s}     6:x 1 129=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr)   r   zV
    Swin model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)SwinModelOutputA   a  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
    Average pooling of the last layer hidden-state.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nr   pooler_output.r   r   r   r   )r   r    r!   r"   r#   r   r   r$   r%   r&   r.   r   r'   r   r   r(   r   r)   r*   r,   r,   A   s    	 6:x 1 12915M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr)   r,   z*
    Swin masked image model outputs.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   \S
 5       rSrg)SwinMaskedImageModelingOutputZ   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
    Masked image modeling (MLM) loss.
reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Reconstructed pixel values.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlossreconstruction.r   r   r   c                 P    [         R                  " S[        5        U R                  $ )Nzlogits attribute is deprecated and will be removed in version 5 of Transformers. Please use the reconstruction attribute to retrieve the final output instead.)warningswarnFutureWarningr3   selfs    r*   logits$SwinMaskedImageModelingOutput.logitst   s%    ]	

 """r)   r   )r   r    r!   r"   r#   r2   r   r$   r%   r&   r3   r   r'   r   r   propertyr:   r(   r   r)   r*   r0   r0   Z   s     )-D(5$$
%,26NHU../6=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJ# #r)   r0   z0
    Swin outputs for image classification.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)SwinImageClassifierOutput~   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification (or regression if config.num_labels==1) scores (before SoftMax).
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nr2   r:   .r   r   r   r   )r   r    r!   r"   r#   r2   r   r$   r%   r&   r:   r   r'   r   r   r(   r   r)   r*   r>   r>   ~   s     )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr)   r>   c                     U R                   u  p#pEU R                  X#U-  XU-  X5      n U R                  SSSSSS5      R                  5       R                  SXU5      nU$ )z*
Partitions the given input into windows.
r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowss          r*   window_partitionrQ      so     /<.A.A+J!&&k);8LkM ##Aq!Q15@@BGGKfrsGNr)   c                     U R                   S   nU R                  SX!-  X1-  XU5      n U R                  SSSSSS5      R                  5       R                  SX#U5      n U $ )z7
Merges windows to produce higher resolution features.
rD   r   r   r   rA   rB   rC   rE   )rP   rK   rM   rN   rO   s        r*   window_reverserS      se     ==$Lll2v4e6JKfrsGooaAq!Q/::<AA"fUabGNr)   c            
          ^  \ rS rSrSrSU 4S jjrS\R                  S\S\S\R                  4S jr	  SS	\
\R                     S
\
\R                     S\S\\R                     4S jjrSrU =r$ )SwinEmbeddings   zO
Construct the patch and position embeddings. Optionally, also the mask token.
c                   > [         TU ]  5         [        U5      U l        U R                  R                  nU R                  R
                  U l        U(       a6  [        R                  " [        R                  " SSUR                  5      5      OS U l        UR                  (       a?  [        R                  " [        R                  " SUS-   UR                  5      5      U l        OS U l        [        R                  " UR                  5      U l        [        R"                  " UR$                  5      U l        UR(                  U l        Xl        g )Nr   )super__init__SwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr$   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)r9   rk   use_mask_tokenr\   	__class__s       r*   rY   SwinEmbeddings.__init__   s     3F ;++77//99O]",,u{{1a9I9I'JKcg))')||EKK;QR?TZTdTd4e'fD$'+D$LL!1!12	zz&"<"<= ++r)   
embeddingsrM   rN   returnc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   NrD         ?r   r   rA   bicubicF)sizemodealign_cornersdim)rF   rd   r$   jit
is_tracingrj   r   reshaperH   r   
functionalinterpolaterG   cat)r9   ro   rM   rN   r\   num_positionsclass_pos_embedpatch_pos_embedrx   
new_height	new_widthsqrt_num_positionss               r*   interpolate_pos_encoding'SwinEmbeddings.interpolate_pos_encoding   sS    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr)   pixel_valuesbool_masked_posr   c                    UR                   u  pEpgU R                  U5      u  pU R                  U5      nUR                  5       u  pnUbI  U R                  R                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  b*  U(       a  XR                  XU5      -   nOXR                  -   nU R                  U5      nX4$ )NrD         ?)rF   r[   rf   rt   rb   expand	unsqueezetype_asrd   r   ri   )r9   r   r   r   _rO   rM   rN   ro   output_dimensionsrL   seq_lenmask_tokensmasks                 r*   forwardSwinEmbeddings.forward   s     *6););&(,(=(=l(K%
YYz*
!+!2
Q&//00bIK",,R088ED#sTz2[5GGJ##/''*G*G
\a*bb
'*B*BB
\\*-
,,r)   )rk   ri   rb   rf   r[   r^   rj   rd   )F)NF)r   r    r!   r"   r#   rY   r$   Tensorintr   r   r%   
BoolTensorboolr'   r   r(   __classcell__rm   s   @r*   rU   rU      s    &&D5<< &D &DUX &D]b]i]i &DV 7;).	-u001- "%"2"23- #'	-
 
u||	- -r)   rU   c                      ^  \ rS rSrSrU 4S jrS rS\\R                     S\
\R                  \
\   4   4S jrSrU =r$ )	rZ   i  z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        US   US   -  US   US   -  4U l        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)rX   rY   
image_sizerj   rO   ra   
isinstancecollectionsabcIterabler\   r]   r   Conv2d
projection)r9   rk   r   rj   rO   hidden_sizer\   rm   s          r*   rY   SwinPatchEmbeddings.__init__  s    !'!2!2F4E4EJ$*$7$79I9Ik#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY))L:ir)   c                 f   X0R                   S   -  S:w  aB  SU R                   S   X0R                   S   -  -
  4n[        R                  R                  X5      nX R                   S   -  S:w  aD  SSSU R                   S   X R                   S   -  -
  4n[        R                  R                  X5      nU$ )Nr   r   )rj   r   r|   pad)r9   r   rM   rN   
pad_valuess        r*   	maybe_padSwinPatchEmbeddings.maybe_pad!  s    ??1%%*T__Q/%//!:L2LLMJ==,,\FLOOA&&!+Q4??1#5QRAS8S#STJ==,,\FLr)   r   rp   c                     UR                   u  p#pEU R                  XU5      nU R                  U5      nUR                   u    p$nXE4nUR                  S5      R	                  SS5      nXg4$ )NrA   r   )rF   r   r   flatten	transpose)r9   r   r   rO   rM   rN   ro   r   s           r*   r   SwinPatchEmbeddings.forward*  sp    )5););&~~lEB__\2
(..1e#O''*44Q:
,,r)   )r]   r   rO   r\   rj   r   )r   r    r!   r"   r#   rY   r   r   r$   r%   r'   r   r   r   r(   r   r   s   @r*   rZ   rZ     sK    j	-HU->->$? 	-E%,,X]^aXbJbDc 	- 	-r)   rZ   c            	          ^  \ rS rSrSr\R                  4S\\   S\S\R                  SS4U 4S jjjr
S	 rS
\R                  S\\\4   S\R                  4S jrSrU =r$ )SwinPatchMergingi6  a  
Patch Merging Layer.

Args:
    input_resolution (`tuple[int]`):
        Resolution of input feature.
    dim (`int`):
        Number of input channels.
    norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
        Normalization layer class.
input_resolutionrx   
norm_layerrp   Nc                    > [         TU ]  5         Xl        X l        [        R
                  " SU-  SU-  SS9U l        U" SU-  5      U l        g )NrB   rA   Fbias)rX   rY   r   rx   r   Linear	reductionrf   )r9   r   rx   r   rm   s       r*   rY   SwinPatchMerging.__init__C  sE     01s7AG%@q3w'	r)   c                     US-  S:H  =(       d    US-  S:H  nU(       a-  SSSUS-  SUS-  4n[         R                  R                  X5      nU$ )NrA   r   r   )r   r|   r   )r9   rJ   rM   rN   
should_padr   s         r*   r   SwinPatchMerging.maybe_padJ  sS    qjAo:519>
Q519a!<JMM--mHMr)   rJ   input_dimensionsc                    Uu  p4UR                   u  pVnUR                  XSXG5      nU R                  XU5      nUS S 2SS S2SS S2S S 24   nUS S 2SS S2SS S2S S 24   n	US S 2SS S2SS S2S S 24   n
US S 2SS S2SS S2S S 24   n[        R                  " XX/S5      nUR                  USSU-  5      nU R                  U5      nU R                  U5      nU$ )Nr   rA   r   rD   rB   )rF   rG   r   r$   r~   rf   r   )r9   rJ   r   rM   rN   rL   rx   rO   input_feature_0input_feature_1input_feature_2input_feature_3s               r*   r   SwinPatchMerging.forwardR  s   ((5(;(;%
%**:uS}eD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?_"fhjk%**:r1|;KL		-0}5r)   )rx   r   rf   r   )r   r    r!   r"   r#   r   re   r'   r   ModulerY   r   r$   r   r   r(   r   r   s   @r*   r   r   6  s|    
 XZWcWc (s (# (299 (hl ( (U\\ U3PS8_ Y^YeYe  r)   r   input	drop_probtrainingrp   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   dtypedevice)rF   ndimr$   randr   r   floor_div)r   r   r   	keep_probrF   random_tensoroutputs          r*   	drop_pathr   m  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr)   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )SwinDropPathi  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rp   c                 .   > [         TU ]  5         Xl        g N)rX   rY   r   )r9   r   rm   s     r*   rY   SwinDropPath.__init__  s    "r)   r   c                 B    [        XR                  U R                  5      $ r   )r   r   r   r9   r   s     r*   r   SwinDropPath.forward  s    FFr)   c                      SU R                    3$ )Nzp=r   r8   s    r*   
extra_reprSwinDropPath.extra_repr  s    DNN#$$r)   r   r   )r   r    r!   r"   r#   r   floatrY   r$   r   r   strr   r(   r   r   s   @r*   r   r     sQ    b#(5/ #T # #GU\\ Gell G%C % %r)   r   c                      ^  \ rS rSrU 4S jr   S
S\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	rU =r$ )SwinSelfAttentioni  c                 
  > [         TU ]  5         X#-  S:w  a  [        SU SU S35      eX0l        [	        X#-  5      U l        U R                  U R
                  -  U l        [        U[        R                  R                  5      (       a  UOXD4U l        [        R                  " [        R                  " SU R                  S   -  S-
  SU R                  S   -  S-
  -  U5      5      U l        [        R"                  " U R                  S   5      n[        R"                  " U R                  S   5      n[        R$                  " ['        XV/SS95      n[        R(                  " US5      nUS S 2S S 2S 4   US S 2S S S 24   -
  n	U	R+                  SSS5      R-                  5       n	U	S S 2S S 2S4==   U R                  S   S-
  -  ss'   U	S S 2S S 2S4==   U R                  S   S-
  -  ss'   U	S S 2S S 2S4==   SU R                  S   -  S-
  -  ss'   U	R/                  S	5      n
U R1                  S
U
5        [        R2                  " U R                  U R                  UR4                  S9U l        [        R2                  " U R                  U R                  UR4                  S9U l        [        R2                  " U R                  U R                  UR4                  S9U l        [        R<                  " UR>                  5      U l         g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()rA   r   ij)indexingrD   relative_position_indexr   )!rX   rY   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   rK   r   r_   r$   r`   relative_position_bias_tablearangestackr   r   rH   rI   sumregister_bufferr   qkv_biasquerykeyvaluerg   attention_probs_dropout_probri   )r9   rk   rx   	num_headsrK   coords_hcoords_wcoordscoords_flattenrelative_coordsr   rm   s              r*   rY   SwinSelfAttention.__init__  s   ?a#C5(^_h^iijk  $- #&s#7 !558P8PP%k;??3K3KLLKS^Rl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
)
 << 0 0 34<< 0 0 34Xx&:TJKvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"968OPYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr)   r   attention_mask	head_maskoutput_attentionsrp   c                    UR                   u  pVnXVSU R                  4nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      n[        R                  " XR	                  SS5      5      nU[        R                  " U R                  5      -  nU R                  U R                  R                  S5         nUR                  U R                  S   U R                  S   -  U R                  S   U R                  S   -  S5      nUR                  SSS5      R                  5       nXR!                  S5      -   nUbm  UR                   S   nUR                  X^-  XR"                  Xf5      nXR!                  S5      R!                  S5      -   nUR                  SU R"                  Xf5      n[$        R&                  R)                  USS9nU R+                  U5      nUb  X-  n[        R                  " X5      nUR                  SSSS5      R                  5       nUR-                  5       S S U R.                  4-   nUR                  U5      nU(       a  UU4nU$ U4nU$ )NrD   r   rA   r   rw   r   )rF   r   r   rG   r   r   r   r$   matmulmathsqrtr   r   rK   rH   rI   r   r   r   r|   softmaxri   rt   r   )r9   r   r   r  r  rL   rx   rO   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r*   r   SwinSelfAttention.forward  s    )6(;(;%
"T-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.N.Nq.QQ%'--a0J/44(*6N6NPS   02J2J12M2W2WXY2ZZ/44R9Q9QSV\ --//0@b/I ,,7  -9O_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r)   )	r   r   ri   r   r   r   r   r   rK   NNF)r   r    r!   r"   rY   r$   r   r   r%   r   r'   r   r(   r   r   s   @r*   r   r     sv    #GP 7;15,16||6 !!2!236 E--.	6
 $D>6 
u||	6 6r)   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )SwinSelfOutputi  c                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g r   )rX   rY   r   r   denserg   r   ri   r9   rk   rx   rm   s      r*   rY   SwinSelfOutput.__init__  s4    YYs(
zz&"E"EFr)   r   input_tensorrp   c                 J    U R                  U5      nU R                  U5      nU$ r   r  ri   )r9   r   r  s      r*   r   SwinSelfOutput.forward  s$    

=1]3r)   r  
r   r    r!   r"   rY   r$   r   r   r(   r   r   s   @r*   r  r    s7    G
U\\  RWR^R^  r)   r  c                      ^  \ rS rSrU 4S jrS r   SS\R                  S\\R                     S\\R                     S\\
   S\\R                     4
S	 jjrS
rU =r$ )SwinAttentioni  c                    > [         TU ]  5         [        XX45      U l        [	        X5      U l        [        5       U l        g r   )rX   rY   r   r9   r  r   setpruned_heads)r9   rk   rx   r   rK   rm   s        r*   rY   SwinAttention.__init__  s2    %f9J	$V1Er)   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   rw   )lenr   r9   r   r   r%  r   r   r   r   r   r  r   union)r9   headsindexs      r*   prune_headsSwinAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r)   r   r   r  r  rp   c                 f    U R                  XX45      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )r9   r   )r9   r   r   r  r  self_outputsattention_outputr  s           r*   r   SwinAttention.forward  sB     yy	];;|AF#%QR(88r)   )r   r%  r9   r  )r   r    r!   r"   rY   r,  r$   r   r   r%   r   r'   r   r(   r   r   s   @r*   r"  r"    sy    ";* 7;15,1
||
 !!2!23
 E--.	

 $D>
 
u||	
 
r)   r"  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )SwinIntermediatei"  c                   > [         TU ]  5         [        R                  " U[	        UR
                  U-  5      5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rX   rY   r   r   r   	mlp_ratior  r   
hidden_actr   r   intermediate_act_fnr  s      r*   rY   SwinIntermediate.__init__#  sd    YYsC(8(83(>$?@
f''--'-f.?.?'@D$'-'8'8D$r)   r   rp   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r7  r   s     r*   r   SwinIntermediate.forward+  s&    

=100?r)   r:  r   r   s   @r*   r3  r3  "  s(    9U\\ ell  r)   r3  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
SwinOutputi1  c                    > [         TU ]  5         [        R                  " [	        UR
                  U-  5      U5      U l        [        R                  " UR                  5      U l	        g r   )
rX   rY   r   r   r   r5  r  rg   rh   ri   r  s      r*   rY   SwinOutput.__init__2  sF    YYs6#3#3c#9:C@
zz&"<"<=r)   r   rp   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r   s     r*   r   SwinOutput.forward7  s$    

=1]3r)   r  r   r   s   @r*   r=  r=  1  s(    >
U\\ ell  r)   r=  c                      ^  \ rS rSrSU 4S jjrS rS rS r   SS\R                  S\
\\4   S\\R                     S	\\   S
\\   S\
\R                  \R                  4   4S jjrSrU =r$ )	SwinLayeri=  c                   > [         TU ]  5         UR                  U l        X`l        UR                  U l        X0l        [        R                  " X!R                  S9U l	        [        XX@R                  S9U l        US:  a  [        U5      O[        R                  " 5       U l        [        R                  " X!R                  S9U l        [!        X5      U l        [%        X5      U l        g )Neps)rK   r   )rX   rY   chunk_size_feed_forward
shift_sizerK   r   r   re   layer_norm_epslayernorm_beforer"  	attentionr   Identityr   layernorm_afterr3  intermediater=  r   )r9   rk   rx   r   r   drop_path_raterH  rm   s          r*   rY   SwinLayer.__init__>  s    '-'E'E$$!-- 0 "S6K6K L&vIK[K[\9G#9Mn5SUS^S^S`!||C5J5JK,V9 -r)   c                    [        U5      U R                  ::  an  [        S5      U l        [        R
                  R                  5       (       a*  [        R                   " [        R                  " U5      5      O
[        U5      U l        g g Nr   )minrK   r   rH  r$   ry   rz   tensor)r9   r   s     r*   set_shift_and_window_size#SwinLayer.set_shift_and_window_sizeK  s_     D$4$44'lDO=BYY=Q=Q=S=S		%,,'789Y\]mYn  5r)   c           	         U R                   S:  Gae  [        R                  " SXS4X4S9n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S 5      4n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S 5      4nSnU H  n	U H  n
XS S 2XS S 24'   US-  nM     M     [        XPR                  5      nUR                  SU R                  U R                  -  5      nUR                  S5      UR                  S5      -
  nUR                  US:g  S5      R                  US:H  S5      nU$ S nU$ )Nr   r   r   rD   rA   g      Yr   )	rH  r$   r`   slicerK   rQ   rG   r   masked_fill)r9   rM   rN   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r*   get_attn_maskSwinLayer.get_attn_maskS  sy   ??Q{{Ava#8UHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E -#/K@EQ1<=QJE $0 !.
 ,H6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir)   c                     U R                   X0R                   -  -
  U R                   -  nU R                   X R                   -  -
  U R                   -  nSSSUSU4n[        R                  R                  X5      nX4$ rR  )rK   r   r|   r   )r9   r   rM   rN   	pad_right
pad_bottomr   s          r*   r   SwinLayer.maybe_pado  sy    %%0@0@(@@DDTDTT	&&2B2B)BBdFVFVV
Ay!Z8
))-D((r)   r   r   r  r  always_partitionrp   c                    U(       d  U R                  U5        O Uu  pgUR                  5       u  pn
UnU R                  U5      nUR                  XXz5      nU R	                  XU5      u  pUR
                  u  ppU R                  S:  a.  [        R                  " XR                  * U R                  * 4SS9nOUn[        XR                  5      nUR                  SU R                  U R                  -  U
5      nU R                  XUR                  UR                  S9nU R                  UUX4S9nUS   nUR                  SU R                  U R                  U
5      n[        UU R                  X5      nU R                  S:  a-  [        R                  " UU R                  U R                  4SS9nOUnUS   S:  =(       d    US   S:  nU(       a  US S 2S U2S U2S S 24   R!                  5       nUR                  XU-  U
5      nXR#                  U5      -   nU R%                  U5      nU R'                  U5      nXR)                  U5      -   nU(       a	  UUS	   4nU$ U4nU$ )
Nr   )r   rA   )shiftsdimsrD   r   )r  r   rC   r   )rU  rt   rJ  rG   r   rF   rH  r$   rollrQ   rK   rb  r   r   rK  rS   rI   r   rM  rN  r   )r9   r   r   r  r  rh  rM   rN   rL   r   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsra  attention_outputsr0  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                            r*   r   SwinLayer.forwardv  s     **+;<("/"4"4"6
x --m<%**:uO %)NN=%$P!&3&9&9#y??Q$)JJ}FVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&)<)<EZEaEa ' 
	 !NN!9i + 
 -Q/,11"d6F6FHXHXZbc():D<L<Ljd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:~xX >>2C#DD++M:((6${{<'@@@Q'8';< YeWfr)   )
rK  rG  r   r   rN  rM  rJ  r   rH  rK   )r   r   NFF)r   r    r!   r"   rY   rU  rb  r   r$   r   r'   r   r   r%   r   r   r(   r   r   s   @r*   rC  rC  =  s    .8) 26,1+0A||A  S/A E--.	A
 $D>A #4.A 
u||U\\)	*A Ar)   rC  c                      ^  \ rS rSrU 4S jr   SS\R                  S\\\4   S\	\R                     S\	\   S\	\   S\\R                     4S	 jjrS
rU =r$ )	SwinStagei  c                 P  > [         T	U ]  5         Xl        X l        [        R
                  " [        U5       Vs/ sH+  n[        UUUUXh   US-  S:X  a  SOUR                  S-  S9PM-     sn5      U l	        Ub  U" X2[        R                  S9U l        OS U l        SU l        g s  snf )NrA   r   )rk   rx   r   r   rO  rH  )rx   r   F)rX   rY   rk   rx   r   
ModuleListrangerC  rK   blocksre   
downsamplepointing)
r9   rk   rx   r   depthr   r   r  irm   s
            r*   rY   SwinStage.__init__  s    mm u
 &A !%5'#,<%&UaZqf6H6HA6M &

 !()9r||\DO"DO'
s   1B#r   r   r  r  rh  rp   c                    Uu  pg[        U R                  5       H  u  pUb  X8   OS n
U	" XXU5      nUS   nM     UnU R                  b%  US-   S-  US-   S-  pXgX4nU R                  X5      nOXgXg4nXU4nU(       a  UWSS  -  nU$ )Nr   r   rA   )	enumerater  r  )r9   r   r   r  r  rh  rM   rN   r  layer_modulelayer_head_maskrx  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledr   stage_outputss                    r*   r   SwinStage.forward  s     )(5OA.7.CilO(UeM *!,M  6 -:)??&5;aZA4EPQ	VWGW 1!'0B V OO,M`M!' >&K\]]12..Mr)   )r  rk   rx   r  r  rz  )r   r    r!   r"   rY   r$   r   r'   r   r   r%   r   r   r(   r   r   s   @r*   r|  r|    s    < 26,1+0||  S/ E--.	
 $D> #4. 
u||	 r)   r|  c                      ^  \ rS rSrU 4S jr      SS\R                  S\\\4   S\	\R                     S\	\   S\	\   S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SwinEncoderi  c                   > [         TU ]  5         [        UR                  5      U l        Xl        [        R                  " SUR                  [        UR                  5      SS9 Vs/ sH  o3R                  5       PM     nn[        R                  " [        U R                  5       Vs/ sH  n[        U[        UR                   SU-  -  5      US   SU-  -  US   SU-  -  4UR                  U   UR"                  U   U[        UR                  S U 5      [        UR                  S US-    5       XPR                  S-
  :  a  [$        OS S9PM     sn5      U l        SU l        g s  snf s  snf )Nr   cpu)r   rA   r   )rk   rx   r   r  r   r   r  F)rX   rY   r(  depths
num_layersrk   r$   linspacerO  r   itemr   r~  r  r|  r   ra   r   r   layersgradient_checkpointing)r9   rk   r]   xdpri_layerrm   s         r*   rY   SwinEncoder.__init__  sQ   fmm,!&63H3H#fmmJ\ej!kl!kAvvx!klmm  %T__5  6G !F,,q'z9:&/lq'z&BIaLUVX_U_D`%a --0$..w7!#fmmHW&=">V]]S`U\_`U`EaAbc4;ooPQ>Q4Q/X\  6
 ',#! ms   &E$'B)E)r   r   r  r  output_hidden_states(output_hidden_states_before_downsamplingrh  return_dictrp   c	                 2   U(       a  SOS n	U(       a  SOS n
U(       a  SOS nU(       aB  UR                   u  pnUR                  " U/UQUP76 nUR                  SSSS5      nX4-  n	X4-  n
[        U R                  5       H  u  nnUb  UU   OS nU" XUXG5      nUS   nUS   nUS   nUS   US   4nU(       aS  U(       aL  UR                   u  pnUR                  " U/US   US   4QUP76 nUR                  SSSS5      nU	U4-  n	X4-  n
OPU(       aI  U(       dB  UR                   u  pnUR                  " U/UQUP76 nUR                  SSSS5      nX4-  n	X4-  n
U(       d  M  UUSS  -  nM     U(       d  [        S XU4 5       5      $ [        UU	UU
S	9$ )
Nr   r   r   r   rA   r  rD   c              3   ,   #    U H  oc  M  Uv   M     g 7fr   r   ).0vs     r*   	<genexpr>&SwinEncoder.forward.<locals>.<genexpr>G  s     m$[q$[s   	)r   r   r   r   )rF   rG   rH   r  r  r'   r   )r9   r   r   r  r  r  r  rh  r  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsrL   r   r   reshaped_hidden_stater  r  r  rx  r  r   s                         r*   r   SwinEncoder.forward  s$    #7BD+?RT"$5b4)6)<)<&J;$1$6$6z$bDT$bVa$b!$9$A$A!Q1$M!!11&*BB&(5OA|.7.CilO(BSM *!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
{ )J(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*.FF*%.V-:-@-@*
{(5(:(::(fHX(fZe(f%(=(E(EaAq(Q%!%55!*.FF*  #}QR'88#A  6D m]GZ$[mmm ++*#=	
 	
r)   )rk   r  r  r  )NFFFFT)r   r    r!   r"   rY   r$   r   r'   r   r   r%   r   r   r   r   r(   r   r   s   @r*   r  r    s    ,4 26,1/4CH+0&*A
||A
  S/A
 E--.	A

 $D>A
 'tnA
 3;4.A
 #4.A
 d^A
 
u''	(A
 A
r)   r  c                   8    \ rS rSr% \\S'   SrSrSrS/r	S r
Srg	)
SwinPreTrainedModeliQ  rk   swinr   Tr|  c                 p   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        g[        U[        5      (       ad  UR                  b$  UR                  R
                  R                  5         UR                  b%  UR                  R
                  R                  5         gg[        U[         5      (       a%  UR"                  R
                  R                  5         gg)zInitialize the weightsr   )meanstdNr   )r   r   r   r   weightdatanormal_rk   initializer_ranger   zero_re   fill_rU   rb   rd   r   r   )r9   modules     r*   _init_weights!SwinPreTrainedModel._init_weightsY  s.   fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S)//  ,!!&&,,.))5**//557 6 122//44::< 3r)   r   N)r   r    r!   r"   r   r&   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr  r(   r   r)   r*   r  r  Q  s&    $O&*#$=r)   r  c                      ^  \ rS rSrSU 4S jjrS rS r\       SS\\	R                     S\\	R                     S\\	R                     S\\   S	\\   S
\S\\   S\\\4   4S jj5       rSrU =r$ )	SwinModelim  c                   > [         TU ]  U5        Xl        [        UR                  5      U l        [        UR                  SU R
                  S-
  -  -  5      U l        [        XS9U l
        [        XR                  R                  5      U l        [        R                  " U R                  UR                   S9U l        U(       a  [        R$                  " S5      OSU l        U R)                  5         g)z
add_pooling_layer (`bool`, *optional*, defaults to `True`):
    Whether or not to apply pooling layer.
use_mask_token (`bool`, *optional*, defaults to `False`):
    Whether or not to create and apply mask tokens in the embedding layer.
rA   r   )rl   rE  N)rX   rY   rk   r(  r  r  r   ra   num_featuresrU   ro   r  r^   encoderr   re   rI  	layernormAdaptiveAvgPool1dpooler	post_init)r9   rk   add_pooling_layerrl   rm   s       r*   rY   SwinModel.__init__o  s     	 fmm, 0 0119L3M MN(O"6??+E+EFd&7&7V=R=RS1Bb**1- 	r)   c                 .    U R                   R                  $ r   ro   r[   r8   s    r*   get_input_embeddingsSwinModel.get_input_embeddings      ///r)   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  layerrK  r,  )r9   heads_to_pruner  r*  s       r*   _prune_headsSwinModel._prune_heads  s<    
 +002LELLu%//;;EB 3r)   r   r   r  r  r  r   r  rp   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U[        U R                   R                  5      5      nU R                  XUS9u  pU R                  UU	UUUUS9n
U
S   nU R                  U5      nSnU R                  b8  U R                  UR                  SS5      5      n[        R                  " US5      nU(       d  X4U
SS -   nU$ [        UUU
R                   U
R"                  U
R$                  S9$ )	z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
Nz You have to specify pixel_values)r   r   )r  r  r  r  r   r   rA   )r   r.   r   r   r   )rk   r  r  use_return_dictr   get_head_maskr(  r  ro   r  r  r  r   r$   r   r,   r   r   r   )r9   r   r   r  r  r  r   r  embedding_outputr   encoder_outputssequence_outputpooled_outputr   s                 r*   r   SwinModel.forward  si    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@ &&y#dkk6H6H2IJ	-1__Tl .= .
* ,,/!5# ' 
 *!,..9;;" KK(A(A!Q(GHM!MM-;M%58KKFM-')77&11#2#I#I
 	
r)   )rk   ro   r  r  r  r  r  )TFNNNNNFN)r   r    r!   r"   rY   r  r  r   r   r$   r%   r   r   r   r'   r,   r   r(   r   r   s   @r*   r  r  m  s    *0C  596:15,0/3).&*>
u001>
 "%"2"23>
 E--.	>

 $D>>
 'tn>
 #'>
 d^>
 
uo%	&>
 >
r)   r  ad  
    Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                      ^  \ rS rSrU 4S jr\       SS\\R                     S\\R                     S\\R                     S\\
   S\\
   S\
S	\\
   S
\\\4   4S jj5       rSrU =r$ )SwinForMaskedImageModelingi  c                   > [         TU ]  U5        [        USSS9U l        [	        UR
                  SUR                  S-
  -  -  5      n[        R                  " [        R                  " X!R                  S-  UR                  -  SS9[        R                  " UR                  5      5      U l        U R                  5         g )NFT)r  rl   rA   r   )in_channelsout_channelsr   )rX   rY   r  r  r   ra   r  r   
Sequentialr   encoder_striderO   PixelShuffledecoderr  )r9   rk   r  rm   s      r*   rY   #SwinForMaskedImageModeling.__init__  s     fdS	6++aF4E4E4I.JJK}}II(7L7La7ORXReRe7est OOF112	
 	r)   r   r   r  r  r  r   r  rp   c           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9nUS   n	U	R                  SS5      n	U	R                  u  pn[
        R                  " US-  5      =pU	R                  XX5      n	U R                  U	5      nSnUGb  U R                   R                  U R                   R                  -  nUR                  SUU5      nUR                  U R                   R                  S5      R                  U R                   R                  S5      R                  S5      R                  5       n[        R                  R!                  XSS	9nUU-  R#                  5       UR#                  5       S
-   -  U R                   R$                  -  nU(       d  U4USS -   nUb  U4U-   $ U$ ['        UUUR(                  UR*                  UR,                  S9$ )a  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Examples:
```python
>>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
>>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
>>> list(reconstructed_pixel_values.shape)
[1, 3, 192, 192]
```N)r   r  r  r  r   r  r   r   rA   rr   rD   none)r   gh㈵>)r2   r3   r   r   r   )rk   r  r  r   rF   r  floorr{   r  r   rj   repeat_interleaver   rI   r   r|   l1_lossr   rO   r0   r   r   r   )r9   r   r   r  r  r  r   r  r  r  rL   rO   sequence_lengthrM   rN   reconstructed_pixel_valuesmasked_im_lossrt   r   reconstruction_lossr   s                        r*   r   "SwinForMaskedImageModeling.forward  s   L &1%<k$++B]B]))+/!5%=#  
 "!*)33Aq94C4I4I1
/OS$899)11*FZ &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7lr"7"s1D8==?488:PTCTUX\XcXcXpXppN02WQR[@F3A3M^%.YSYY,5!//))#*#A#A
 	
r)   )r  r  r  )r   r    r!   r"   rY   r   r   r$   r%   r   r   r   r'   r0   r   r(   r   r   s   @r*   r  r    s       596:15,0/3).&*R
u001R
 "%"2"23R
 E--.	R

 $D>R
 'tnR
 #'R
 d^R
 
u33	4R
 R
r)   r  a  
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune Swin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                      ^  \ rS rSrU 4S jr\       SS\\R                     S\\R                     S\\R                     S\\
   S\\
   S\
S	\\
   S
\\\4   4S jj5       rSrU =r$ )SwinForImageClassificationiD  c                 D  > [         TU ]  U5        UR                  U l        [        U5      U l        UR                  S:  a5  [
        R                  " U R                  R                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g rR  )rX   rY   
num_labelsr  r  r   r   r  rL  
classifierr  )r9   rk   rm   s     r*   rY   #SwinForImageClassification.__init__S  sx      ++f%	 EKDUDUXYDYBIIdii,,f.?.?@_a_j_j_l 	
 	r)   r   r  labelsr  r  r   r  rp   c           	      V   Ub  UOU R                   R                  nU R                  UUUUUUS9nUS   n	U R                  U	5      n
SnUb  U R	                  XXR                   S9nU(       d  U
4USS -   nUb  U4U-   $ U$ [        UU
UR                  UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N)r  r  r  r   r  r   )r:   r  pooled_logitsrk   rA   )r2   r:   r   r   r   )	rk   r  r  r  loss_functionr>   r   r   r   )r9   r   r  r  r  r  r   r  r  r  r:   r2   r   s                r*   r   "SwinForImageClassification.forwarda  s    " &1%<k$++B]B]))/!5%=#  
  
/%%VRXalal%mDY,F)-)9TGf$EvE(!//))#*#A#A
 	
r)   )r  r  r  r  )r   r    r!   r"   rY   r   r   r$   r%   
LongTensorr   r   r'   r>   r   r(   r   r   s   @r*   r  r  D  s      5915-1,0/3).&*-
u001-
 E--.-
 ))*	-

 $D>-
 'tn-
 #'-
 d^-
 
u//	0-
 -
r)   r  zM
    Swin backbone, to be used with frameworks like DETR and MaskFormer.
    c                      ^  \ rS rSrS\4U 4S jjrS r   SS\R                  S\	\
   S\	\
   S\	\
   S	\4
S
 jjrSrU =r$ )SwinBackbonei  rk   c           	      D  > [         TU ]  U5        [         TU ]	  U5        UR                  /[	        [        UR                  5      5       Vs/ sH  n[        UR                  SU-  -  5      PM      sn-   U l        [        U5      U l
        [        XR                  R                  5      U l        0 n[        U R                  U R                   5       H  u  pE["        R$                  " U5      X4'   M     ["        R&                  " U5      U l        U R+                  5         g s  snf )NrA   )rX   rY   _init_backbonera   r  r(  r  r   r  rU   ro   r  r^   r  zip_out_featuresrm  r   re   
ModuleDicthidden_states_normsr  )r9   rk   r  r  stagerO   rm   s         r*   rY   SwinBackbone.__init__  s     v&#--.X]^abhbobo^pXq1rXqST#f6F6FA6M2NXq1rr(0"6??+E+EF !#&t'9'94==#IE)+l)C& $J#%==1D#E  	 2ss   $Dc                 .    U R                   R                  $ r   r  r8   s    r*   r  !SwinBackbone.get_input_embeddings  r  r)   r   r  r  r  rp   c                 0   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      u  pVU R                  UUSUSSSSS9nUR                  nSn	[        U R                  U5       H  u  pXR                  ;   d  M  UR                  u  ppUR                  SSSS5      R                  5       nUR                  XU-  U5      nU R                  U
   " U5      nUR                  XX5      nUR                  SSSS5      R                  5       nX4-  n	M     U(       d  U	4nU(       a  UUR                  4-  nU$ [!        U	U(       a  UR                  OSUR"                  S	9$ )
a  
Returns:

Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
>>> model = AutoBackbone.from_pretrained(
...     "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 7, 7]
```NT)r  r  r  r  rh  r  r   r   rA   r   r   )feature_mapsr   r   )rk   r  r  r  ro   r  r   r  stage_namesout_featuresrF   rH   rI   rG   r  r   r
   r   )r9   r   r  r  r  r  r   r  r   r
  r  hidden_staterL   rO   rM   rN   r   s                    r*   r   SwinBackbone.forward  s   @ &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq-1__\-J*,,/!%59!  	
  66#&t'7'7#GE))):F:L:L7
&+33Aq!Q?JJL+00e^\Z#77>|L+00UY+33Aq!Q?JJL/ $H "_F#70022M%3G'//T))
 	
r)   )ro   r  r  r  )NNN)r   r    r!   r"   r   rY   r  r$   r   r   r   r
   r   r(   r   r   s   @r*   r  r    sp    z "0 04,0&*J
llJ
 'tnJ
 $D>	J

 d^J
 
J
 J
r)   r  )r  r  r  r  r  )r   F)Br#   collections.abcr   r  r5   dataclassesr   typingr   r   r$   torch.utils.checkpointr   activationsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   utils.backbone_utilsr   configuration_swinr   
get_loggerr   loggerr   r,   r0   r>   rQ   rS   r   rU   rZ   r   r   r   r   r   r   r   r  r"  r3  r=  rC  r|  r  r  r  r  r  r  __all__r   r)   r*   <module>r     s   &    ! "    ! 9 . - [ [ D D 1 * 
		H	% 
K K K  
Kk K K& 
#K # #< 
K K K*	Y-RYY Y-x(-")) (-V3ryy 3nU\\ e T V[VbVb *%299 %\		 \~
RYY 
#BII #Lryy 	 	z		 zz9* 9xX
")) X
v =/ = =6 `
# `
 `
F 	d
!4 d
d
N =
!4 =
=
@ 
_
& _

_
Dr)   