
"""PyTorch FocalNet model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.backbone_utils import BackboneMixin
from .configuration_focalnet import FocalNetConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet encoder's outputs, with potential hidden states.
    """
)
class FocalNetEncoderOutput(ModelOutput):
    r"""
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet model's outputs that also contains a pooling of the last hidden states.
    """
)
class FocalNetModelOutput(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet masked image model outputs.
    """
)
class FocalNetMaskedImageModelingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    reconstruction: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet outputs for image classification.
    """
)
class FocalNetImageClassifierOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


class FocalNetEmbeddings(nn.Module):
    """
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        self.patch_embeddings = FocalNetPatchEmbeddings(
            config=config,
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.embed_dim,
            use_conv_embed=config.use_conv_embed,
            is_stem=True,
        )
        self.patch_grid = self.patch_embeddings.grid_size
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        self.norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> tuple[torch.Tensor]:
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        embeddings = self.dropout(embeddings)
        return embeddings, output_dimensions


class FocalNetPatchEmbeddings(nn.Module):
    def __init__(
        self,
        config,
        image_size,
        patch_size,
        num_channels,
        embed_dim,
        add_norm=False,
        use_conv_embed=False,
        is_stem=False,
    ):
        super().__init__()
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        if use_conv_embed:
            # if we choose to use conv embedding, then we treat the stem and non-stem differently
            if is_stem:
                kernel_size = 7
                padding = 2
                stride = 4
            else:
                kernel_size = 3
                padding = 1
                stride = 2
            self.projection = nn.Conv2d(
                num_channels, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
            )
        else:
            self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

        if add_norm:
            self.norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        else:
            self.norm = None

    def maybe_pad(self, pixel_values, height, width):
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor, tuple[int]]:
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # pad the input to be divisible by self.patch_size, if needed
        pixel_values = self.maybe_pad(pixel_values, height, width)
        embeddings = self.projection(pixel_values)
        _, _, height, width = embeddings.shape
        output_dimensions = (height, width)
        embeddings = embeddings.flatten(2).transpose(1, 2)

        if self.norm is not None:
            embeddings = self.norm(embeddings)

        return embeddings, output_dimensions


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output
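
# Numeric sketch of the stochastic-depth rule implemented above (an illustrative
# note, not part of the upstream module; it assumes `training=True`, since
# `drop_path` is the identity at inference). With drop_prob=0.1, each sample's
# residual branch is zeroed with probability 0.1, and surviving branches are
# rescaled by 1 / 0.9 ~= 1.111 so the output matches the input in expectation:
#
#     x = torch.ones(4, 2, 3)
#     out = drop_path(x, drop_prob=0.1, training=True)
#     # each of the 4 samples is now either all zeros or all ~1.111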

class FocalNetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class FocalNetModulation(nn.Module):
    def __init__(self, config, index, dim, focal_factor=2, bias=True, projection_dropout=0.0):
        super().__init__()

        self.dim = dim
        self.focal_window = config.focal_windows[index]
        self.focal_level = config.focal_levels[index]
        self.focal_factor = focal_factor
        self.use_post_layernorm_in_modulation = config.use_post_layernorm_in_modulation
        self.normalize_modulator = config.normalize_modulator

        self.projection_in = nn.Linear(dim, 2 * dim + (self.focal_level + 1), bias=bias)
        self.projection_context = nn.Conv2d(dim, dim, kernel_size=1, stride=1, bias=bias)

        self.activation = nn.GELU()
        self.projection_out = nn.Linear(dim, dim)
        self.projection_dropout = nn.Dropout(projection_dropout)
        self.focal_layers = nn.ModuleList()

        self.kernel_sizes = []
        for k in range(self.focal_level):
            kernel_size = self.focal_factor * k + self.focal_window
            self.focal_layers.append(
                nn.Sequential(
                    nn.Conv2d(
                        dim, dim, kernel_size=kernel_size, stride=1, groups=dim, padding=kernel_size // 2, bias=False
                    ),
                    nn.GELU(),
                )
            )
            self.kernel_sizes.append(kernel_size)
        if self.use_post_layernorm_in_modulation:
            self.layernorm = nn.LayerNorm(dim, eps=config.layer_norm_eps)

    def forward(self, hidden_state):
        """
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
        """
        num_channels = hidden_state.shape[-1]

        # pre linear projection
        x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
        q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)

        # context aggregation
        ctx_all = 0
        for level in range(self.focal_level):
            ctx = self.focal_layers[level](ctx)
            ctx_all = ctx_all + ctx * gates[:, level : level + 1]
        ctx_global = self.activation(ctx.mean(dim=2, keepdim=True).mean(dim=3, keepdim=True))
        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level :]

        # normalize context
        if self.normalize_modulator:
            ctx_all = ctx_all / (self.focal_level + 1)

        # focal modulation
        self.modulator = self.projection_context(ctx_all)
        x_out = q * self.modulator
        x_out = x_out.permute(0, 2, 3, 1).contiguous()
        if self.use_post_layernorm_in_modulation:
            x_out = self.layernorm(x_out)

        # post linear projection
        x_out = self.projection_out(x_out)
        x_out = self.projection_dropout(x_out)
        return x_out


class FocalNetMlp(nn.Module):
    def __init__(self, config, in_features, hidden_features=None, out_features=None, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.activation = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, hidden_state):
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.drop(hidden_state)
        hidden_state = self.fc2(hidden_state)
        hidden_state = self.drop(hidden_state)
        return hidden_state


class FocalNetLayer(GradientCheckpointingLayer):
    r"""Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`tuple[int]`):
            Input resolution.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    """

    def __init__(self, config, index, dim, input_resolution, drop_path=0.0):
        super().__init__()

        self.config = config

        # layer-specific attributes
        self.dim = dim
        self.input_resolution = input_resolution

        # general attributes
        self.drop = config.hidden_dropout_prob
        self.use_post_layernorm = config.use_post_layernorm

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.modulation = FocalNetModulation(
            config=config,
            index=index,
            dim=dim,
            projection_dropout=self.drop,
        )

        self.drop_path = FocalNetDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        mlp_hidden_dim = int(dim * config.mlp_ratio)
        self.mlp = FocalNetMlp(config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=self.drop)

        self.gamma_1 = 1.0
        self.gamma_2 = 1.0
        if config.use_layerscale:
            self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)
            self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)

    def forward(self, hidden_state, input_dimensions):
        height, width = input_dimensions
        batch_size, _, num_channels = hidden_state.shape
        shortcut = hidden_state

        # Focal Modulation
        hidden_state = hidden_state if self.use_post_layernorm else self.norm1(hidden_state)
        hidden_state = hidden_state.reshape(batch_size, height, width, num_channels)
        hidden_state = self.modulation(hidden_state).view(batch_size, height * width, num_channels)
        hidden_state = hidden_state if not self.use_post_layernorm else self.norm1(hidden_state)

        # FFN
        hidden_state = shortcut + self.drop_path(self.gamma_1 * hidden_state)
        hidden_state = hidden_state + self.drop_path(
            self.gamma_2
            * (self.norm2(self.mlp(hidden_state)) if self.use_post_layernorm else self.mlp(self.norm2(hidden_state)))
        )

        return hidden_state


class FocalNetStage(nn.Module):
    def __init__(self, config, index, input_resolution):
        super().__init__()

        self.config = config
        self.num_stages = len(config.depths)

        embed_dim = [config.embed_dim * (2**i) for i in range(self.num_stages)]
        dim = embed_dim[index]
        out_dim = embed_dim[index + 1] if (index < self.num_stages - 1) else None
        downsample = FocalNetPatchEmbeddings if (index < self.num_stages - 1) else None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
        drop_path = dpr[sum(config.depths[:index]) : sum(config.depths[: index + 1])]

        self.layers = nn.ModuleList(
            [
                FocalNetLayer(
                    config=config,
                    index=index,
                    dim=dim,
                    input_resolution=input_resolution,
                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                )
                for i in range(config.depths[index])
            ]
        )

        if downsample is not None:
            self.downsample = downsample(
                config=config,
                image_size=input_resolution,
                patch_size=2,
                num_channels=dim,
                embed_dim=out_dim,
                add_norm=True,
                use_conv_embed=config.use_conv_embed,
                is_stem=False,
            )
        else:
            self.downsample = None

        self.pointing = False

    def forward(self, hidden_states: torch.Tensor, input_dimensions: tuple[int, int]) -> tuple[torch.Tensor]:
        height, width = input_dimensions
        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states, input_dimensions)

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height, width = input_dimensions
            hidden_states = hidden_states_before_downsampling.transpose(1, 2).reshape(
                hidden_states_before_downsampling.shape[0], -1, height, width
            )
            hidden_states, output_dimensions = self.downsample(hidden_states)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        return stage_outputs


class FocalNetEncoder(nn.Module):
    def __init__(self, config, grid_size):
        super().__init__()
        self.num_stages = len(config.depths)
        self.config = config

        self.stages = nn.ModuleList(
            [
                FocalNetStage(
                    config=config,
                    index=i_layer,
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                )
                for i_layer in range(self.num_stages)
            ]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: tuple[int, int],
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple, FocalNetEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange b (h w) c -> b c h w
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, stage_module in enumerate(self.stages):
            stage_outputs = stage_module(hidden_states, input_dimensions)

            hidden_states = stage_outputs[0]
            hidden_states_before_downsampling = stage_outputs[1]
            output_dimensions = stage_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange b (h w) c -> b c h w
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange b (h w) c -> b c h w
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return FocalNetEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


@auto_docstring
class FocalNetPreTrainedModel(PreTrainedModel):
    config: FocalNetConfig
    base_model_prefix = "focalnet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["FocalNetStage"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, FocalNetEmbeddings):
            if module.mask_token is not None:
                module.mask_token.data.zero_()
        elif isinstance(module, FocalNetLayer):
            if self.config.use_layerscale:
                module.gamma_1.data.fill_(self.config.layerscale_value)
                module.gamma_2.data.fill_(self.config.layerscale_value)


@auto_docstring
class FocalNetModel(FocalNetPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config
        self.num_stages = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))

        self.embeddings = FocalNetEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = FocalNetEncoder(config, self.embeddings.patch_grid)

        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, FocalNetModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        return FocalNetModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.focalnet = FocalNetModel(config, add_pooling_layer=False, use_mask_token=True)

        self.num_stages = len(config.depths)
        num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, FocalNetMaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output.transpose(1, 2)
        batch_size, num_channels, sequence_length = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[2:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return FocalNetMaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    """
)
class FocalNetForImageClassification(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.focalnet = FocalNetModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(self.focalnet.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, FocalNetImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return FocalNetImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet backbone, to be used with frameworks like X-Decoder.
    """
)
class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
    def __init__(self, config: FocalNetConfig):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.embed_dim] + config.hidden_sizes
        self.focalnet = FocalNetModel(config)

        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True)

        hidden_states = outputs.reshaped_hidden_states

        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (hidden_states[idx],)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )


__all__ = [
    "FocalNetForImageClassification",
    "FocalNetForMaskedImageModeling",
    "FocalNetBackbone",
    "FocalNetModel",
    "FocalNetPreTrainedModel",
]
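
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the upstream module): a minimal
# smoke test of a randomly initialized model, assuming the `transformers`
# package is installed. The shape comments assume the default FocalNetConfig
# (image_size=224, patch_size=4, embed_dim=96, depths=[2, 2, 6, 2]), where the
# stem's 56x56 grid is halved at each of the three downsampling steps while
# the channel width doubles, ending at 7x7 = 49 tokens of 96 * 2**3 = 768 dims.
#
#     import torch
#     from transformers import FocalNetConfig, FocalNetModel
#
#     config = FocalNetConfig()
#     model = FocalNetModel(config).eval()
#     pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
#     with torch.no_grad():
#         outputs = model(pixel_values, output_hidden_states=True)
#     print(outputs.last_hidden_state.shape)      # torch.Size([1, 49, 768])
#     print(outputs.pooler_output.shape)          # torch.Size([1, 768])
#     print(len(outputs.reshaped_hidden_states))  # 5: embeddings + one per stage
# ---------------------------------------------------------------------------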