"""PyTorch SAM 2 model."""

from typing import Optional, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from transformers.models.sam2.configuration_sam2 import (
    Sam2Config,
    Sam2MaskDecoderConfig,
    Sam2PromptEncoderConfig,
)
from transformers.models.sam2.modeling_sam2 import (
    Sam2Attention,
    Sam2FeedForward,
    Sam2LayerNorm,
    Sam2Model,
    Sam2PreTrainedModel,
    Sam2TwoWayAttentionBlock,
    Sam2VisionEncoderOutput,
    Sam2VisionModel,
)
from transformers.utils.generic import TransformersKwargs, check_model_inputs

from ...configuration_utils import PretrainedConfig
from ...processing_utils import Unpack
from ...utils import auto_docstring
from ..auto import CONFIG_MAPPING, AutoConfig
from transformers.models.timm_wrapper.modeling_timm_wrapper import TimmWrapperModel


class EdgeTamVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`EdgeTamVisionModel`]. It is used to instantiate an
    EdgeTAM vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the EdgeTAM
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
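
    Example (a minimal usage sketch; the pattern below follows the standard `transformers`
    config/model API and is illustrative rather than checkpoint-exact):

    ```python
    >>> from transformers import EdgeTamVisionConfig, EdgeTamVisionModel

    >>> # Initializing a configuration with default values
    >>> configuration = EdgeTamVisionConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = EdgeTamVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```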

    """

    base_config_key = "vision_config"
    model_type = "edgetam_vision_model"
    sub_configs = {"backbone_config": AutoConfig}

    def __init__(
        self,
        backbone_config=None,
        backbone_channel_list=None,
        backbone_feature_sizes=None,
        fpn_hidden_size=256,
        fpn_kernel_size=1,
        fpn_stride=1,
        fpn_padding=0,
        fpn_top_down_levels=None,
        num_feature_levels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list
        backbone_feature_sizes = (
            [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes
        )
        fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels

        if isinstance(backbone_config, dict):
            backbone_config["model_type"] = backbone_config.get("model_type", "timm_wrapper")
            backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config)
        elif backbone_config is None:
            # Default backbone: a timm RepViT-M1 returning feature maps from all four stages
            backbone_config = AutoConfig.from_pretrained(
                "timm/repvit_m1.dist_in1k",
                model_args={"in_chans": 3, "features_only": True, "out_indices": (0, 1, 2, 3)},
            )

        self.backbone_config = backbone_config
        self.backbone_channel_list = backbone_channel_list
        self.backbone_feature_sizes = backbone_feature_sizes
        self.fpn_hidden_size = fpn_hidden_size
        self.fpn_kernel_size = fpn_kernel_size
        self.fpn_stride = fpn_stride
        self.fpn_padding = fpn_padding
        self.fpn_top_down_levels = fpn_top_down_levels
        self.num_feature_levels = num_feature_levels
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range


class EdgeTamPromptEncoderConfig(Sam2PromptEncoderConfig):
    pass


class EdgeTamMaskDecoderConfig(Sam2MaskDecoderConfig):
    pass


class EdgeTamConfig(Sam2Config):
    pass


class EdgeTamLayerNorm(Sam2LayerNorm):
    pass


class EdgeTamVisionEncoderOutput(Sam2VisionEncoderOutput):
    pass


class EdgeTamAttention(Sam2Attention):
    pass


class EdgeTamTwoWayAttentionBlock(Sam2TwoWayAttentionBlock):
    pass


class EdgeTamFeedForward(Sam2FeedForward):
    pass


@auto_docstring
class EdgeTamPreTrainedModel(Sam2PreTrainedModel):
    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (nn.LayerNorm, EdgeTamLayerNorm)):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        if isinstance(module, EdgeTamModel) and module.no_memory_embedding is not None:
            module.no_memory_embedding.data.zero_()


@auto_docstring(
    custom_intro="""
    The vision model from EdgeTAM without any head or projection on top.
    """
)
class EdgeTamVisionModel(Sam2VisionModel):
    config_class = EdgeTamVisionConfig
    main_input_name = "pixel_values"
    _can_record_outputs = {"hidden_states": TimmWrapperModel, "attentions": EdgeTamAttention}

    def get_input_embeddings(self):
        raise NotImplementedError("Can't get input embeddings from timm wrapper model")

    @check_model_inputs
    def forward(
        self, pixel_values: Optional[torch.FloatTensor] = None, **kwargs: Unpack[TransformersKwargs]
    ) -> Union[tuple, EdgeTamVisionEncoderOutput]:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        backbone_output = self.backbone(pixel_values)
        # The timm backbone returns channels-first feature maps; the FPN neck expects channels-last.
        intermediate_hidden_states = [
            hidden_state.permute(0, 2, 3, 1) for hidden_state in backbone_output.feature_maps
        ]

        fpn_hidden_states, fpn_position_encoding = self.neck(intermediate_hidden_states)
        # Keep the last `num_feature_levels` maps, reversed so the highest resolution comes first.
        fpn_hidden_states = fpn_hidden_states[-self.num_feature_levels :][::-1]
        fpn_position_encoding = fpn_position_encoding[-self.num_feature_levels :][::-1]

        return EdgeTamVisionEncoderOutput(
            last_hidden_state=intermediate_hidden_states[-1],
            fpn_hidden_states=fpn_hidden_states,
            fpn_position_encoding=fpn_position_encoding,
        )
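

# A minimal forward-pass sketch for `EdgeTamVisionModel` (illustrative only; it assumes the
# `facebook/EdgeTAM` checkpoint referenced in the config docstring and the 1024x1024 input
# resolution implied by the default `backbone_feature_sizes`):
#
#     import torch
#     from transformers import EdgeTamVisionModel
#
#     model = EdgeTamVisionModel.from_pretrained("facebook/EdgeTAM")
#     pixel_values = torch.randn(1, 3, 1024, 1024)
#     with torch.no_grad():
#         outputs = model(pixel_values=pixel_values)
#     # `outputs.fpn_hidden_states` holds `num_feature_levels` maps, highest resolution first.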


class EdgeTamModel(Sam2Model):
    _keys_to_ignore_on_load_unexpected = [
        r"^memory_.*",
        r"^mask_downsample.*",
        r"spatial_perceiver.*",
        r"^object_pointer_proj.*",
        r"^temporal_positional_encoding_projection_layer.*",
        r"no_memory_positional_encoding",
        r"no_object_pointer",
        r"occlusion_spatial_embedding_parameter",
    ]

    def get_input_embeddings(self):
        raise NotImplementedError("Can't get input embeddings from timm wrapper model")
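

# A minimal prompted-segmentation sketch for `EdgeTamModel` (illustrative only; it assumes
# the `facebook/EdgeTAM` checkpoint and that the SAM-2 processor is reused for EdgeTAM,
# since this module defines no processor; the image path and point prompt are placeholders):
#
#     from PIL import Image
#     from transformers import Sam2Processor, EdgeTamModel
#
#     processor = Sam2Processor.from_pretrained("facebook/EdgeTAM")
#     model = EdgeTamModel.from_pretrained("facebook/EdgeTAM")
#     image = Image.open("truck.jpg")
#     inputs = processor(images=image, input_points=[[[[500, 375]]]], return_tensors="pt")
#     outputs = model(**inputs)
#     masks = processor.post_process_masks(outputs.pred_masks, inputs["original_sizes"])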


__all__ = [
    "EdgeTamModel",
    "EdgeTamVisionModel",
    "EdgeTamPreTrainedModel",
    "EdgeTamConfig",
    "EdgeTamVisionConfig",
    "EdgeTamPromptEncoderConfig",
    "EdgeTamMaskDecoderConfig",
]