
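# EdgeTAM promptable image-segmentation model (SAM-2 style architecture).
#
# Orientation notes for the classes defined below (a summary of the code in this module,
# not additional behavior):
#   1. `EdgeTamVisionModel` runs a timm backbone (loaded through `AutoModel`) followed by the
#      `EdgeTamVisionNeck` FPN to produce multi-scale image features and sine positional encodings.
#   2. `EdgeTamPromptEncoder` turns point / box / mask prompts into sparse and dense prompt embeddings.
#   3. `EdgeTamMaskDecoder` fuses prompt and image embeddings with a two-way transformer and predicts
#      low-resolution masks, IoU scores and an object score, which `EdgeTamModel` returns as an
#      `EdgeTamImageSegmentationOutput`.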
import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

from transformers.models.timm_wrapper.modeling_timm_wrapper import TimmWrapperModel
from transformers.utils.generic import OutputRecorder, TransformersKwargs, check_model_inputs

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import compile_compatible_method_lru_cache
from ...utils import ModelOutput, auto_docstring
from ..auto import AutoModel
from .configuration_edgetam import (
    EdgeTamConfig,
    EdgeTamMaskDecoderConfig,
    EdgeTamPromptEncoderConfig,
    EdgeTamVisionConfig,
)


class EdgeTamLayerNorm(nn.LayerNorm):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last", **kwargs):
        super().__init__(normalized_shape, eps=eps, **kwargs)
        if data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError(f"Unsupported data format: {data_format}")
        self.data_format = data_format

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        """
        if self.data_format == "channels_first":
            features = features.permute(0, 2, 3, 1)
            features = super().forward(features)
            features = features.permute(0, 3, 1, 2)
        else:
            features = super().forward(features)
        return features


@dataclass
@auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
class EdgeTamVisionEncoderOutput(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    fpn_hidden_states (`tuple(torch.FloatTensor)`):
        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
    fpn_position_encoding (`tuple(torch.FloatTensor)`):
        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the
        model at the output of each stage.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    fpn_hidden_states: Optional[torch.FloatTensor] = None
    fpn_position_encoding: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class EdgeTamAttention(nn.Module):
    """
    EDGETAM's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
    values.
    """

    def __init__(self, config, downsample_rate=None):
        super().__init__()
        downsample_rate = config.attention_downsample_rate if downsample_rate is None else downsample_rate

        self.config = config
        self.hidden_size = config.hidden_size
        self.internal_dim = config.hidden_size // downsample_rate
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.internal_dim // config.num_attention_heads
        self.scaling = self.head_dim**-0.5
        self.is_causal = False

        self.q_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.k_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.v_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.o_proj = nn.Linear(self.internal_dim, self.hidden_size)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_similarity: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Project the inputs and separate them into attention heads
        batch_size, point_batch_size = query.shape[:2]
        new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim)

        query = self.q_proj(query).view(*new_shape).transpose(1, 2)
        key = self.k_proj(key).view(*new_shape).transpose(1, 2)
        value = self.v_proj(value).view(*new_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query,
            key,
            value,
            attention_mask=attention_similarity,
            dropout=0.0,
            scaling=self.scaling,
            is_causal=self.is_causal,
            **kwargs,
        )

        attn_output = attn_output.reshape(
            batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim
        ).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights


class EdgeTamTwoWayAttentionBlock(nn.Module):
    def __init__(self, config: EdgeTamMaskDecoderConfig, skip_first_layer_pe: bool = False):
        """
        A transformer block with four layers:
            (1) self-attention of sparse inputs (2) cross attention of sparse inputs -> dense inputs (3) mlp block on
            sparse inputs (4) cross attention of dense inputs -> sparse inputs

        Arguments:
            config (`EdgeTamMaskDecoderConfig`):
                The configuration file used to instantiate the block
            attention_downsample_rate (*optional*, int, defaults to 2):
                The downsample ratio of the block used to reduce the inner dim of the attention.
            skip_first_layer_pe (*optional*, bool, defaults to `False`):
                Whether or not to skip the addition of the query_point_embedding on the first layer.
        """
        super().__init__()
        self.self_attn = EdgeTamAttention(config, downsample_rate=1)
        self.layer_norm1 = nn.LayerNorm(config.hidden_size)

        self.cross_attn_token_to_image = EdgeTamAttention(config)
        self.layer_norm2 = nn.LayerNorm(config.hidden_size)

        self.mlp = EdgeTamFeedForward(config.hidden_size, config.mlp_dim, config.hidden_size, num_layers=2)
        self.layer_norm3 = nn.LayerNorm(config.hidden_size)

        self.layer_norm4 = nn.LayerNorm(config.hidden_size)
        self.cross_attn_image_to_token = EdgeTamAttention(config)

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self,
        queries: Tensor,
        keys: Tensor,
        query_point_embedding: Tensor,
        key_point_embedding: Tensor,
        attention_similarity: Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ):
        # Self attention block
        if self.skip_first_layer_pe:
            queries, _ = self.self_attn(query=queries, key=queries, value=queries)
        else:
            query = queries + query_point_embedding
            attn_out, _ = self.self_attn(query=query, key=query, value=queries)
            queries = queries + attn_out
        queries = self.layer_norm1(queries)

        # Cross attention block, tokens attending to image embedding
        query = queries + query_point_embedding
        key = keys + key_point_embedding
        attn_out, _ = self.cross_attn_token_to_image(
            query=query, key=key, value=keys, attention_similarity=attention_similarity
        )
        queries = queries + attn_out
        queries = self.layer_norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.layer_norm3(queries)

        # Cross attention block, image embedding attending to tokens
        query = queries + query_point_embedding
        key = keys + key_point_embedding
        attn_out, _ = self.cross_attn_image_to_token(query=key, key=query, value=queries)
        keys = keys + attn_out
        keys = self.layer_norm4(keys)

        return queries, keys, attn_out


class EdgeTamFeedForward(nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        activation: str = "relu",
        sigmoid_output: bool = False,
    ):
        super().__init__()
        self.num_layers = num_layers
        self.activation = ACT2FN[activation]
        self.proj_in = nn.Linear(input_dim, hidden_dim)
        self.proj_out = nn.Linear(hidden_dim, output_dim)
        self.layers = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers - 2)])
        self.sigmoid_output = sigmoid_output

    def forward(self, hidden_states):
        hidden_states = self.proj_in(hidden_states)
        hidden_states = self.activation(hidden_states)
        for layer in self.layers:
            hidden_states = self.activation(layer(hidden_states))

        hidden_states = self.proj_out(hidden_states)
        if self.sigmoid_output:
            hidden_states = F.sigmoid(hidden_states)
        return hidden_states


@auto_docstring
class EdgeTamPreTrainedModel(PreTrainedModel):
    config_class = EdgeTamConfig
    base_model_prefix = "edgetam"
    main_input_name = "pixel_values"
    _supports_sdpa = True
    _supports_flash_attn_2 = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (nn.LayerNorm, EdgeTamLayerNorm)):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        if isinstance(module, EdgeTamModel):
            if module.no_memory_embedding is not None:
                module.no_memory_embedding.data.zero_()


class EdgeTamSinePositionEmbedding(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
    """

    def __init__(
        self, num_pos_feats: int = 64, temperature: int = 10000, normalize: bool = False, scale: Optional[float] = None
    ):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    @compile_compatible_method_lru_cache(maxsize=2)
    def forward(
        self,
        shape: torch.Size,
        device: Union[torch.device, str],
        dtype: torch.dtype,
        mask: Optional[Tensor] = None,
    ) -> Tensor:
        if mask is None:
            mask = torch.zeros((shape[0], shape[2], shape[3]), device=device, dtype=torch.bool)
        not_mask = (~mask).to(dtype)
        y_embed = not_mask.cumsum(1)
        x_embed = not_mask.cumsum(2)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.int64, device=device).to(dtype)
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class EdgeTamVisionNeck(nn.Module):
    def __init__(self, config: EdgeTamVisionConfig):
        super().__init__()
        self.config = config

        self.position_encoding = EdgeTamSinePositionEmbedding(
            num_pos_feats=config.fpn_hidden_size // 2, normalize=True
        )
        self.convs = nn.ModuleList()
        for in_channels in config.backbone_channel_list:
            self.convs.append(
                nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=config.fpn_hidden_size,
                    kernel_size=config.fpn_kernel_size,
                    stride=config.fpn_stride,
                    padding=config.fpn_padding,
                )
            )
        self.fpn_top_down_levels = config.fpn_top_down_levels

    def forward(self, hidden_states: torch.Tensor) -> tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]]:
        fpn_hidden_states = ()
        fpn_position_encoding = ()

        prev_features = None
        # Build the FPN from the lowest-resolution (deepest) level up to the highest-resolution one
        n = len(self.convs) - 1
        for i in range(n, -1, -1):
            lateral_features = hidden_states[i].permute(0, 3, 1, 2)
            lateral_features = self.convs[n - i](lateral_features)
            if i not in self.fpn_top_down_levels or i == n:
                prev_features = lateral_features
            else:
                top_down_features = F.interpolate(
                    prev_features.to(torch.float32),
                    scale_factor=2.0,
                    mode="nearest",
                    align_corners=None,
                    antialias=False,
                ).to(lateral_features.dtype)
                prev_features = lateral_features + top_down_features
            prev_position_encoding = self.position_encoding(
                prev_features.shape, prev_features.device, prev_features.dtype
            ).to(prev_features.dtype)

            fpn_hidden_states += (prev_features,)
            fpn_position_encoding += (prev_position_encoding,)

        return fpn_hidden_states, fpn_position_encoding


@auto_docstring(
    custom_intro="""
    The vision model from EdgeTAM without any head or projection on top.
    """
)
class EdgeTamVisionModel(EdgeTamPreTrainedModel):
    config_class = EdgeTamVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: EdgeTamVisionConfig):
        super().__init__(config)
        self.config = config
        self.backbone = AutoModel.from_config(config.backbone_config)
        self.neck = EdgeTamVisionNeck(config)
        self.num_feature_levels = config.num_feature_levels
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, EdgeTamVisionEncoderOutput]:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        backbone_output = self.backbone(pixel_values)
        intermediate_hidden_states = backbone_output.last_hidden_state
        # Convert the backbone feature maps to channels-last before feeding them to the FPN neck
        intermediate_hidden_states = [
            hidden_state.permute(0, 2, 3, 1) for hidden_state in intermediate_hidden_states
        ]

        fpn_hidden_states, fpn_position_encoding = self.neck(intermediate_hidden_states)
        # Keep only the highest-resolution levels used by the mask decoder, ordered from high to low resolution
        fpn_hidden_states = fpn_hidden_states[-self.num_feature_levels :][::-1]
        fpn_position_encoding = fpn_position_encoding[-self.num_feature_levels :][::-1]

        return EdgeTamVisionEncoderOutput(
            last_hidden_state=intermediate_hidden_states[-1],
            fpn_hidden_states=fpn_hidden_states,
            fpn_position_encoding=fpn_position_encoding,
        )


@dataclass
@auto_docstring(custom_intro="Base class for the EdgeTam model's output.")
class EdgeTamImageSegmentationOutput(ModelOutput):
    r"""
    iou_scores (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks)`):
        The Intersection over Union (IoU) scores of the predicted masks.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks, height, width)`):
        The predicted low-resolution masks. This is an alias for `low_res_masks`. These masks need to be post-processed
        by the processor to be brought to the original image size.
    object_score_logits (`torch.FloatTensor` of shape `(batch_size, point_batch_size, 1)`):
        Logits for the object score, indicating if an object is present.
    image_embeddings (`tuple(torch.FloatTensor)`):
        The features from the FPN, which are used by the mask decoder. This is a tuple of `torch.FloatTensor` where each
        tensor has shape `(batch_size, channels, height, width)`.
    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`.
        Hidden-states of the vision model at the output of each stage.
    vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
        Attentions weights of the vision model.
    mask_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
        Attentions weights of the mask decoder.
    """

    iou_scores: Optional[torch.FloatTensor] = None
    pred_masks: Optional[torch.FloatTensor] = None
    object_score_logits: Optional[torch.FloatTensor] = None
    image_embeddings: Optional[tuple[torch.FloatTensor, ...]] = None
    vision_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    vision_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
    mask_decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None


class EdgeTamPositionalEmbedding(nn.Module):
    def __init__(self, config: EdgeTamPromptEncoderConfig):
        super().__init__()
        self.scale = config.scale
        positional_embedding = self.scale * torch.randn((2, config.hidden_size // 2))
        self.register_buffer("positional_embedding", positional_embedding)

    def forward(self, input_coords, input_shape=None):
        """Positionally encode points that are normalized to [0,1]."""
        coordinates = input_coords.clone()

        if input_shape is not None:
            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]
        coordinates.to(torch.float32)

        # Assuming coordinates are in [0, 1]^2 and have shape d_1 x ... x d_n x 2
        coordinates = 2 * coordinates - 1
        coordinates = coordinates.to(self.positional_embedding.dtype)
        coordinates = coordinates @ self.positional_embedding
        coordinates = 2 * np.pi * coordinates
        # Output has shape d_1 x ... x d_n x channels
        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)


class EdgeTamMaskEmbedding(nn.Module):
    def __init__(self, config: EdgeTamPromptEncoderConfig):
        super().__init__()
        self.mask_input_channels = config.mask_input_channels // 4
        self.activation = ACT2FN[config.hidden_act]
        self.conv1 = nn.Conv2d(1, self.mask_input_channels, kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(self.mask_input_channels, config.mask_input_channels, kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(config.mask_input_channels, config.hidden_size, kernel_size=1)
        self.layer_norm1 = EdgeTamLayerNorm(
            self.mask_input_channels, eps=config.layer_norm_eps, data_format="channels_first"
        )
        self.layer_norm2 = EdgeTamLayerNorm(
            self.mask_input_channels * 4, eps=config.layer_norm_eps, data_format="channels_first"
        )

    def forward(self, masks):
        hidden_states = self.conv1(masks)
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = self.conv2(hidden_states)
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.activation(hidden_states)
        dense_embeddings = self.conv3(hidden_states)
        return dense_embeddings


class EdgeTamPromptEncoder(nn.Module):
    def __init__(self, config: EdgeTamPromptEncoderConfig):
        super().__init__()
        self.shared_embedding = EdgeTamPositionalEmbedding(config)
        self.mask_embed = EdgeTamMaskEmbedding(config)
        self.no_mask_embed = nn.Embedding(1, config.hidden_size)

        self.image_embedding_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)
        self.mask_input_size = (4 * config.image_size // config.patch_size, 4 * config.image_size // config.patch_size)
        self.input_image_size = config.image_size

        self.point_embed = nn.Embedding(config.num_point_embeddings, config.hidden_size)
        self.hidden_size = config.hidden_size
        self.not_a_point_embed = nn.Embedding(1, config.hidden_size)

    def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor:
        """Embeds point prompts."""
        points = points + 0.5  # Shift to center of pixel
        if pad:
            points = torch.nn.functional.pad(points, (0, 0, 0, 1), mode="constant", value=0)
            labels = torch.nn.functional.pad(labels, (0, 1), mode="constant", value=-1)
        input_shape = (self.input_image_size, self.input_image_size)
        point_embedding = self.shared_embedding(points, input_shape)

        # Replace background points (-1) by the dedicated "not a point" embedding
        point_embedding = torch.where(labels[..., None] == -1, self.not_a_point_embed.weight, point_embedding)
        # Zero out padding points (-10) so they do not contribute to the prompt
        point_embedding = torch.where(
            labels[..., None] != -10,
            point_embedding,
            torch.zeros_like(point_embedding),
        )
        # Add the learned per-label embedding for valid (label >= 0) points
        point_embedding = point_embedding + self.point_embed(labels.clamp(min=0)) * (labels >= 0).unsqueeze(-1)
        return point_embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embeds box prompts."""
        boxes = boxes + 0.5  # Shift to center of pixel
        coords = boxes.reshape(*boxes.shape[:2], 2, 2)
        input_shape = (self.input_image_size, self.input_image_size)
        corner_embedding = self.shared_embedding(coords, input_shape)
        corner_embedding[:, :, 0, :] += self.point_embed.weight[2]
        corner_embedding[:, :, 1, :] += self.point_embed.weight[3]
        return corner_embedding

    def forward(
        self,
        input_points: Optional[tuple[torch.Tensor, torch.Tensor]],
        input_labels: Optional[torch.Tensor],
        input_boxes: Optional[torch.Tensor],
        input_masks: Optional[torch.Tensor],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Embeds different types of prompts, returning both sparse and dense embeddings.

        Args:
            points (`torch.Tensor`, *optional*):
                point coordinates and labels to embed.
            boxes (`torch.Tensor`, *optional*):
                boxes to embed
            masks (`torch.Tensor`, *optional*):
                masks to embed
        """
        sparse_embeddings = None
        batch_size = 1
        if input_points is not None:
            batch_size = input_points.shape[0]
            if input_labels is None:
                raise ValueError("If points are provided, labels must also be provided.")
            point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None))
            sparse_embeddings = point_embeddings
        if input_boxes is not None:
            batch_size = input_boxes.shape[0]
            box_embeddings = self._embed_boxes(input_boxes)
            if sparse_embeddings is None:
                sparse_embeddings = box_embeddings
            else:
                sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=2)
        if sparse_embeddings is None:
            # No point or box prompt: return an empty sparse prompt so the mask decoder only uses output tokens
            sparse_embeddings = torch.zeros(
                (batch_size, 1, 0, self.hidden_size),
                device=self.no_mask_embed.weight.device,
                dtype=self.no_mask_embed.weight.dtype,
            )

        if input_masks is not None:
            dense_embeddings = self.mask_embed(input_masks)
        else:
            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                batch_size, -1, self.image_embedding_size[0], self.image_embedding_size[1]
            )

        return sparse_embeddings, dense_embeddings


class EdgeTamTwoWayTransformer(nn.Module):
    def __init__(self, config: EdgeTamMaskDecoderConfig):
        super().__init__()
        self.config = config

        self.num_hidden_layers = config.num_hidden_layers
        self.layers = nn.ModuleList()

        for i in range(self.num_hidden_layers):
            self.layers.append(EdgeTamTwoWayAttentionBlock(config, skip_first_layer_pe=(i == 0)))

        self.final_attn_token_to_image = EdgeTamAttention(config)
        self.layer_norm_final_attn = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        point_embeddings: Tensor,
        image_embeddings: Tensor,
        image_positional_embeddings: Tensor,
        attention_similarity: Tensor,
        target_embedding=None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutput]:
        if image_embeddings is None:
            raise ValueError("You have to specify an image_embedding")

        image_embeddings = image_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1)
        image_positional_embeddings = image_positional_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1)

        # Prepare queries (prompt tokens) and keys (image embedding)
        queries = point_embeddings
        keys = image_embeddings

        # Apply transformer blocks
        for layer in self.layers:
            if target_embedding is not None:
                queries += target_embedding

            queries, keys, _ = layer(
                queries=queries,
                keys=keys,
                query_point_embedding=point_embeddings,
                key_point_embedding=image_positional_embeddings,
                attention_similarity=attention_similarity,
                **kwargs,
            )

        # Apply the final attention layer from the points to the image
        query = queries + point_embeddings
        key = keys + image_positional_embeddings

        attn_out, _ = self.final_attn_token_to_image(query=query, key=key, value=keys)

        queries = queries + attn_out
        queries = self.layer_norm_final_attn(queries)
        return queries, keys


class EdgeTamMaskDecoder(nn.Module):
    def __init__(self, config: EdgeTamMaskDecoderConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_multimask_outputs = config.num_multimask_outputs
        self.num_mask_tokens = config.num_multimask_outputs + 1

        self.iou_token = nn.Embedding(1, self.hidden_size)
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, self.hidden_size)

        self.transformer = EdgeTamTwoWayTransformer(config)

        self.upscale_conv1 = nn.ConvTranspose2d(self.hidden_size, self.hidden_size // 4, kernel_size=2, stride=2)
        self.upscale_conv2 = nn.ConvTranspose2d(self.hidden_size // 4, self.hidden_size // 8, kernel_size=2, stride=2)
        self.upscale_layer_norm = EdgeTamLayerNorm(self.hidden_size // 4, data_format="channels_first")
        self.activation = nn.GELU()

        mlps_list = []
        for _ in range(self.num_mask_tokens):
            mlps_list += [EdgeTamFeedForward(self.hidden_size, self.hidden_size, self.hidden_size // 8, 3)]
        self.output_hypernetworks_mlps = nn.ModuleList(mlps_list)

        self.iou_prediction_head = EdgeTamFeedForward(
            self.hidden_size,
            config.iou_head_hidden_dim,
            self.num_mask_tokens,
            config.iou_head_depth,
            sigmoid_output=True,
        )

        self.conv_s0 = nn.Conv2d(config.hidden_size, config.hidden_size // 8, kernel_size=1, stride=1)
        self.conv_s1 = nn.Conv2d(config.hidden_size, config.hidden_size // 4, kernel_size=1, stride=1)

        self.obj_score_token = nn.Embedding(1, self.hidden_size)
        self.pred_obj_score_head = EdgeTamFeedForward(self.hidden_size, self.hidden_size, 1, 3)

        self.dynamic_multimask_via_stability = config.dynamic_multimask_via_stability
        self.dynamic_multimask_stability_delta = config.dynamic_multimask_stability_delta
        self.dynamic_multimask_stability_thresh = config.dynamic_multimask_stability_thresh

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_positional_embeddings: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        high_resolution_features: Optional[list[torch.Tensor]] = None,
        attention_similarity: Optional[torch.Tensor] = None,
        target_embedding: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (`torch.Tensor`):
                The embeddings from the image encoder.
            image_positional_embeddings (`torch.Tensor`):
                Positional encoding with the shape of image_embeddings.
            sparse_prompt_embeddings (`torch.Tensor`):
                The embeddings of the points and boxes.
            dense_prompt_embeddings (`torch.Tensor`):
                The embeddings of the mask inputs.
            multimask_output (`bool`):
                Whether to return multiple masks or a single mask.
            high_resolution_features (`list[torch.Tensor]`, *optional*):
                The high-resolution features from the vision encoder.
            attention_similarity (`torch.Tensor`, *optional*):
                The attention similarity tensor.
            target_embedding (`torch.Tensor`, *optional*):
                The target embedding.
        """
        batch_size, num_channels, height, width = image_embeddings.shape
        point_batch_size = sparse_prompt_embeddings.shape[1]

        # Concatenate output tokens (object score, IoU and mask tokens)
        output_tokens = torch.cat(
            [self.obj_score_token.weight, self.iou_token.weight, self.mask_tokens.weight], dim=0
        )
        output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1)

        if sparse_prompt_embeddings.shape[2] != 0:
            tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=2)
        else:
            tokens = output_tokens
        point_embeddings = tokens.to(self.iou_token.weight.dtype)

        # Expand per-image data in the batch direction to be per-point
        image_embeddings = image_embeddings + dense_prompt_embeddings
        image_embeddings = image_embeddings.repeat_interleave(point_batch_size, dim=0)
        image_positional_embeddings = image_positional_embeddings.repeat_interleave(point_batch_size, dim=0)

        # Run the two-way transformer
        point_embeddings, image_embeddings = self.transformer(
            point_embeddings=point_embeddings,
            image_embeddings=image_embeddings,
            image_positional_embeddings=image_positional_embeddings,
            attention_similarity=attention_similarity,
            target_embedding=target_embedding,
            **kwargs,
        )
        iou_token_out = point_embeddings[:, :, 1, :]
        mask_tokens_out = point_embeddings[:, :, 2 : (2 + self.num_mask_tokens), :]

        # Upscale mask embeddings and predict masks using the mask tokens
        image_embeddings = image_embeddings.transpose(2, 3).reshape(
            batch_size * point_batch_size, num_channels, height, width
        )

        feat_s0, feat_s1 = high_resolution_features
        feat_s0 = feat_s0.repeat_interleave(point_batch_size, dim=0)
        feat_s1 = feat_s1.repeat_interleave(point_batch_size, dim=0)

        upscaled_embedding = self.upscale_conv1(image_embeddings) + feat_s1
        upscaled_embedding = self.activation(self.upscale_layer_norm(upscaled_embedding))
        upscaled_embedding = self.activation(self.upscale_conv2(upscaled_embedding) + feat_s0)

        hyper_in_list = []
        for i in range(self.num_mask_tokens):
            current_mlp = self.output_hypernetworks_mlps[i]
            hyper_in_list += [current_mlp(mask_tokens_out[:, :, i, :])]
        hyper_in = torch.stack(hyper_in_list, dim=2)

        _, num_channels, height, width = upscaled_embedding.shape
        upscaled_embedding = upscaled_embedding.reshape(batch_size, point_batch_size, num_channels, height * width)
        masks = (hyper_in @ upscaled_embedding).reshape(batch_size, point_batch_size, -1, height, width)

        # Generate mask quality and object score predictions
        iou_pred = self.iou_prediction_head(iou_token_out)
        object_score_logits = self.pred_obj_score_head(point_embeddings[:, :, 0, :])

        # Select the correct mask or masks for output
        if multimask_output:
            mask_slice = slice(1, None)
            masks = masks[:, :, mask_slice, :, :]
            iou_pred = iou_pred[:, :, mask_slice]
        elif self.dynamic_multimask_via_stability and not self.training:
            mask_slice = slice(0, 1)
            masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred)
        else:
            mask_slice = slice(0, 1)
            masks = masks[:, :, mask_slice, :, :]
            iou_pred = iou_pred[:, :, mask_slice]

        sam_tokens_out = mask_tokens_out[:, :, mask_slice]

        return masks, iou_pred, sam_tokens_out, object_score_logits

    def _get_stability_scores(self, mask_logits):
        """
        Compute stability scores of the mask logits based on the IoU between upper and
        lower thresholds.
        """
        mask_logits = mask_logits.flatten(-2)
        stability_delta = self.dynamic_multimask_stability_delta
        area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
        area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float()
        stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0)
        return stability_scores

    def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
        """
        When outputting a single mask, if the stability score from the current single-mask
        output (based on output token 0) falls below a threshold, we instead select from
        multi-mask outputs (based on output token 1~3) the mask with the highest predicted
        IoU score. This is intended to ensure a valid mask for both clicking and tracking.
        """
        # The best mask from the multimask output tokens (1~3)
        multimask_logits = all_mask_logits[:, :, 1:, :, :]
        multimask_iou_scores = all_iou_scores[:, :, 1:]
        best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
        best_scores_inds_expanded = best_scores_inds.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        best_scores_inds_expanded = best_scores_inds_expanded.expand(
            -1, -1, 1, multimask_logits.size(-2), multimask_logits.size(-1)
        )
        best_multimask_logits = torch.gather(multimask_logits, 2, best_scores_inds_expanded)
        best_multimask_iou_scores = torch.gather(multimask_iou_scores, 2, best_scores_inds.unsqueeze(-1))

        # The mask from the single-mask output token 0 and its stability score
        singlemask_logits = all_mask_logits[:, :, 0:1, :, :]
        singlemask_iou_scores = all_iou_scores[:, :, 0:1]
        stability_scores = self._get_stability_scores(singlemask_logits)
        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh

        # Dynamically fall back to the best multimask output upon low stability scores
        mask_logits_out = torch.where(
            is_stable[..., None, None].expand_as(singlemask_logits),
            singlemask_logits,
            best_multimask_logits,
        )
        iou_scores_out = torch.where(
            is_stable.expand_as(singlemask_iou_scores),
            singlemask_iou_scores,
            best_multimask_iou_scores,
        )
        return mask_logits_out, iou_scores_out


@auto_docstring(
    custom_intro="""
    Segment Anything Model 2 (SAM 2) for generating segmentation masks, given an input image and
    input points and labels, boxes, or masks.
    """
)
class EdgeTamModel(EdgeTamPreTrainedModel):
    _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"]
    _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamTwoWayAttentionBlock, index=2)}
    # Memory/video components of the EdgeTAM checkpoints are not part of this image-only model
    _keys_to_ignore_on_load_unexpected = [
        r"^memory_.*",
        r"^mask_downsample.*",
        r"spatial_perceiver.*",
        r"^object_pointer_proj.*",
        r"^temporal_positional_encoding_projection_layer.*",
        r"no_memory_positional_encoding",
        r"no_object_pointer",
        r"occlusion_spatial_embedding_parameter",
    ]

    def __init__(self, config: EdgeTamConfig):
        super().__init__(config)
        self.shared_image_embedding = EdgeTamPositionalEmbedding(config.prompt_encoder_config)
        self.vision_encoder = EdgeTamVisionModel(config.vision_config)
        self.prompt_encoder = EdgeTamPromptEncoder(config.prompt_encoder_config)
        config.mask_decoder_config._attn_implementation = config._attn_implementation
        self.mask_decoder = EdgeTamMaskDecoder(config.mask_decoder_config)

        self.num_feature_levels = config.vision_config.num_feature_levels
        self.backbone_feature_sizes = config.vision_config.backbone_feature_sizes
        self.hidden_dim = config.vision_config.fpn_hidden_size

        # Learned embedding used in place of memory features when no previous frames are available
        self.no_memory_embedding = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))

        self.post_init()

    def _tie_weights(self):
        self.prompt_encoder.shared_embedding.positional_embedding.data = (
            self.shared_image_embedding.positional_embedding.data
        )

    def get_image_wide_positional_embeddings(self) -> torch.Tensor:
        size = self.prompt_encoder.image_embedding_size
        target_device = self.shared_image_embedding.positional_embedding.device
        target_dtype = self.shared_image_embedding.positional_embedding.dtype
        grid = torch.ones(size, device=target_device, dtype=target_dtype)
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / size[0]
        x_embed = x_embed / size[1]

        positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1))
        return positional_embedding.permute(2, 0, 1).unsqueeze(0)  # channel x height x width

    @torch.no_grad()
    def get_image_embeddings(
        self,
        pixel_values: torch.FloatTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> list[torch.FloatTensor]:
        """
        Returns the image embeddings by passing the pixel values through the vision encoder.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Input pixel values
        """
        batch_size = pixel_values.shape[0]
        feature_maps, _, _, _ = self.get_image_features(pixel_values, **kwargs)

        # Add the no-memory embedding to the lowest-resolution features, as no previous frames are available
        feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding

        image_embeddings = [
            feat.permute(1, 2, 0).view(batch_size, -1, *feat_size)
            for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes)
        ]
        return image_embeddings

    @torch.no_grad()
    def get_prompt_embeddings(
        self,
        input_points: Optional[torch.FloatTensor] = None,
        input_labels: Optional[torch.LongTensor] = None,
        input_boxes: Optional[torch.FloatTensor] = None,
        input_masks: Optional[torch.LongTensor] = None,
    ):
        r"""
        Returns the prompt embeddings by passing the input points, labels, boxes and masks through the prompt encoder.

        Args:
            input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points_per_image, 2)`):
                Optional input points for the prompt encoder. The padding of the point is automatically done by the
                processor. `point_batch_size` refers to the number of masks that we want the model to predict per
                point. The model will output `point_batch_size` times 3 masks in total.
            input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points_per_image)`):
                Optional input labels for the prompt encoder. The padding of the labels is automatically done by the
                processor, or can be fed by the user.
            input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes_per_image, 4)`):
                Optional input boxes for the prompt encoder. The padding of the boxes is automatically done by the
                processor. users can also pass manually the input boxes.
            input_masks (`torch.LongTensor` of shape `(batch_size, image_size, image_size)`):
                Optional input masks for the prompt encoder.
        """
        prompt_output = self.prompt_encoder(
            input_points=input_points,
            input_labels=input_labels,
            input_boxes=input_boxes,
            input_masks=input_masks,
        )
        return prompt_output

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_points: Optional[torch.FloatTensor] = None,
        input_labels: Optional[torch.LongTensor] = None,
        input_boxes: Optional[torch.FloatTensor] = None,
        input_masks: Optional[torch.LongTensor] = None,
        image_embeddings: Optional[torch.FloatTensor] = None,
        multimask_output: bool = True,
        attention_similarity: Optional[torch.FloatTensor] = None,
        target_embedding: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> EdgeTamImageSegmentationOutput:
        r"""
        input_points (`torch.FloatTensor` of shape `(batch_size, num_points, 2)`):
            Input 2D spatial points, this is used by the prompt encoder to encode the prompt. Generally yields to much
            better results. The points can be obtained by passing a list of list of list to the processor that will
            create corresponding `torch` tensors of dimension 4. The first dimension is the image batch size, the
            second dimension is the point batch size (i.e. how many segmentation masks do we want the model to predict
            per input point), the third dimension is the number of points per segmentation mask (it is possible to pass
            multiple points for a single mask), and the last dimension is the x (vertical) and y (horizontal)
            coordinates of the point. If a different number of points is passed either for each image, or for each
            mask, the processor will create "PAD" points that will correspond to the (0, 0) coordinate, and the
            computation of the embedding will be skipped for these points using the labels.
        input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points)`):
            Input labels for the points, this is used by the prompt encoder to encode the prompt. According to the
            official implementation, there are 3 types of labels

            - `1`: the point is a point that contains the object of interest
            - `0`: the point is a point that does not contain the object of interest
            - `-1`: the point corresponds to the background

            We added the label:

            - `-10`: the point is a padding point, thus should be ignored by the prompt encoder

            The padding labels should be automatically done by the processor.
        input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
            Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to
            much better generated masks. The boxes can be obtained by passing a list of list of list to the processor,
            that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch
            size, the number of boxes per image and the coordinates of the top left and bottom right point of the box.
            In the order (`x1`, `y1`, `x2`, `y2`):

            - `x1`: the x coordinate of the top left point of the input box
            - `y1`: the y coordinate of the top left point of the input box
            - `x2`: the x coordinate of the bottom right point of the input box
            - `y2`: the y coordinate of the bottom right point of the input box
        input_masks (`torch.FloatTensor` of shape `(batch_size, image_size, image_size)`):
            SAM model also accepts segmentation masks as input. The mask will be embedded by the prompt encoder to
            generate a corresponding embedding, that will be fed later on to the mask decoder. These masks needs to be
            manually fed by the user, and they need to be of shape (`batch_size`, `image_size`, `image_size`).
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_channels, window_size, window_size)`):
            Image embeddings, this is used by the mask decoder to generate masks and iou scores. For more memory
            efficient computation, users can first retrieve the image embeddings using the `get_image_embeddings`
            method, and then feed them to the `forward` method instead of feeding the `pixel_values`.
        multimask_output (`bool`, *optional*):
            In the original implementation and paper, the model always outputs 3 masks per image (or per point / per
            bounding box if relevant). However, it is possible to just output a single mask, that corresponds to the
            "best" mask, by specifying `multimask_output=False`.
        attention_similarity (`torch.FloatTensor`, *optional*):
            Attention similarity tensor, to be provided to the mask decoder for target-guided attention in case the
            model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048).
        target_embedding (`torch.FloatTensor`, *optional*):
            Embedding of the target concept, to be provided to the mask decoder for target-semantic prompting in case
            the model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048).

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoModel, AutoProcessor

        >>> model = AutoModel.from_pretrained("danelcsb/edgetam.1_hiera_tiny")
        >>> processor = AutoProcessor.from_pretrained("danelcsb/edgetam.1_hiera_tiny")

        >>> img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
        >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
        >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
        >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")

        >>> # Get segmentation mask
        >>> outputs = model(**inputs)

        >>> # Postprocess masks
        >>> masks = processor.post_process_masks(
        ...     outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
        ... )
        ```
        """
        if not ((pixel_values is None) ^ (image_embeddings is None)):
            raise ValueError("Exactly one of pixel_values or image_embeddings must be provided.")

        if input_points is not None and input_boxes is not None:
            if input_points.shape[1] != input_boxes.shape[1]:
                raise ValueError(
                    "You should provide as many bounding boxes as input points per box. "
                    f"Got {input_points.shape[1]} and {input_boxes.shape[1]}."
                )

        image_positional_embeddings = self.get_image_wide_positional_embeddings()
        # repeat with batch size
        batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeddings[-1].shape[0]
        image_positional_embeddings = image_positional_embeddings.repeat(batch_size, 1, 1, 1)

        vision_attentions = None
        vision_hidden_states = None

        if pixel_values is not None:
            feature_maps, _, vision_hidden_states, vision_attentions = self.get_image_features(pixel_values, **kwargs)
            # Add the no-memory embedding to the lowest-resolution features (no previous frames in the image setting)
            feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding

            image_embeddings = [
                feat.permute(1, 2, 0).view(batch_size, -1, *feat_size)
                for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes)
            ]

        if input_points is not None and input_labels is None:
            input_labels = torch.ones_like(input_points[:, :, :, 0], dtype=torch.int, device=input_points.device)

        if input_points is None and input_boxes is None and input_masks is None:
            # If no prompt is provided, use a padding point so the decoder still produces a mask
            input_points = torch.zeros(
                batch_size, 1, 1, 2, dtype=image_embeddings[-1].dtype, device=image_embeddings[-1].device
            )
            input_labels = -torch.ones(batch_size, 1, 1, dtype=torch.int32, device=image_embeddings[-1].device)

        if input_masks is not None and input_masks.shape[-2:] != self.prompt_encoder.mask_input_size:
            input_masks = F.interpolate(
                input_masks.float(),
                size=self.prompt_encoder.mask_input_size,
                align_corners=False,
                mode="bilinear",
                antialias=True,
            ).to(input_masks.dtype)

        sparse_embeddings, dense_embeddings = self.prompt_encoder(
            input_points=input_points,
            input_labels=input_labels,
            input_boxes=input_boxes,
            input_masks=input_masks,
        )

        low_res_multimasks, iou_scores, _, object_score_logits = self.mask_decoder(
            image_embeddings=image_embeddings[-1],
            image_positional_embeddings=image_positional_embeddings,
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
            high_resolution_features=image_embeddings[:-1],
            attention_similarity=attention_similarity,
            target_embedding=target_embedding,
            **kwargs,
        )

        return EdgeTamImageSegmentationOutput(
            iou_scores=iou_scores,
            pred_masks=low_res_multimasks,
            object_score_logits=object_score_logits,
            image_embeddings=image_embeddings,
            vision_hidden_states=vision_hidden_states,
            vision_attentions=vision_attentions,
        )

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[list[torch.Tensor], list[torch.Tensor], Optional[tuple[torch.FloatTensor, ...]], Optional[tuple[torch.FloatTensor, ...]]]:
        r"""
        Extract and preprocess image features using the vision encoder.

        Args:
            pixel_values (`torch.FloatTensor`):
                Input pixel values of shape `(batch_size, num_channels, height, width)`.

        Returns:
            `tuple`: A tuple containing:
                - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
                - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
                - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
                - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
        """
        vision_outputs = self.vision_encoder(pixel_values, **kwargs)

        feature_maps = vision_outputs.fpn_hidden_states
        feature_maps_position_embeddings = vision_outputs.fpn_position_encoding

        # Precompute the projected level 0 and level 1 features used by the SAM mask decoder,
        # so they are not recomputed on every prompt (e.g. on every click)
        feature_maps = list(feature_maps)
        feature_maps[0] = self.mask_decoder.conv_s0(feature_maps[0])
        feature_maps[1] = self.mask_decoder.conv_s1(feature_maps[1])

        # Flatten N x C x H x W to (H * W) x N x C
        feature_maps = [feature_map.flatten(2).permute(2, 0, 1) for feature_map in feature_maps]
        feature_maps_position_embeddings = [
            feature_map_position_embedding.flatten(2).permute(2, 0, 1)
            for feature_map_position_embedding in feature_maps_position_embeddings
        ]

        return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions


__all__ = ["EdgeTamVisionModel", "EdgeTamModel", "EdgeTamPreTrainedModel"]
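# Minimal usage sketch, mirroring the example in the `EdgeTamModel.forward` docstring above.
# The checkpoint name is the one referenced in that docstring and may differ for other releases:
#
#     from PIL import Image
#     import requests
#     from transformers import AutoModel, AutoProcessor
#
#     model = AutoModel.from_pretrained("danelcsb/edgetam.1_hiera_tiny")
#     processor = AutoProcessor.from_pretrained("danelcsb/edgetam.1_hiera_tiny")
#
#     url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
#     image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
#     inputs = processor(images=image, input_points=[[[400, 650]]], return_tensors="pt")
#
#     outputs = model(**inputs)
#     masks = processor.post_process_masks(
#         outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
#     )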