
    <h                        S SK rS SKrS SKJr  S SKJrJrJr  S SK	r
S SKrS SKJs  Jr  S SKJrJr  SSKJr  SSKJrJrJr  SSKJr  SS	KJrJr  SS
KJrJrJr  SSK J!r!  \" 5       (       a  S SK"J#r#  \" 5       (       a  S SK$J%r%  S SK&J'r'  \\" SS9 " S S\5      5       5       r( SLS\R                   S\R                   S\R                   4S jjr)S\S\S\4S jr*S\R                   S\R                   S\R                   4S jr+ " S S\RX                  5      r-S\S\S\.S\4S jr/S\R                   S\R                   S\.S\R                   4S  jr0 " S! S"\RX                  5      r1 " S# S$\RX                  5      r2 " S% S&\RX                  5      r3 SMS'\RX                  S(\R                   S)\R                   S*\R                   S+\\R                      S,\4S-\44S. jjr5 " S/ S0\RX                  5      r6 " S1 S2\RX                  5      r7SNS3\R                   S4\4S5\8S\R                   4S6 jjr9 " S7 S8\RX                  5      r: " S9 S:\RX                  5      r; " S; S<\RX                  5      r< " S= S>\5      r= " S? S@\R|                  5      r? " SA SB\RX                  5      r@ " SC SD\RX                  5      rA " SE SF\RX                  5      rB\ " SG SH\5      5       rC\" SIS9 " SJ SK\C5      5       rDSHSK/rEg)O    N)	dataclass)CallableOptionalUnion)Tensornn   )ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)auto_docstringcan_return_tupleis_accelerate_available   )
EomtConfig)linear_sum_assignment)PartialState)reducea  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   D   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R$                        \	S
'   Srg)"EomtForUniversalSegmentationOutput0   a  
loss (`torch.Tensor`, *optional*):
    The computed loss, returned when labels are present.
class_queries_logits (`torch.FloatTensor`):
    A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
    query. Note the `+ 1` is needed because we incorporate the null class.
masks_queries_logits (`torch.FloatTensor`):
    A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
    query.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Last hidden states (final feature map) of the last layer.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Self and Cross Attentions weights from transformer decoder.
patch_offsets (`list[torch.Tensor]`, *optional*):
    list of tuples indicating the image index and start and end positions of patches for semantic segementation.
Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r    r!   tupler"   r#   listr   __static_attributes__r$       ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/eomt/modeling_eomt.pyr   r   0   s    * )-D(5$$
%,8<(5#4#45<8<(5#4#45<59x 1 1298<M8E%"3"345<59Ju0012926M8D./6r0   r   input_featurespoint_coordinatesreturnc                     UR                  5       S:X  a  SnUR                  S5      n[        R                  R                  R
                  " U SU-  S-
  40 UD6nU(       a  UR                  S5      nU$ )a  
A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

Args:
    input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
        A tensor that contains features map on a height * width grid
    point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
    2)):
        A tensor that contains [0, 1] * [0, 1] normalized point coordinates
    add_dim (`bool`):
        boolean value to keep track of added dimension

Returns:
    point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
    height_grid, width_grid):
        A tensor that contains features for points in `point_coordinates`.
r	   T   g       @      ?)dim	unsqueezer*   r   
functionalgrid_samplesqueeze)r2   r3   add_dimkwargspoint_featuress        r1   sample_pointr@   [   st    ( !#-77: XX((44^SK\E\_bEbmflmN'//2r0   inputslabelsc                    U R                  5       R                  S5      n S[        R                  " XR                  5      -  nU R                  S5      SS2S4   UR                  S5      SSS24   -   nSUS-   US-   -  -
  nU$ )a  
A pair wise version of the dice loss, see `dice_loss` for usage.

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    `torch.Tensor`: The computed loss between each pairs.
r   r6   N)sigmoidflattenr*   matmulTsum)rA   rB   	numeratordenominatorr   s        r1   pair_wise_dice_lossrL   {   sz     ^^%%a(FELL22I**R.D)FJJrN47,CCK	A+/22DKr0   c                 Z   U R                   S   n[        R                  " SS9nU" U [        R                  " U 5      5      nU" U [        R
                  " U 5      5      n[        R                  " XB-  UR                  5      n[        R                  " XR-  SU-
  R                  5      nXg-   nU$ )a  
A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    loss (`torch.Tensor`): The computed loss between each pairs.
r   none	reduction)shaper   BCEWithLogitsLossr*   	ones_like
zeros_likerG   rH   )	rA   rB   height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr   s	            r1   $pair_wise_sigmoid_cross_entropy_lossr[      s     ||A$$v6I&vuv/FG&vu/?/?/GH||2EvxxPH||2EF
~~VHDKr0   c                      ^  \ rS rSrSr SS\S\S\S\4U 4S jjjr\R                  " 5       S\R                  S	\R                  S
\R                  S\R                  S\\\
      4
S j5       rSrU =r$ )EomtHungarianMatcher   aa  This class computes an assignment between the labels and the predictions of the network.

For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).

cost_class	cost_mask	cost_dice
num_pointsc                    > [         TU ]  5         US:X  a  US:X  a  US:X  a  [        S5      eX@l        Xl        X l        X0l        g)a  Creates the matcher

Params:
    cost_class (`float`, *optional*, defaults to 1.0):
        Relative weight of the classification error in the matching cost.
    cost_mask (`float`, *optional*,  defaults to 1.0):
        This is the relative weight of the focal loss of the binary mask in the matching cost.
    cost_dice (`float`, *optional*, defaults to 1.0):
        This is the relative weight of the dice loss of the binary mask in the matching cost.
    num_points (`int`, *optional*, defaults to 12544):
        No. of points to sample on which the mask loss will be calculated. The same set of K points are
        uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
        matching.
r   zAll costs can't be 0N)super__init__
ValueErrorrb   r_   r`   ra   )selfr_   r`   ra   rb   	__class__s        r1   re   EomtHungarianMatcher.__init__   sC    " 	?yA~)q.344$$""r0   r   r   mask_labelsclass_labelsr4   c           
         / nUR                   S   n[        U5       GH  nX'   R                  S5      nX   n	USS2XG   4   * n
X7   R                  U	5      nUSS2S4   nU	SS2S4   n	[        R
                  " SU R                  SU	R                  S9nUR                  UR                   S   SS5      n[        XSS9R                  S5      nUR                  U	R                   S   SS5      n[        XSS9R                  S5      n	[        X5      n[        X5      nU R                  U-  U R                  U
-  -   U R                  U-  -   n[        R                   " U[        R"                  " S	5      5      n[        R$                  " U[        R"                  " S
5      5      n[        R&                  " US5      n[)        UR+                  5       5      nUR-                  U5        GM     U VVs/ sHL  u  nn[        R.                  " U[        R0                  S9[        R.                  " U[        R0                  S94PMN     nnnU$ s  snnf )a  
Params:
    masks_queries_logits (`torch.Tensor`):
        A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
    class_queries_logits (`torch.Tensor`):
        A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
    class_labels (`torch.Tensor`):
        A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
        target) containing the class labels.
    mask_labels (`torch.Tensor`):
        A tensor of dim `num_target_boxes, height, width` containing the target masks.

Returns:
    matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
    where:
        - index_i is the indices of the selected predictions (in order)
        - index_j is the indices of the corresponding selected labels (in order)
    For each batch element, it holds:
        len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
r   rD   Nr   r6   deviceFalign_cornersg    _Bg    _©dtype)rQ   rangesoftmaxtor*   randrb   rn   repeatr@   r<   r[   rL   r`   r_   ra   minimumtensormaximum
nan_to_numr   cpuappend	as_tensorint64)rg   r   r   rj   rk   indices
batch_sizei
pred_probs	pred_maskr_   target_maskr3   target_coordinatespred_coordinatesr`   ra   cost_matrixassigned_indicesjmatched_indicess                        r1   forwardEomtHungarianMatcher.forward   s   8 *, *//2
z"A-088<J,/I %Q%788J%.++I6K%ag.K!!T'*I !&

1dooqIYIY Z!2!9!9+:K:KA:NPQST!U&{V[\ddefgK077	8JAqQ$YPUV^^_`aI =YTI+ICI..94t7SSVZVdVdgpVppK--U\\$5GHK--U\\%5HIK**;:K0EkooFW0XNN+,? #F ho
gn_c_`bcU__Qekk2EOOAU[[4YZgn 	 
 
s   ,AIr_   ra   r`   rb   )r7   r7   r7   i 1  )r%   r&   r'   r(   r)   floatintre   r*   no_gradr   r.   r-   r   r/   __classcell__rh   s   @r1   r]   r]      s     jo##27#JO#cf# #4 ]]_D#llD $llD \\	D
 llD 
eFm	D Dr0   r]   	num_masksc                     U R                  5       R                  S5      nSX1-  R                  S5      -  nUR                  S5      UR                  S5      -   nSUS-   US-   -  -
  nUR                  5       U-  nU$ )a  
Compute the DICE loss, similar to generalized IOU for masks as follows:

$$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

$$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

Args:
    inputs (`torch.Tensor`):
        A tensor representing a mask.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).
    num_masks (`int`):
        The number of masks present in the current batch, used for normalization.

Returns:
    `torch.Tensor`: The computed loss.
r   r6   rD   )rE   rF   rI   )rA   rB   r   probsrJ   rK   r   s          r1   	dice_lossr     sx    , NN$$Q'EU^((,,I))B-&**R.0K	A+/22D88:	!DKr0   c                     [         R                  " SS9nU" X5      nUR                  S5      R                  5       U-  nU$ )aX  
Args:
    inputs (`torch.Tensor`):
        A float tensor of arbitrary shape.
    labels (`torch.Tensor`):
        A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
        (0 for the negative class and 1 for the positive class).

Returns:
    loss (`torch.Tensor`): The computed loss.
rN   rO   r   )r   rR   meanrI   )rA   rB   r   rV   cross_entropy_lossr   s         r1   sigmoid_cross_entropy_lossr   5  sB     $$v6I"62""1%))+i7DKr0   c                     ^  \ rS rSrS\S\\\4   4U 4S jjrS\	\	\
      S\	\
   4S jrS\	\   S\\\4   4S	 jrS
\S\	\   S\\R                      S\\\4   4S jrS\R                  S\	\R                     S\\R                      S\
S\\\R                  4   4
S jrS rS rS\R                  S\R                  4S jrS\R                  S\
S\
S\S\R                  4
S jr SS\R                  S
\R                  S\	\R                     S\	\R                     S\\\\R                  4      S\\\R                  4   4S jjrS\R                  S\R4                  S\R                  4S jrSrU =r$ ) EomtLossiI  configweight_dictc                   > [         TU ]  5         [        U S/5        UR                  U l        X l        UR
                  U l        [        R                  " U R                  S-   5      nU R                  US'   U R                  SU5        UR                  U l        UR                  U l        UR                  U l        [        UR                  UR                   UR"                  U R                  S9U l        g)a   
The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and mask)

Args:
    config (`EomtConfig`):
        The configuration for Eomt model also containing loss calculation specific parameters.
    weight_dict (`dict[str, float]`):
        A dictionary of weights to be applied to the different losses.
scipyr   rD   empty_weightr   N)rd   re   r   
num_labelsr   no_object_weighteos_coefr*   onesregister_buffertrain_num_pointsrb   oversample_ratioimportance_sample_ratior]   class_weightdice_weightmask_weightmatcher)rg   r   r   r   rh   s       r1   re   EomtLoss.__init__J  s     	$	* ++& //zz$//A"56==R^\: !11 & 7 7'-'E'E$+**((((	
r0   sizesr4   c                 p    US   nUSS   H'  n[        U5       H  u  pE[        X$   U5      X$'   M     M)     U$ )Nr   r   )	enumeratemax)rg   r   maxessublistindexitems         r1   _max_by_axisEomtLoss._max_by_axism  sC    aQRyG(1"5<6  2 ! r0   tensorsc                 R   U R                  U Vs/ sH  n[        UR                  5      PM     sn5      n[        U5      /U-   nUu  pVpxUS   R                  n	US   R
                  n
[        R                  " XIU
S9n[        R                  " XWU4[        R                  U
S9n[        XU5       Ho  u  p-nUS UR                  S   2S UR                  S   2S UR                  S   24   R                  U5        SUS UR                  S   2S UR                  S   24'   Mq     X4$ s  snf )Nr   rr   rn   r   r6   F)r   r.   rQ   lenrr   rn   r*   zerosr   boolzipcopy_)rg   r   ry   max_sizebatch_shaper   _heightwidthrr   rn   padded_tensorspadding_maskspadded_tensorpadding_masks                  r1   _pad_images_to_max_in_batch$EomtLoss._pad_images_to_max_in_batchu  s'   $$w%OwVd6<<&8w%OP7|nx/'2$
v
  ""[fM

J#>ejjY_`36wP]3^/F<+FLLO+->v||A->@Q&,,q/@QQRXXY_`AFL*6<<?*,=fll1o,==> 4_ ,, &Ps   D$r   rk   r   c           	         UnUR                   u  pVn[        R                  " U R                  S9nU R	                  U5      n	[
        R                  " [        X#5       V
VVs/ sH  u  n
u  p{X   PM     snnn
5      n[
        R                  " XV4U R                  [
        R                  UR                  S9nXU	'   UR                  SS5      nU" X5      nSU0nU$ s  snnn
f )a-  Compute the losses related to the labels using cross entropy.

Args:
    class_queries_logits (`torch.Tensor`):
        A tensor of shape `batch_size, num_queries, num_labels`
    class_labels (`list[torch.Tensor]`):
        List of class labels of shape `(labels)`.
    indices (`tuple[np.array])`:
        The indices computed by the Hungarian matcher.

Returns:
    `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
    - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
)weight)
fill_valuerr   rn   r   r6   loss_cross_entropy)rQ   r   CrossEntropyLossr   $_get_predictions_permutation_indicesr*   catr   fullr   r   rn   	transpose)rg   r   rk   r   pred_logitsr   num_queriesr   rV   idxtargetr   target_classes_otarget_classespred_logits_transposedloss_celossess                    r1   loss_labelsEomtLoss.loss_labels  s    " +%0%6%6"
''t/@/@A	77@ 99-0-GH-G>66AVY-GH
 %$//]h]o]o
 /s!,!6!6q!!<2C&0 Is    Cr   rj   r   c                   ^  T R                  U5      nT R                  U5      nX   nT R                  U5      u  pX   nUSS2S4   nUSS2S4   n[        R                  " 5          T R                  UU 4S jT R                  T R                  T R                  5      n
[        XSS9R                  S5      nSSS5        [        UW
SS9R                  S5      n[        UWU5      [        XU5      S.nAAU$ ! , (       d  f       NF= f)a$  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

Args:
    masks_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, height, width)`.
    mask_labels (`torch.Tensor`):
        List of mask labels of shape `(labels, height, width)`.
    indices (`tuple[np.array])`:
        The indices computed by the Hungarian matcher.
    num_masks (`int)`:
        The number of masks, used for normalization.

Returns:
    losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
    - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
      masks.
    - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
      masks.
Nc                 &   > TR                  U 5      $ N)calculate_uncertainty)logitsrg   s    r1   <lambda>%EomtLoss.loss_masks.<locals>.<lambda>  s    t99&Ar0   Fro   r   )	loss_mask	loss_dice)r    _get_targets_permutation_indicesr   r*   r   sample_points_using_uncertaintyrb   r   r   r@   r<   r   r   )rg   r   rj   r   r   src_idxtgt_idx
pred_maskstarget_masksr   r3   point_labelspoint_logitsr   s   `             r1   
loss_masksEomtLoss.loss_masks  s   4 ;;GD77@)2
 ::;G#,  4(
#AtG, ]]_ $ D DA%%,,! (W\]eefghL  $J0AQVW__`ab 4L,PYZ"<yI

 ) _s   &AC77
Dc                    [         R                  " [        U5       VVVs/ sH  u  nu  p4[         R                  " X25      PM      snnn5      n[         R                  " U VVs/ sH  u  p4UPM	     snn5      nXV4$ s  snnnf s  snnf r   r*   r   r   	full_like)rg   r   r   srcr   batch_indicespredictions_indicess          r1   r   -EomtLoss._get_predictions_permutation_indices  sh    		iX_N`"aN`{q(35??3#:N`"ab#iiW(EW#W(EF11 #b(E   $A:"B
c                    [         R                  " [        U5       VVVs/ sH  u  nu  p4[         R                  " XB5      PM      snnn5      n[         R                  " U VVs/ sH  u  p4UPM	     snn5      nXV4$ s  snnnf s  snnf r   r   )rg   r   r   r   tgtr   target_indicess          r1   r   )EomtLoss._get_targets_permutation_indices  sg    		iX_N`"aN`{q(15??3#:N`"ab#@HQC#@A,, #b#@r   r   c                 4    [         R                  " U5      * nU$ )a2  
In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
for the foreground class in `classes`.

Args:
    logits (`torch.Tensor`):
    A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
    the number of foreground classes. The values are logits.

Returns:
    scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
    uncertain locations having the highest uncertainty score.
)r*   abs)rg   r   uncertainty_scoress      r1   r   EomtLoss.calculate_uncertainty  s      %yy01!!r0   rb   r   r   c           
      h   UR                   S   n[        X4-  5      n[        R                  " XgSUR                  S9n[        XSS9n	U" U	5      n
[        XS-  5      nX;-
  n[        R                  " U
SS2SSS24   USS9S   nU[        R                  " U[        R                  UR                  S	9-  nXSS2S4   -  nUR                  S
S5      UR                  S
5      SS24   R                  XkS5      nUS:  a5  [        R                  " U[        R                  " XlSUR                  S9/SS9nU$ )a  
This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
prediction as input.

Args:
    logits (`float`):
        Logit predictions for P points.
    uncertainty_function:
        A function that takes logit predictions for P points and returns their uncertainties.
    num_points (`int`):
        The number of points P to sample.
    oversample_ratio (`int`):
        Oversampling parameter.
    importance_sample_ratio (`float`):
        Ratio of points that are sampled via importance sampling.

Returns:
    point_coordinates (`torch.Tensor`):
        Coordinates for P sampled points.
r   r6   rm   Fro   Nr   )kr8   r   rD   r8   )rQ   r   r*   rv   rn   r@   topkarangelongviewr   )rg   r   uncertainty_functionrb   r   r   	num_boxesnum_points_sampledr3   r   point_uncertaintiesnum_uncertain_pointsnum_random_pointsr   shifts                  r1   r   (EomtLoss.sample_points_using_uncertainty  s3   < LLO	 !>? "JJyaPVP]P]^#FUS2<@"#:#GH&=jj,Q1W59MSTUVWX"U\\)5::V\VcVc%ddQW~-222q9#((2,/JOOPYqrsq  %		"EJJyQW]WdWd$ef! ! r0   auxiliary_predictionsc                    U R                  XX45      nU R                  XDS   R                  S9n0 U R                  XXg5      EU R	                  X$U5      EnUbj  [        U5       H[  u  pU
S   nU
S   nU R                  XX45      nUR                  5        VVs0 sH  u  pU SU	 3U_M     nnnUR                  U5        M]     U$ s  snnf )a  
This performs the loss computation.

Args:
    masks_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, height, width)`.
    class_queries_logits (`torch.Tensor`):
        A tensor of shape `(batch_size, num_queries, num_labels)`.
    mask_labels (`torch.Tensor`):
        List of mask labels of shape `(labels, height, width)`.
    class_labels (`list[torch.Tensor]`):
        List of class labels of shape `(labels)`.
    auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
        if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
        the inner layers of the EomtMaskedAttentionDecoder.

Returns:
    losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
    - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
    - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
      masks.
    - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
      masks.
    if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
    losses for each auxiliary predictions.
r   rm   r   r   r   )	r   get_num_masksrn   r   r   r   r   itemsupdate)rg   r   r   rj   rk   r  r   r   r   r   aux_outputs	loss_dictkeyvalues                 r1   r   EomtLoss.forward:  s    H ,,3;e&&|O<R<R&S	%
oo2T%
37K%

 !,$-.C$D '23I'J$'23I'J$ LL)=U`o	EN__EVWEVzsuAcU^U2EV	Wi( %E  Xs   Crn   c                 V   [        U Vs/ sH  n[        U5      PM     sn5      n[        R                  " U[        R                  US9nSn[        5       (       a3  [        R                  0 :w  a  [        U5      n[        5       R                  n[        R                  " XE-  SS9nU$ s  snf )z[
Computes the average number of target masks across the batch, for normalization purposes.
r   r   )min)rI   r   r*   r~   r   r   r   _shared_stater   num_processesclamp)rg   rk   rn   classesr   
world_sizes         r1   r  EomtLoss.get_num_masksq  s     \B\'W\BC	OOIU[[P	
"$$))R/"9-	)^99
KK	 6A>	 Cs   B&)r   r   r   r   rb   r   r   r   )r%   r&   r'   r(   r   dictstrr   re   r.   r   r   r   r-   r   nparrayr   r*   r   r   r   r   r   r   r   rn   r  r/   r   r   s   @r1   r   r   I  s   !
z !
S%Z8H !
F$tCy/ d3i -4< -E&RX.DY -" $* :>v, QVWYW_W_Q` 	c6k	 D<#ll< %,,'< rxx	<
 < 
c5<<	 <|2-"ELL "U\\ ""5!5! 	5!
 5! "'5! 
5!z DH5#ll5 $ll5 %,,'	5
 5<<(5  (S%,,->(?@5 
c5<<	 5n%,,  QVQ]Q]  r0   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )EomtPatchEmbeddingsi  z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   kernel_sizestride)rd   re   
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)rg   r   r/  r0  r1  r2  r7  rh   s          r1   re   EomtPatchEmbeddings.__init__  s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir0   pixel_valuesr4   c                     UR                   S   nX R                  :w  a  [        SU R                   SU S35      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r6   )rQ   r1  rf   r9  rF   r   )rg   r;  r1  
embeddingss       r1   r   EomtPatchEmbeddings.forward  sx    #))!,,,,!../yaI  __\2::1=GG1M
r0   )r/  r1  r7  r0  r9  )r%   r&   r'   r(   r)   re   r*   r   r   r/   r   r   s   @r1   r*  r*    s.    jELL U\\  r0   r*  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
EomtEmbeddingsi  zE
Construct the CLS token, mask token, position and patch embeddings.
r   r4   Nc                   > [         TU ]  5         Xl        UR                  U l        [        R
                  " [        R                  " SSUR                  5      5      U l	        [        R
                  " [        R                  " SUR                  UR                  5      5      U l        [        U5      U l        U R                  R                  n[        R                   " UR"                  5      U l        SUR                  -   U l        [        R(                  " X!R                  5      U l        U R-                  S[        R.                  " U5      R1                  S5      SS9  g )Nr   position_ids)r   rD   F)
persistent)rd   re   r   r0  r   	Parameterr*   randnr2  	cls_tokenr   num_register_tokensregister_tokensr*  patch_embeddingsr7  Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsr   r  expand)rg   r   r7  rh   s      r1   re   EomtEmbeddings.__init__  s     ++ekk!Q8J8J&KL!||EKK6;U;UW]WiWi,jk 3F ;++77zz&"<"<=!"V%?%?!?#%<<=O=O#P ^U\\+-F-M-Mg-Vchir0   r;  c                    UR                   u  n    nU R                  R                  R                  R                  nU R                  UR                  US95      nU R                  R                  USS5      nU R                  R                  USS5      nXPR                  U R                  5      -   n[        R                  " XgU/SS9nU R                  U5      nU$ )Nrq   rD   r   r  )rQ   rJ  r9  r   rr   ru   rG  rQ  rI  rP  rC  r*   r   rM  )rg   r;  r   r   target_dtyper>  
cls_tokensrI  s           r1   r   EomtEmbeddings.forward  s    *00
Aq!,,77>>DD**<???+NO
^^**:r2>
..55j"bI":":4;L;L"MM
YY
ZHaP
\\*-
r0   )rG  r   rM  rN  rJ  r0  rP  rI  )r%   r&   r'   r(   r)   r   re   r*   r   r   r/   r   r   s   @r1   rA  rA    s>    jz jd j ELL U\\  r0   rA  modulequeryr  r  attention_maskscalingrM  c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrD   )r8   rr   )ptrainingr   r6   )r*   rG   r   r   r:   rt   float32ru   rr   rM  r^  
contiguous)
rW  rX  r  r  rY  rZ  rM  r>   attn_weightsattn_outputs
             r1   eager_attention_forwardrc    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r0   c            
          ^  \ rS rSrSrU 4S jr S	S\R                  S\\R                     S\	\R                  \\R                     4   4S jjr
SrU =r$ )
EomtAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)rd   re   r   r2  	embed_dimnum_attention_heads	num_headshead_dimrf   scaleattention_dropoutrM  	is_causalr   Lineark_projv_projq_projout_projrg   r   rh   s     r1   re   EomtAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar0   r!   rY  r4   c                 2   UR                   u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R	                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	UU R                  U R                  U R                  (       d  SOU R                  S9u  pUR!                  XEU5      R#                  5       nU R%                  U5      nX4$ )z#Input shape: Batch x Time x Channelr   r6   eager        )rm  rZ  rM  )rQ   rq  ro  rp  r
  ri  rj  r   rc  r   _attn_implementationr   rm  rk  r^  rM  reshaper`  rr  )rg   r!   rY  r>   r   
seq_lengthrg  querieskeysvaluesattention_interfacerb  ra  s                r1   r   EomtAttention.forward  sS    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0((r0   )r   rM  rg  rj  rm  ro  ri  rr  rq  rk  rp  r   )r%   r&   r'   r(   r)   re   r*   r   r   r-   r   r/   r   r   s   @r1   re  re    s[    GB. 26$)||$) !.$)
 
u||Xell33	4$) $)r0   re  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtLayerScalei  r4   c                    > [         TU ]  5         [        R                  " UR                  [
        R                  " UR                  5      -  5      U l        g r   )	rd   re   r   rE  layerscale_valuer*   r   r2  lambda1rs  s     r1   re   EomtLayerScale.__init__  s8    ||F$;$;ejjI[I[>\$\]r0   hidden_statec                     XR                   -  $ r   r  rg   r  s     r1   r   EomtLayerScale.forward!  s    ll**r0   r  r4   N
r%   r&   r'   r(   re   r*   r   r   r/   r   r   s   @r1   r  r    s)    ^+ELL +U\\ + +r0   r  input	drop_probr^  c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
rw  r   r   )r   r   )rQ   ndimr*   rv   rr   rn   floor_div)r  r  r^  	keep_probrQ   random_tensoroutputs          r1   	drop_pathr  %  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr0   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )EomtDropPathi9  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr  r4   c                 .   > [         TU ]  5         Xl        g r   )rd   re   r  )rg   r  rh   s     r1   re   EomtDropPath.__init__<  s    "r0   r!   c                 B    [        XR                  U R                  5      $ r   )r  r  r^  rg   r!   s     r1   r   EomtDropPath.forward@  s    FFr0   c                      SU R                    3$ )Nzp=r  rg   s    r1   
extra_reprEomtDropPath.extra_reprC  s    DNN#$$r0   r  r   )r%   r&   r'   r(   r)   r   r   re   r*   r   r   r&  r  r/   r   r   s   @r1   r  r  9  sQ    b#(5/ #T # #GU\\ Gell G%C % %r0   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtMLPiG  r4   c                 z  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[
        R                  " X$SS9U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [
        R                  " XCSS9U l        g )NTbias)rd   re   r2  r   	mlp_ratior   rn  fc1r3  
hidden_actr&  r
   
activationfc2rg   r   in_featuresout_featureshidden_featuresrh   s        r1   re   EomtMLP.__init__H  s    %+%7%77f0063C3CCD99[Ef''--$V%6%67DO$//DO99_Fr0   r  c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r  s     r1   r   EomtMLP.forwardS  s2    xx-|4xx-r0   )r  r  r  r  r  r   s   @r1   r  r  G  s)    	GELL U\\  r0   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtSwiGLUFFNiZ  r4   c                 $  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[        US-  S-  5      S-   S-  S-  n[
        R                  " USU-  SS9U l        [
        R                  " XCSS9U l        g )Nr6   r	         Tr  )	rd   re   r2  r   r  r   rn  
weights_inweights_outr  s        r1   re   EomtSwiGLUFFN.__init__[  s    %+%7%77f0063C3CCD2Q67!;AAE))K_1D4P99_Nr0   r  c                     U R                  U5      nUR                  SSS9u  p#[        R                  R	                  U5      U-  nU R                  U5      $ )Nr6   rD   r  )r  chunkr   r:   silur  )rg   r  x1x2hiddens        r1   r   EomtSwiGLUFFN.forwardd  sQ    |4##A2#.##B'",''r0   )r  r  r  r  r   s   @r1   r  r  Z  s)    O(ELL (U\\ ( (r0   r  c                      ^  \ rS rSrSrS\SS4U 4S jjr  SS\R                  S\	\R                     S	\
S\\\R                  \R                  4   \\R                     4   4S
 jjrSrU =r$ )	EomtLayerik  zCThis corresponds to the Block class in the original implementation.r   r4   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        U5      U l        [        U5      U l
        UR                  S:  a  [        UR                  5      O[        R                  " 5       U l        [        R                  " UR                  UR
                  S9U l        UR                   (       a  [#        U5      U l        O['        U5      U l        [        U5      U l        g )Nepsrw  )rd   re   r   	LayerNormr2  layer_norm_epsnorm1re  	attentionr  layer_scale1drop_path_rater  Identityr  norm2use_swiglu_ffnr  mlpr  layer_scale2rs  s     r1   re   EomtLayer.__init__n  s    \\&"4"4&:O:OP
&v.*62@F@U@UX[@[f&;&;<acalalan\\&"4"4&:O:OP
  $V,DHvDH*62r0   r!   	head_maskoutput_attentionsc                 >   U R                  U R                  U5      UUS9nUS   nU R                  U5      nUSS  nU R                  U5      U-   nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      U-   nU4U-   nU$ )N)r  r   r   )r  r  r  r  r  r  r  )rg   r!   r  r  self_attention_outputsattention_outputoutputslayer_outputs           r1   r   EomtLayer.forward~  s     "&JJ}%/ "0 "

 2!4,,-=>(, '78=H zz-0xx-((6 ~~l3mC/G+r0   )r  r  r  r  r  r  r  )NF)r%   r&   r'   r(   r)   r   re   r*   r   r   r   r   r-   r   r/   r   r   s   @r1   r  r  k  s    M3z 3d 3& -1"'	|| ELL)  	
 
uU\\5<</0%2EE	F r0   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )EomtLayerNorm2di  c                 "   > [         TU ]  XUS9  g )N)r  elementwise_affine)rd   re   )rg   r1  r  affinerh   s       r1   re   EomtLayerNorm2d.__init__  s    6Jr0   r  r4   c                     UR                  SSSS5      n[        R                  " XR                  U R                  U R
                  U R                  5      nUR                  SSSS5      nU$ )Nr   r6   r	   r   )permuteF
layer_normnormalized_shaper   r  r  r  s     r1   r   EomtLayerNorm2d.forward  s`    #++Aq!Q7||L2G2GVZV_V_aeaiaij#++Aq!Q7r0   r$   )gư>Tr  r   s   @r1   r  r    s)    KELL U\\  r0   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )EomtScaleLayeri  r   c           	         > [         TU ]  5         UR                  n[        R                  " X"SSS9U l        [        UR                     U l        [        R                  " UUSSUSS9U l
        [        U5      U l        g )Nr6   r,  r	   r   F)r-  paddinggroupsr  )rd   re   r2  r   ConvTranspose2dconv1r
   r  r  r8  conv2r  layernorm2drg   r   r2  rh   s      r1   re   EomtScaleLayer.__init__  ss    ((''aXYZ
 !2!23YY

 +;7r0   r!   r4   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r  r  s     r1   r   EomtScaleLayer.forward  sB    

=16

=1((7r0   )r  r  r  r  )r%   r&   r'   r(   r   re   r*   ry   r   r   r/   r   r   s   @r1   r  r    s/    8z 8 U\\ ell  r0   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )EomtScaleBlocki  r   c                    > [         TU ]  5         UR                  U l        [        R
                  " [        U R                  5       Vs/ sH  n[        U5      PM     sn5      U l        g s  snf r   )	rd   re   num_upscale_blocks
num_blocksr   
ModuleListrs   r  blockrg   r   r   rh   s      r1   re   EomtScaleBlock.__init__  sM     33]]E$//DZ#[DZqN6$:DZ#[\
#[s   A)r!   r4   c                 <    U R                    H  nU" U5      nM     U$ r   )r  )rg   r!   r  s      r1   r   EomtScaleBlock.forward  s     ZZE!-0M  r0   )r  r  r%   r&   r'   r(   r   re   r*   r   r   r/   r   r   s   @r1   r  r    s1    ]z ]
U\\ ell  r0   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )EomtMaskHeadi  r   c                   > [         TU ]  5         UR                  n[        R                  " X"5      U l        [        R                  " X"5      U l        [        R                  " X"5      U l        [        UR                     U l
        g r   )rd   re   r2  r   rn  r  r  fc3r
   r  r  r  s      r1   re   EomtMaskHead.__init__  s[    ((99[699[699[6 !2!23r0   r!   r4   c                     U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      nU$ r   r  r  r  r  r  s     r1   r   EomtMaskHead.forward  sD    (?@(?@/r0   r  r   r   s   @r1   r  r    s/    4z 4U\\ ell  r0   r  c                   d    \ rS rSr% Sr\\S'   SrSrSr	S/r
SrSrS	\R                  S
S4S jrSrg)EomtPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r   eomtr;  Fr  TrW  r4   Nc                 D   U R                   R                  n[        U[        R                  [        R
                  [        R                  45      (       a  [        R                  R                  UR                  [        R                  " S5      S9  UR                  by  [        R                  R                  UR                  5      u  p4US:  a  S[        R                  " U5      -  OSn[        R                  R                  UR                  U* U5        g g [        U[        R                  5      (       aJ  UR                  R                   R#                  S5        UR                  R                   R%                  5         g [        U[        R&                  5      (       ad  UR                  R                   R)                  SSS9  UR*                  b2  UR                  R                   UR*                     R%                  5         g g [        U[,        5      (       aL  [/        US5      (       a:  UR0                  R                   R#                  U R                   R2                  5        g g [        U[4        5      (       a  [        R                  R7                  UR8                  R                   R;                  [<        R>                  5      SUS9R;                  UR8                  R@                  5      UR8                  l        URB                  R                   R%                  5         g g )	N   )ar   r   r7   rw  )r   stdr  )"r   initializer_ranger3  r   rn  r8  r  initkaiming_uniform_r   mathsqrtr  _calculate_fan_in_and_fan_outuniform_r  datafill_zero_rO  normal_padding_idxr  hasattrr  r  rA  trunc_normal_rG  ru   r*   r_  rr   rI  )rg   rW  r  fan_inr   bounds         r1   _init_weights!EomtPreTrainedModel._init_weights  s!   kk++fryy"))R5G5GHIIGG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< ' --MM$$S)KK""$--MM&&CQ&7!!-""6#5#56<<> .//vy))##))$++*F*FG *//$&GG$9$9  %%((7cs %: %b!!''( ! ""''--/	 0r0   r$   )r%   r&   r'   r(   r)   r   r,   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attnr   Moduler   r/   r$   r0   r1   r
  r
    sJ    
 $O&+#$N0BII 0$ 0r0   r
  zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                   :  ^  \ rS rSrSrS\SS4U 4S jjrS\S\S	\S
\S\\	\4   S\\	\4   4S jr
S\\	\4   S\4S jr\\     SS\S	\\\      S
\\\      S\\   S\\   S\\\      S\4S jj5       5       rS rS\R                  4S jr\S 5       rSrU =r$ )EomtForUniversalSegmentationi  r;  r   r4   Nc                   > [         TU ]  U5        Xl        UR                  U l        [	        U5      U l        [        R                  " UR                  UR                  S9U l
        [        R                  " UR                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ sH  n[!        U5      PM     sn5      U l        [%        U5      U l        [)        U5      U l        [        R,                  " UR                  UR.                  S-   5      U l        UR2                  UR4                  -  UR2                  UR4                  -  4U l        UR8                  UR:                  UR<                  S.U l        [A        XR>                  S9U l!        U RE                  S[F        RH                  " URJ                  5      5        U RM                  5         g s  snf )Nr  r   )r   r   r   )r   r   attn_mask_probs)'rd   re   r   num_hidden_layersrA  r>  r   r  r2  r  	layernormrO  r   rX  r  rs   r  layersr  upscale_blockr  	mask_headrn  r   class_predictorr/  r0  	grid_sizer   r   r   r   r   rV   r   r*   r   r  	post_initr  s      r1   re   %EomtForUniversalSegmentation.__init__  sr    !'!9!9(0f&8&8f>S>ST\\&"4"4f6H6HI
mmfF^F^@_$`@_1Yv%6@_$`a+F3%f-!yy););V=N=NQR=RS ++v/@/@@&BSBSW]WhWhBhi"("5"5++++.
 "=M=MN.

6;L;L0MN% %as   =G)r   r   rj   rk   r  c                     U R                  UUUUUS9nU R                  R                  5        H)  u  pxUR                  5        H  u  pXy;   d  M  X-  n
M     M+     U$ )Nr   r   rj   rk   r  )rV   r   r  )rg   r   r   rj   rk   r  r  r  r   loss_keyr   s              r1   get_loss_dict*EomtForUniversalSegmentation.get_loss_dict+  sj     (,~~!5!5#%"7 (6 (
	  ++113KC"+//"3?ND #4 4
 r0   r  c                 4    [        UR                  5       5      $ r   )rI   r}  )rg   r  s     r1   get_loss%EomtForUniversalSegmentation.get_lossC  s    9##%&&r0   output_hidden_statesr  r#   c           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnSu  pSnUc  [        S5      eU R	                  U5      n[        U R                  5       GH  u  pU(       a  X|4-  nXR                  U R                   R                  -
  :X  am  U R                  R                  SSS2SS24   R                  UR                  S   SS5      R                  UR                  5      n[        R                   " X4SS9nXR                  U R                   R                  -
  :  Ga  U R"                  (       d7  U R$                  XR                  -
  U R                   R                  -      S:  Ga  U R'                  U5      nU R)                  U5      u  nnU	U4-  n	U
U4-  n
[        R*                  " UR                  S   UR                  S   UR                  S   UR                  [        R,                  S	9n[.        R0                  " UU R2                  S
S9nUR5                  UR7                  S5      UR7                  S5      S5      nU R                   R8                  nUU R                  R:                  -   nUS:  USS2SU2US24'   U R=                  UU R$                  XR                  -
  U R                   R                  -      UUUR                  S9nUSS2SS4   R                  SU R                   R>                  SS5      nURA                  5       RC                  U) S5      nU" XU5      nUS   nU(       d  GM  UUS   4-  nGM     U R'                  U5      nU(       a  UU4-  nU R)                  U5      u  nnU	U4-  n	U
U4-  n
SnUbA  Ub>  Sn[E        X5       H-  u  nnU RG                  UUUUSS9nUU RI                  U5      -  nM/     [K        UUUUUUUS9$ )a(  
mask_labels (`list[torch.Tensor]`, *optional*):
    list of mask labels of shape `(num_labels, height, width)` to be fed to a model
class_labels (`list[torch.LongTensor]`, *optional*):
    list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
    labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
patch_offsets (`list[torch.Tensor]`, *optional*):
    list of tuples indicating the image index and start and end positions of patches for semantic segementation.
Nr$   )r$   r$   z You have to specify pixel_valuesr   rD   r   r  )rn   rr   bilinear)sizemode)probnum_query_tokensencoder_start_tokensrn   .g    erw  r7  )r   r   r   r    r!   r"   r#   )&r   r>  r  rf   r>  r   r/  r-  r  rX  r   rQ  rQ   ru   rn   r*   r   r^  r,  r.  predictr   r   r  interpolater3  r
  rA  r   rN  _disable_attention_maskrh  r   masked_fillr   r9  r<  r   )rg   r;  rj   rk   r>  r  r#   all_hidden_statesall_attentionsmasks_queries_logits_per_layerclass_queries_logits_per_layerrY  r!   r   layer_modulerX  norm_hidden_statesr   r   interpolated_logitsrD  rE  layer_outputssequence_outputr   r  s                             r1   r   $EomtForUniversalSegmentation.forwardF  s9   * %9$D $++JjJj 	 2C1N-TXT_T_TqTq"6BD0dIOF&?@@5!*4;;!7C#!%55!,,t{{/E/EEE

))$1*5<<]=P=PQR=SUWY[\__`m`t`tu %		5*@a H,,t{{/E/EEE!5!5c<R<R6RUYU`U`UkUk6k!lop!p%)^^M%B"=A\\J\=]:$&:.3G2II..3G2II.!&!''*!''*!''*(//**" '(mm4Ht~~dn&o#&9&>&>',,Q/1D1I1I!1Lb'# $(;;#:#: '7$//:[:['[$ ObdeNeq"3#3"35I5JJK "&!=!="--c4J4J.JT[[McMc.cd%5)=)00 "> " "04!=!D!DRIhIhjlnp!q!/!5!5!7!C!C^OUY!Z(HYZM)!,M  =#3"55k "8n ..7/!3359\\/5R22&+?*AA&&+?*AA&"|'?D>A.?:$&: !..)=)= +!-*. / 	 i00? 2!5!5-+%'
 	
r0   c                 .    U R                   R                  $ r   )r>  rJ  r  s    r1   get_input_embeddings1EomtForUniversalSegmentation.get_input_embeddings  s    ///r0   r   c                    US S 2S U R                   R                  2S S 24   nU R                  U5      nUS S 2U R                   R                  U R                  R                  -   S 2S S 24   nUR                  SS5      nUR                  " UR                  S   S/U R                  Q76 nU R                  U5      nU R                  U5      n[        R                  " SX$5      nXS4$ )Nr   r6   r   rD   zbqc, bchw -> bqhw)r   r   r2  r>  rN  r   ry  rQ   r3  r1  r0  r*   einsum)rg   r   query_tokensclass_logitsprefix_tokensmask_logitss         r1   rF  $EomtForUniversalSegmentation.predict  s    a!:4;;#:#:!:A=>++L9q$++"9"9DOO<]<]"]"_abbc%//15%--m.A.A!.DbZ4>>Z~~l3**=9ll#6T((r0   c                 ~    US:  a6  [         R                  " U R                  S   X$S9U:  nSU S S 2S U2US 24   U'   U $ )Nr   r   rm   )r*   rv   rQ   )	attn_maskrC  rD  rE  rn   random_queriess         r1   rH  4EomtForUniversalSegmentation._disable_attention_mask  sT    !8"ZZ	(:<L\_ccN VWIa***,@,AAB>Rr0   )r2  r   rV   r>  r3  r.  r/  r1  r-  rX  r0  r   )NNNNN)r%   r&   r'   r(   r#  r   re   r   r%  r&  r9  r<  r   r   r   r.   r   r   r   rU  r*   rF  staticmethodrH  r/   r   r   s   @r1   r*  r*    sQ    %Oz d 8$ % 	
   $CK0 
c6k	0'$sF{"3 ' '  /3/3/3,004x
x
 d6l+x
 tF|,	x

 'tnx
 $D>x
  V-x
 
,x
  x
t0)ell )   r0   r*  )F)rw  )rw  F)Fcollections.abcr4  r  dataclassesr   typingr   r   r   numpyr'  r*   torch.nn.functionalr   r:   r  r   activationsr
   
file_utilsr   r   r   modeling_layersr   modeling_utilsr   r   utilsr   r   r   configuration_eomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r   r@   rL   r[   r(  r]   r   r   r   r   r*  rA  r   rc  re  r  r   r  r  r  r  r  r  r  r  r  r  r
  r*  __all__r$   r0   r1   <module>rr     s  ,   ! , ,      ! L L 9 F N N * 4'' 	7 7	 7B LQLL5:\\
\\@  6 , u|| X]XdXd 8g299 gTf f   <u|| U\\ VY ^c^j^j (uryy up	")) B"RYY "X %II%<<% 
% <<	%
 U\\*% % %.;)BII ;)|+RYY +U\\ e T V[VbVb (%299 %bii &(BII ("0* 0fbll RYY 2	RYY 	299 " $0/ $0 $0N 
R#6 R
Rj !"@
Ar0   