
"""PyTorch KOSMOS-2 model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
from ...utils.deprecation import deprecate_kwarg
from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig


logger = logging.get_logger(__name__)

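# Illustrative sketch (not part of the original file): the two mask helpers defined below
# produce additive attention masks for the text decoder. Assuming a float32 model and a
# padded batch, they are typically combined as follows, with positions to ignore holding
# torch.finfo(torch.float32).min:
#
#     causal = _make_causal_mask((batch_size, seq_len), torch.float32, device=device)
#     padding = _expand_mask(attention_mask, torch.float32, tgt_len=seq_len)
#     combined = causal + padding
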
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor
        padding_idx: int
        past_key_values_length: int

    Returns: torch.Tensor
    """
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class Kosmos2ModelOutput(ModelOutput):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )

@dataclass
@auto_docstring(
    custom_intro="""
    Model output class for `Kosmos2ForConditionalGeneration`.
    """
)
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )

dej                  dej                  fd	Z
 xZS )Kosmos2VisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   r;   
persistent)super__init__r~   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr,   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr@   r)   rf   r~   	__class__s     r4   r   z Kosmos2VisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr6   
embeddingsheightwidthr[   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
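
        Sketch of the steps performed below (summary added here, not part of the original docstring):
        split off the class-token position embedding, reshape the patch position embeddings into a
        square grid, resize that grid with bicubic interpolation to the new `height // patch_size`
        by `width // patch_size` resolution, then flatten back to a sequence and re-attach the
        class-token embedding.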
        r   r   Nr;   g      ?r	   r   bicubicF)r(   modealign_cornersr=   )shaper   weight	unsqueezer,   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolaterB   rC   )rf   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr>   
new_height	new_widthsqrt_num_positionss                r4   interpolate_pos_encodingz0Kosmos2VisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr6   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().r$   r   r   r;   r=   )r   r   
ValueErrorr   r   r$   r*   flatten	transposer   r)   r,   rC   r   r   r   )rf   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r4   forwardzKosmos2VisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr6   F)ro   rp   rq   r"   r   r,   TensorrI   r   rs   r   __classcell__r   s   @r4   r}   r}      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r6   r}   modulequerykeyvalueattention_maskscalingdropoutc                 p   t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |d      }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr;   r=   ptrainingr   r   )	r,   matmulr   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r4   eager_attention_forwardr   	  s     <<s}}R'<=GL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r6   c                        e Zd ZdZ fdZ	 	 	 d	dej                  deej                     deej                     dee   de	ej                  eej                     f   f
dZ
 xZS )
Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)r   r   r~   r   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r4   r   zKosmos2VisionAttention.__init__"  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar6   rV   r   causal_attention_maskoutput_attentionsr[   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
| j                  j                  dk7  r||||z   }n||}n	|du| _
        t        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
|| j                  | j                  | j                  sdn| j                        \  }}|j!                  |||      j#                         }| j%                  |      }|sd}||fS )#Input shape: Batch x Time x Channelr   r   flash_attention_2Neager        )r   r   r   )r   r   r   r   rB   r   r   r   r~   _attn_implementationr   r   r   r   r   r   r   r   r   )rf   rV   r   r   r   r   
seq_lengthr   queriesrl   valuesattention_interfacer   r   s                 r4   r   zKosmos2VisionAttention.forward6  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/2G!G&2!62$>DN(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r6   )NNF)ro   rp   rq   rr   r   r,   r   r   r-   rk   r   r   r   s   @r4   r   r     s}    GB. 268<,1/)||/) !./)  (5	/)
 $D>/) 
u||Xell33	4/)r6   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Kosmos2VisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)r   r   r~   r
   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r4   r   zKosmos2VisionMLP.__init__j  sd    #F$5$5699V//1I1IJ99V55v7I7IJr6   rV   r[   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   rf   rV   s     r4   r   zKosmos2VisionMLP.forwardq  s4    /**=9/r6   )ro   rp   rq   r   r,   r   r   r   r   s   @r4   r   r   i  s$    KU\\ ell r6   r   c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
Kosmos2VisionEncoderLayerr~   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r4   r   z"Kosmos2VisionEncoderLayer.__init__z  sm    ++/7<<F<Q<QR#F+<<F<Q<QRr6   rV   r   r   r   r[   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rV   r   r   r   )r	  r  r  r
  )rf   rV   r   r   r   residualr   outputss           r4   r   z!Kosmos2VisionEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr6   r   )ro   rp   rq   r"   r   r,   r   r   r-   rk   rs   r   r   r   s   @r4   r  r  y  sg    S2 S -2&||& &  %||	&
 $D>& 
u  	!&r6   r  c                        e Zd ZdZdef fdZe	 	 	 	 	 ddeej                     deej                     dee
   dee
   dee
   d	eeef   fd
       Z xZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    r~   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r~   r   
ModuleListrangenum_hidden_layersr  layersgradient_checkpointing)rf   r~   r   r   s      r4   r   zKosmos2VisionEncoder.__init__  sW    mmPUV\VnVnPo$pPo1%>v%FPo$pq&+# %qs   A#r   r   r   output_hidden_statesreturn_dictr[   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|||      }|d   }	|s"||d   fz   }, |r||	fz   }t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nru   )r   r   r   )rT   rV   rW   )r~   r   r  use_return_dict	enumerater  r   )rf   inputs_embedsr   r   r   r  r  encoder_statesall_attentionsrV   idxencoder_layerlayer_outputss                r4   r   zKosmos2VisionEncoder.forward  s    N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/=2B!B)%"3	M *!,M !/=3C2E!E #9  +}.>>N+>Vd
 	
r6   )NNNNN)ro   rp   rq   rr   r"   r   r   r   r,   r   r-   r   rk   r   r   r   r   s   @r4   r  r    s    ,2 ,  268<,0/3&*D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
r6   r  c                        e Zd Zdef fdZ	 	 	 	 	 d
deej                     dee   dee   dedee   de	e
ef   fd	Z xZS )Kosmos2VisionTransformerr~   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r  )r   r   r~   r   r}   r   r   r  r  pre_layrnormr  encoderpost_layernorm)rf   r~   r   r   s      r4   r   z!Kosmos2VisionTransformer.__init__  sj    &&	1&9LL8M8MN+F3 ll9&:O:OPr6   r   r   r  r   r  r[   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r   )r  r   r  r  r   r   )rT   pooler_outputrV   rW   )r~   r   r  r  r   r   r%  r&  r'  r   rV   rW   )
rf   r   r   r  r   r  rV   encoder_outputsrT   pooled_outputs
             r4   r   z Kosmos2VisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r6   NNNFN)ro   rp   rq   r"   r   r   r,   rs   r-   r   rk   r   r   r   r   s   @r4   r#  r#    s    Q2 Q 59,0/3).&*'
u001'
 $D>'
 'tn	'

 #''
 d^'
 
u00	1'
r6   r#  c                       e Zd ZdZddededee   f fdZddededee   fdZeddededee   fd       Z	 e
j                         	 	 	 	 dd	ee
j                     d
ee
j                     dedee
j                     fd       Zd Z xZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.r   embedding_dimrN   c                     t         |           d| _        || _        || _        | j                  || j                  z   ||       y )Nr   )r   r   offsetr/  rN   make_weights)rf   r   r/  rN   r   s       r4   r   z1Kosmos2TextSinusoidalPositionalEmbedding.__init__?  s@    *&-$++5}kRr6   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )Nweightsr<   Fr   )get_embeddinghasattrr*   r5  r$   r8   r   )rf   r3  r/  rN   emb_weightss        r4   r2  z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsG  s[    ((T4#%..t||/A/A$,,J]J].^KYFr6   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r   i'  r   r   r   r=   r;   N)mathlogr,   expr@   int64floatr   rC   sincosrB   rD   r*   get_default_dtype)r3  r/  rN   half_dimembs        r4   r6  z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingO  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r6   rM   r  r9   r   c                 v   |F|j                         \  }}|[t        || j                  |      j                  |j                        }n*|j                         d d \  }}|| j                  ||      }| j                  dz   |z   |z   }|| j                  j                  d      kD  r4| j                  || j                  z   | j                  | j                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr;   r   r   )r(   rP   rN   r*   r8   &create_position_ids_from_inputs_embedsr5  r2  r1  r/  index_selectrB   r   detach)rf   rM   r  r9   r   r0   seq_lenmax_poss           r4   r   z0Kosmos2TextSinusoidalPositionalEmbedding.forwarde  s'     $>>+LC#At//1G "Y%%&  )--/4LC##JJ=Zpq ""Q&03IIT\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvr6   c                 0   |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      j                         |z   S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr;   r   r<   r   )	r(   r,   r@   rN   rL   r8   r   r)   r   )rf   r  r9   input_shapesequence_lengthr   s         r4   rE  zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds  s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<GGILbbbr6   r   )NNr   N)ro   rp   rq   rr   rI   r   r   r2  staticmethodr6  r,   no_gradr   r   rE  r   r   s   @r4   r.  r.  ;  s    NSc S# SHUXM SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1( U]]_ -104&'/3wELL)w  -w !$	w
 u||,w w6cr6   r.  c                   x    e Zd ZdZ	 	 	 	 	 ddedededee   dee   dee   dee   f fd	Z e	d
dd      	 	 	 	 	 	 dde
j                  dee
j                     dee   dee
j                     dee
j                     dedee
j                     dee
j                  ee
j                     ee   f   fd       Z xZS )KosmosTextAttentionr   r   r   r   
is_decoderadd_inner_attn_layernormr   	layer_idxc	                 j   t         	|           || _        || _        || _        || _        ||z  | _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _        || _	        || _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        d | _        |r't        j$                  ||j&                        | _        y y )Nr   r   r   r   )r   r  )r   r   r~   r   r   r   r   r   r   rQ  rS  r   r   r   r   r   r   inner_attn_lnr  r  )
rf   r~   r   r   r   rQ  rR  r   rS  r   s
            r4   r   zKosmosTextAttention.__init__  s    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTB "#!#iV=R=R!SD $r6   past_key_valuerU   4.58new_nameversionrV   encoder_hidden_statesr   layer_head_maskr   cache_positionr[   c                 l   |du}	|j                   dd \  }
}| j                  |      }|j                  |
|| j                  | j                        j                  dd      }d}|St        |t              rA|j                  j                  | j                        }|	r|j                  }n|j                  }n|}|	r|n|}|	rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |
d| j                  | j                        j                  dd      }|j                  |
d| j                  | j                        j                  dd      }|T|	s|nd}j%                  ||| j                  d|i      \  }}|	r)t        |t              rd|j                  | j                  <   t&        }| j(                  j*                  dk7  rt,        | j(                  j*                     } || ||||f| j.                  sd	n| j0                  | j2                  d
|\  }}|j5                  |
|d      j7                         }| j8                  | j9                  |      }| j;                  |      }||fS )r   Nr   r   Fr;   r]  Tr   r   )r   r   )r   r   rB   r   r   r   
isinstancer   
is_updatedgetrS  cross_attention_cacheself_attention_cacher  rl   r   r   r   updater   r~   r   r   r   r   r   r   r   rU  r   )rf   rV   r[  rU   r   r\  r   r]  r   is_cross_attentionr   r   query_statesr`  curr_past_key_valuecurrent_states
key_statesvalue_statesr   r   r   s                        r4   r   zKosmosTextAttention.forward  s     3$>!.!4!4Ra!8
J{{=1#((ZQUQ^Q^_iijkmno
&/+>?,77;;DNNK
%*9*O*O'*9*N*N'&5#2D.-/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#RWaabcefgJ',,ZT^^T]][eefgijkL*7It+>+E+Ednn?OQ_>`,(
L &*_FY*ZAEO..t~~>(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$,,LL	%
 	%
!\ "))*j"EPPR),,[9KmmK0L((r6   )r   FFTN)NNNNFN)ro   rp   rq   rr   rI   r>  r   r-   r   r   r,   r   r   rk   r   r   r   s   @r4   rP  rP    sP   G %*38#$(#T #T 	#T
 #T TN#T #+4.#T tn#T D>#TJ %0A6R 9=+/1526"'15L)||L)  (5L) "%	L)
 !.L) "%,,/L)  L) !.L) 
u||Xell3Xe_D	EL) SL)r6   rP  c                   *     e Zd Zdef fdZd Z xZS )Kosmos2TextFFNr~   c                    t         |           |j                  | _        t        |j                     | _        |j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y r  )r   r   r   r
   activation_functionr   activation_dropoutr   r   r   ffn_dimr   r   r  r  ffn_layernormr   s     r4   r   zKosmos2TextFFN.__init__  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STr6   c                 b   | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )	r   r   r   r   r   ro  r   rq  r   r   s     r4   r   zKosmos2TextFFN.forward  s    **488M+BC--mt?V?Vaeanan-o**=9/--mt||VZVcVc-dr6   )ro   rp   rq   r!   r   r   r   r   s   @r4   rl  rl    s    
U0 
Ur6   rl  c                       e Zd Zddef fdZ eddd      	 	 	 	 	 	 	 	 	 ddej                  deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee
   dee
   deej                     deej                  eeej                  ej                  f      f   fd       Z xZS )Kosmos2TextBlockr~   c           	         t         |           |j                  | _        t        || j                  |j                  |j
                  dd|      | _        |j                  | _        t        j                  | j                  |j                        | _        |j                  ret        || j                  |j                  |j
                  dd|      | _        t        j                  | j                  |j                        | _        t        |      | _        t        j                  | j                  |j                        | _        y )NT)r   r   r   rQ  rR  rS  r  F)r   r   r   rP  attention_headsr   r  r   r   r  r  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrl  ffnfinal_layer_norm)rf   r~   rS  r   s      r4   r   zKosmos2TextBlock.__init__$  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).#!D ,.<<FLaLa+bD(!&) "T^^AVAV Wr6   rV  rU   rW  rX  rV   r   r[  encoder_attention_maskr\  cross_attn_layer_head_maskr   	use_cacher]  r[   c                 X   |}| j                  |      } | j                  d||||||
d|\  }}t        j                  j	                  || j                  | j
                        }||z   }d }|t        | d      st        d|  d      |}| j                  |      } | j                  d|||||||
d|\  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  |      }||z   }|f}|r|||fz  }|S )N)rV   rU   r   r\  r   r]  r   ry  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rV   r[  r   r\  rU   r   r]  ru   )rw  r  r   r   r   r   r7  r   rz  ry  r|  r{  )rf   rV   r   r[  r}  r\  r~  rU   r   r  r]  r   r  self_attn_weightscross_attn_weightsr  s                   r4   r   zKosmos2TextBlock.forwardC  s    !11-@+94>> ,
'+)+/),
 ,
(( --mt||VZVcVc-d =0 " ,40 =dV DD D 
 %H 88GM0A0A0A 	1+&;5 : /"3-	1 	1-M- MM11-4<<Z^ZgZg1hM$}4M !--m< / =0 ")+=>>Gr6   r   )	NNNNNNFTN)ro   rp   rq   r!   r   r   r,   r   r   r   r-   rk   rs   r   r   r   s   @r4   rt  rt  #  s9   X0 X> %0A6R 268<9=26=A+/,1$(15C||C !.C  (5	C
 !) 6C "%,,/C %-U\\$:C "%C $D>C D>C !.C 
u  (51B1BEDUDU1U+V"WW	XC SCr6   rt  c            '       |    e Zd ZdZdef fdZd Z	 	 	 	 	 ddeej                     deej                     deej                     de
d	eej                     f
d
Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     deej                     dee   deej                     d	eej                     dee   dee   dee   dee   deej                     dee   deeef   f$dZ xZS )Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
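
    Note (added summary, not in the original docstring): `forward_embedding` scales the token
    embeddings by `sqrt(embed_dim)` when `config.scale_embedding` is set, scatters the projected
    image features into the positions flagged by `image_embeds_position_mask`, then adds the
    sinusoidal position embeddings and applies dropout before the decoder layers run.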
    r~   c           	         t         |           || _        |j                  | _        |j                  | _        |j
                  rt        j                  |j                        nd| _	        t        j                  |j                  |j                  |j                        | _        t        |j                   |j                  |j                        | _        t        j$                  t'        |j(                        D cg c]  }t+        ||       c}      | _        t        j,                  |j                  |j.                        | _        d| _        y c c}w )Nr'   )rN   )r   r/  rN   )rS  F)r   r   r~   r   	layerdropscale_embeddingr:  sqrtr   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr.  max_position_embeddingsembed_positionsr  r  r  rt  r  r  
layer_normr  )rf   r~   ir   s      r4   r   zKosmos2TextTransformer.__init__  s   ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmTYZ`ZgZgTh$iThq%5f%JTh$ij,,v'7'79N9NO&+# %js   =Ec                     d }|d   dkD  r#t        ||j                  |j                  |      }|=t        ||j                  |d         j	                  |j                        }||n||z   }|S )Nr;   r   )r8   r9   r%   )rF   r$   r8   r5   r*   )rf   r   rK  r  r9   combined_attention_maskexpanded_attn_masks          r4   _prepare_decoder_attention_maskz6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-nm>Q>Q[fgi[j!k!n!n$$" '>&E"K]`wKw $ '&r6   r  rX   img_input_maskr9   r   c                    || j                  |      }|[|j                  |j                        j                  d|j	                  d            ||j                  t
        j                        <   || j                  z  }| j                  ||||      }|j                  |j                        }||z   }t        j                  j                  || j                  | j                        }|S )Nr;   r   )rM   r  r9   r   r   )r  r*   r8   rB   r(   r,   r-   r  r  r   r   r   r   )	rf   rM   r  rX   r  r9   r   	positionsrV   s	            r4   forward_embeddingz(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM.++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%	1--mt||VZVcVc-dr6   rM   r   image_embeds_position_maskr[  r}  	head_maskcross_attn_head_maskrU   r  r   r  r  r]  r   r[   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||
t	        d      |"|j
                  }|j                  d|d         }n!|
|
j                         d d }nt	        d      | j                  r%| j                  r|rt        j                  d       d}|rN|	L|4t        t        | j                         t        | j                               nt        | j                         }	|r:t        |	t              r*t        j                  d       t        j                   |	      }	|	|	j#                         nd}|dkD  rd }d }| j%                  ||
||||	      }| j'                  ||||      }||t)        ||
j*                  |d   
      }t,        j.                  j1                  || j0                  | j                        }|rdnd }|rdnd }|r|dnd }t3        ||gddg      D ]j  \  }}|	|j                         d   t5        | j6                        k7  s3t	        d| dt5        | j6                         d|j                         d    d       t9        | j6                        D ]|  \  }}|r||fz  }| j                  r%t;        j<                  g       }|| j>                  k  r? ||||f||||   nd |||   nd |	|||d|}|d   }|sh||d   fz  }|t||d   fz  }~ | jA                  |      }|r||fz  }tC        ||	|||      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer;   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r~   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )rM   r  rX   r  r9   r   r  r   ru   r  r  zThe `z` should be specified for z layers, but it is for .)r}  r\  r~  rU   r   r  r]  r   r   )rT   rU   rV   rW   cross_attentions)"r~   r   r  r  r   r   rB   r(   r  r   loggerwarning_oncer   r   r_  rk   from_legacy_cacheget_seq_lengthr  r  r5   r$   r   r   r   ziplenr  r  r,   randr  r  r   )rf   rM   r   rX   r  r[  r}  r  r  rU   r  r   r  r   r  r  r]  r   rK  r9   rV   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer  decoder_layerdropout_probabilityr!  s                                 r4   r   zKosmos2TextTransformer.forward  s   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	 ]%>cdd"#//K!r;r?;I&',,.s3KTUU&&4==##p "	0 )4 $L$DlZ^ZeZeFfg!5 
 OU;\
 2CCOTOETE`!?!?!Afg "A%L)-&..'%5#9% / 
 ==K8N

 !,1G1S%12H-J]J]grsugv%w"--mt||VZVcVc-d #7BD0d&7<Q<]rdh %(4H(IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#!m%55!}}&+jjn#&7)% (>3<3H3dI]Ii,@,Eos /"3#- M *!,M =#3"55(4(]1-=,??(9 #9> 6  -!118+++%1
 	
r6   )NNNr   NNNNNNNNNNNNNNNNN)ro   rp   rq   rr   r!   r   r  r   r,   r   rI   r  r   r-   r   r   r   rk   r   r   r   r   s   @r4   r  r    s   ,0 ,('4 15/315&'/3!  -! u||,	!
 !.! !$! u||,!J -115/3=A8<9=,07;+/04/3$(,0/3&*15#M
ELL)M
 !.M
 u||,	M

 %-U\\$:M
  (5M
 !) 6M
 ELL)M
 'u||4M
 "%M
  -M
 u||,M
 D>M
 $D>M
 'tnM
  d^!M
" !.#M
$ -.%M
& 
u??	@'M
r6   r  c                   P    e Zd ZU eed<   dZddgZdZdZdZ	de
j                  fdZy)Kosmos2PreTrainedModelr~   Tr  rt  r   c                    t        | t              r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        | t        t        f      r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        |t              rt        j                  j                  |j                  d|j                   dz  z         t        j                  j                  |j"                  j$                  |j                  j&                  |z         t        j                  j                  |j(                  j$                  |j                  j&                  |z         nt        |t*              r|j                   dz  d|j                  j,                  z  dz  z  z  }|j                   dz  |z  }t        j                  j                  |j.                  j$                  |       t        j                  j                  |j0                  j$                  |       t        j                  j                  |j2                  j$                  |       t        j                  j                  |j4                  j$                  |       nt        |t6              r|j                  j8                  dz  d|j                  j,                  z  dz  z  z  }d|j                  j8                  z  dz  |z  }t        j                  j                  |j:                  j$                  |       t        j                  j                  |j<                  j$                  |       nt        |t>              rt        j                  j                  |j.                  j$                         t        j                  j                  |j0                  j$                  |       t        j                  j                  |j2                  j$                  |       t        j                  j                  |j4                  j$                  |       n3t        |t@              rlt        j                  j                  |j:                  j$                         t        j                  j                  |j<                  j$                  |       nt        |t              r7t        j                  j                  |jB                  j$                         npt        |tD              r`t        j                  j                  |jF                  j$                         t        j                  j                  |jH                         n t        |tJ              r|jL                  j$                  jN                  j                  d       |jL                  jP                  |jL                  j$                  jN                  |jL                  jP                     jS                          nct        |t        jT                        rI|j$                  jN                  jW                  d       |jX                  jN                  jS                          t        |t        jZ                        r2|jX                  %|jX                  jN                  jS                          yyy)zInitialize the weightsr   r   )meanstd)r  r   Nr'   ).r_  Kosmos2VisionModelr~   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_std
text_configr}   r   initnormal_r   r   r   r   initializer_ranger   r   r  r   r   r   r   r   r   r   r   rP  rl  lm_headKosmos2ImageToTextProjectiondenselatent_queryr  r  datarN   zero_r  fill_r   r   )rf   r   factorr  in_proj_stdout_proj_stdfc_stds          r4   _init_weightsz$Kosmos2PreTrainedModel._init_weightsy  s   d./[[33F|-LMN[[..AAFd-/EFG++&&C|-LMN++))22Cf56GGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 67!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE 01!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O? 34GGOOFMM00cO:GGOOFMM00cO:GGOOFMM00cO:GGOOFOO22O</GGOOFJJ--3O7GGOOFJJ--3O7 67GGOOFNN11sO; <=GGOOFLL//SO9GGOOF//0 67&&++33#3F""..:##**//0C0C0O0OPVVX-MM$$S)KK""$fbii(V[[-DKK""$ .E(r6   N)ro   rp   rq   r    rt   supports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpar   Moduler  ru   r6   r4   r  r  p  s;    &*#46HI"&N2%BII 2%r6   r  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     dee   dee   ded	ee   deeef   fd
       Z xZS )r  r~   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r#  model	post_initr   s     r4   r   zKosmos2VisionModel.__init__  s&     -f5
r6   r[   c                 B    | j                   j                  j                  S r   )r  r   r   rm   s    r4   get_input_embeddingsz'Kosmos2VisionModel.get_input_embeddings  s    zz$$444r6   r   r  r   r  c                 .    | j                  |||||      S )N)r   r   r  r   r  r  )rf   r   r   r  r   r  s         r4   r   zKosmos2VisionModel.forward  s)     zz%/!5%=#  
 	
r6   r,  )ro   rp   rq   r"   rt   main_input_namer   r   r  r  r   r   r,   rs   r-   r   rk   r   r   r   r   s   @r4   r  r    s    $O2 5bii 5  59,0/3).&*
u001
 $D>
 'tn	

 #'
 d^
 
u00	1
 
r6   r  c            )       "    e Zd ZU eed<   def fdZdej                  fdZe	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     dee   deej                     deej                     dee   dee   dee   dee   deej                     dee   deeef   f$d              Z xZS )r  r~   c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r  r  r  r   s     r4   r   zKosmos2TextModel.__init__  s&     +F3
r6   r[   c                 .    | j                   j                  S r   r  r  rm   s    r4   r  z%Kosmos2TextModel.get_input_embeddings      zz&&&r6   rM   r   rX   r  r[  r}  r  r  rU   r  r   r  r   r  r  r]  r   c                      | j                   di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d||S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
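            As an illustration (assuming the processor's default of 64 image placeholder positions inserted right
            after the beginning-of-sequence and image-boundary tokens), a single row of the mask looks roughly like
            `[0, 0, 1, 1, ..., 1, 0, 0, ...]`: 64 ones over the slots reserved for image features and zeros wherever
            a text token sits.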
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )


@auto_docstring(
    custom_intro="""
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
    config: Kosmos2TextConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)
        self.model = Kosmos2TextTransformer(config)
        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
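            As an illustrative recipe (assuming `input_ids` and `attention_mask` come from `Kosmos2Processor`), labels
            can be derived by copying `input_ids` and masking out positions that should not contribute to the loss,
            e.g. `labels = input_ids.masked_fill(attention_mask == 0, -100)`.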
        """
        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )
        lm_logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        image_embeds=None,
        image_embeds_position_mask=None,
        past_key_values=None,
        attention_mask=None,
        use_cache=None,
        cache_position=None,
        **model_kwargs,
    ):
        # Overwritten -- `image_embeds_position_mask` has to stay in sync with the growing `input_ids`

        # In the cached decoding stage the image features have already been consumed, so neither the image embeddings
        # nor their position mask are forwarded again.
        if cache_position[0] != 0:
            image_embeds = None
            image_embeds_position_mask = None
        elif image_embeds_position_mask is not None:
            # appending `False` to `image_embeds_position_mask` (because `input_ids` grows during generation)
            batch_size, seq_len = input_ids.size()
            mask_len = image_embeds_position_mask.size()[-1]
            image_embeds_position_mask = torch.cat(
                (
                    image_embeds_position_mask,
                    torch.zeros(size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=input_ids.device),
                ),
                dim=1,
            )

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            use_cache=use_cache,
            cache_position=cache_position,
            **model_kwargs,
        )
        # position ids are not needed here; the text transformer derives them from `input_ids`/the attention mask
        model_inputs.pop("position_ids", None)

        return model_inputs


class Kosmos2ImageToTextProjection(nn.Module):
    """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""

    def __init__(self, config: Kosmos2Config):
        super().__init__()
        self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)
        self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))

        self.x_attn = KosmosTextAttention(
            config.text_config,
            config.text_config.embed_dim,
            config.text_config.attention_heads,
            dropout=config.text_config.attention_dropout,
            is_decoder=False,
            add_inner_attn_layernorm=False,
        )

    def forward(self, features):
        hidden_states = self.dense(features)

        # shape = [batch, latent_query_num, h_dim]
        latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
        key_value_states = torch.cat([hidden_states, latent_query], dim=1)

        hidden_states, attn_weights = self.x_attn(
            hidden_states=latent_query,
            encoder_hidden_states=key_value_states,
            past_key_values=None,
            attention_mask=None,
            output_attentions=None,
        )

        return hidden_states, attn_weights


@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    """
)
class Kosmos2Model(Kosmos2PreTrainedModel):
    config: Kosmos2Config
    main_input_name = "pixel_values"

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)
        self.text_model = Kosmos2TextModel(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        return_attentions: Optional[bool] = False,
        interpolate_pos_encoding: Optional[bool] = False,
    ):
        r"""
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        """
        vision_model_output = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`
        image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
        # normalized features
        image_embeds = nn.functional.normalize(image_embeds, dim=-1)
        image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        if return_attentions:
            return image_embeds, projection_attentions
        return image_embeds

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Kosmos2ModelOutput]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
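        >>> # Illustrative reading of the sequence length: the 91 positions are the text/special tokens plus the 64
        >>> # projected image slots that the processor reserves (the exact split depends on the prompt).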
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
            image_embeds, projection_attentions = self.get_image_features(
                pixel_values,
                return_attentions=True,
                interpolate_pos_encoding=interpolate_pos_encoding,
            )

        outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ModelOutput(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )


@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    """
)
class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
    config: Kosmos2Config
    main_input_name = "pixel_values"
    _tied_weights_keys = ["text_model.lm_head.weight"]

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)
        self.text_model = Kosmos2TextForCausalLM(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.text_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_model.set_output_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Kosmos2ForConditionalGenerationModelOutput]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
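
        >>> # The boxes are normalized `(x1, y1, x2, y2)` coordinates. Scaling by the image size (an illustrative
        >>> # post-processing step, not part of the original example) converts them to pixel boxes.
        >>> pixel_boxes = [
        ...     [(x1 * image.width, y1 * image.height, x2 * image.width, y2 * image.height) for x1, y1, x2, y2 in boxes]
        ...     for _, _, boxes in entities
        ... ]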
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")

            vision_model_output = self.vision_model(pixel_values=pixel_values)
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        lm_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ForConditionalGenerationModelOutput(
            loss=lm_outputs.loss,
            logits=lm_outputs.logits,
            past_key_values=lm_outputs.past_key_values,
            hidden_states=lm_outputs.hidden_states,
            attentions=lm_outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        # in order to allow `inputs` argument (as in `GenerationMixin`)
        inputs = kwargs.pop("inputs", None)
        if pixel_values is not None and inputs is not None:
            raise ValueError(
                f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed."
                f"Make sure to either pass `inputs` or pixel_values=..."
            )
        if pixel_values is None and inputs is not None:
            pixel_values = inputs

        if image_embeds is None:
            vision_model_output = self.vision_model(pixel_values)
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        output = self.text_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            **kwargs,
        )

        return output


__all__ = ["Kosmos2ForConditionalGeneration", "Kosmos2Model", "Kosmos2PreTrainedModel"]
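
# Illustrative usage sketch (kept as comments so nothing runs on import): precompute the projected image features once
# with `Kosmos2Model.get_image_features` and reuse them for several text prompts. The checkpoint id and the
# `AutoProcessor` call mirror the doctest examples above; `image` is a PIL image as in those examples. Treat this as
# an assumption-laden sketch, not a tested recipe.
#
#   from transformers import AutoProcessor, Kosmos2Model
#
#   model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
#   processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
#   inputs = processor(text="<grounding> An image of", images=image, return_tensors="pt")
#
#   # (batch_size, latent_query_num, embed_dim) -- already projected into the text embedding space
#   image_embeds = model.get_image_features(inputs["pixel_values"])
#
#   outputs = model(
#       input_ids=inputs["input_ids"],
#       attention_mask=inputs["attention_mask"],
#       image_embeds=image_embeds,
#       image_embeds_position_mask=inputs["image_embeds_position_mask"],
#   )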