
"""PyTorch KOSMOS-2 model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    ModelOutput,
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    logging,
    torch_int,
)
from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig


logger = logging.get_logger(__name__)


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # Position indices are created only for non-padding tokens; padding positions stay at `padding_idx`.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
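
# Illustrative only: a tiny, self-contained sketch of how the three helpers above interact.
# `_demo_padding_aware_positions` is a hypothetical name used for this example and is not part of
# the original Kosmos-2 module or its public API.
def _demo_padding_aware_positions():
    # A single sequence of length 4 whose last token is padding.
    attention_mask = torch.tensor([[1, 1, 1, 0]])
    input_ids = torch.tensor([[5, 6, 7, 1]])  # token id 1 is assumed to be the padding id here

    # (1, 1, 4, 4) additive mask: 0 where attention is allowed, a large negative value elsewhere.
    expanded = _expand_mask(attention_mask, torch.float32)
    # (1, 1, 4, 4) upper-triangular additive mask used for causal decoding.
    causal = _make_causal_mask((1, 4), torch.float32, device=attention_mask.device)
    # Non-padding tokens count up from padding_idx + 1; padding tokens keep padding_idx.
    position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=1)

    return expanded.shape, causal.shape, position_ids  # position_ids == tensor([[2, 3, 4, 1]])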
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class Kosmos2ModelOutput(ModelOutput):
    r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
    `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
    encoder_sequence_length, embed_size_per_head)`.

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
    the weighted average in the self-attention heads.
vision_model_output (`BaseModelOutputWithPooling`, *optional*):
    The output of the [`Kosmos2VisionModel`].
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
@auto_docstring(
    custom_intro="""
    Model output class for `Kosmos2ForConditionalGeneration`.
    """
)
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
    `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
    encoder_sequence_length, embed_size_per_head)`.

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
    the weighted average in the self-attention heads.
vision_model_output (`BaseModelOutputWithPooling`, *optional*):
    The output of the [`Kosmos2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Kosmos2VisionEmbeddings(nn.Module):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Kosmos2VisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # The vision encoder may receive both an `attention_mask` and a `causal_attention_mask`
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class Kosmos2VisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Kosmos2VisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Kosmos2VisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Kosmos2VisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class Kosmos2VisionEncoder(nn.Module):
    """
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Kosmos2VisionEncoderLayer`].

Args:
        config: Kosmos2VisionConfig
    """

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Kosmos2VisionTransformer(nn.Module):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = Kosmos2VisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = Kosmos2VisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Kosmos2TextSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the existing buffer
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last dimension for odd embedding sizes
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.Tensor] = None,
    ):
        if input_ids is not None:
            bsz, seq_len = input_ids.size()
            if position_ids is None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                ).to(input_ids.device)
        else:
            bsz, seq_len = inputs_embeds.size()[:-1]
            if position_ids is None:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)

        # expand the embedding table if needed
        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
        """
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
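
# Illustrative only: a minimal sketch of how the sinusoidal embedding table above behaves.
# `_demo_sinusoidal_positions` is a hypothetical name used for this example and is not part of the
# original Kosmos-2 module; it assumes only the class defined directly above.
def _demo_sinusoidal_positions():
    embed_positions = Kosmos2TextSinusoidalPositionalEmbedding(num_positions=8, embedding_dim=6, padding_idx=1)
    input_ids = torch.tensor([[5, 6, 7, 1]])  # the trailing token is padding (id == padding_idx)
    # One 6-dim sinusoidal vector per position; the padded position maps to the all-zero padding row.
    positions = embed_positions(input_ids=input_ids)
    return positions.shape  # -> torch.Size([1, 4, 6])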
class KosmosTextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        add_inner_attn_layernorm: bool = False,
        bias: bool = True,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.config = config
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        self.inner_attn_ln = None
        if add_inner_attn_layernorm:
            self.inner_attn_ln = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
        """Input shape: Batch x Time x Channel"""

        # if encoder_hidden_states are provided, this layer is used as a cross-attention layer for the decoder
        is_cross_attention = encoder_hidden_states is not None
        batch_size, seq_length = hidden_states.shape[:2]

        query_states = self.q_proj(hidden_states)
        query_states = query_states.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        is_updated = False
        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse cached cross-attention key/value states
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value states to the cache for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # mark the cross-attention cache of this layer as filled so it can be re-used in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()

        if self.inner_attn_ln is not None:
            attn_output = self.inner_attn_ln(attn_output)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class Kosmos2TextFFN(nn.Module):
    def __init__(self, config: Kosmos2TextConfig):
        super().__init__()

        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.fc1 = nn.Linear(config.embed_dim, config.ffn_dim)
        self.fc2 = nn.Linear(config.ffn_dim, config.embed_dim)

        self.ffn_layernorm = nn.LayerNorm(config.ffn_dim, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.ffn_layernorm(hidden_states)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        return hidden_states


class Kosmos2TextBlock(GradientCheckpointingLayer):
    def __init__(self, config: Kosmos2TextConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.embed_dim = config.embed_dim

        self.self_attn = KosmosTextAttention(
            config,
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            add_inner_attn_layernorm=True,
            layer_idx=layer_idx,
        )
        self.dropout = config.dropout
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

        if config.add_cross_attention:
            self.encoder_attn = KosmosTextAttention(
                config,
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
                add_inner_attn_layernorm=False,
                layer_idx=layer_idx,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

        self.ffn = Kosmos2TextFFN(config)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        # Self Attention
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            if not hasattr(self, "encoder_attn"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention"
                    " layers by setting `config.add_cross_attention=True`"
                )

            residual = hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
                **kwargs,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.ffn(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs
class Kosmos2TextTransformer(nn.Module):
    """
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

Args:
        config: Kosmos2TextConfig
    """

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__()
        self.config = config
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop

        self.embed_scale = math.sqrt(config.embed_dim) if config.scale_embedding else 1.0
        self.embed_tokens = nn.Embedding(config.vocab_size, config.embed_dim, padding_idx=config.pad_token_id)

        self.embed_positions = Kosmos2TextSinusoidalPositionalEmbedding(
            num_positions=config.max_position_embeddings,
            embedding_dim=config.embed_dim,
            padding_idx=config.pad_token_id,
        )

        self.layers = nn.ModuleList([Kosmos2TextBlock(config, layer_idx=i) for i in range(config.layers)])
        self.layer_norm = nn.LayerNorm(config.embed_dim, config.layer_norm_eps)
        self.gradient_checkpointing = False

    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        # create causal mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
                inputs_embeds.device
            )
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward_embedding(
        self,
        input_ids,
        inputs_embeds: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        img_input_mask: Optional[torch.Tensor] = None,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.Tensor] = None,
    ):
        # The argument `inputs_embeds` should be the one without being multiplied by `self.embed_scale`.
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if image_embeds is not None:
            inputs_embeds[img_input_mask.to(dtype=torch.bool)] = image_embeds.to(inputs_embeds.device).view(
                -1, image_embeds.size(-1)
            )

        inputs_embeds = inputs_embeds * self.embed_scale

        # embed positions
        positions = self.embed_positions(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
            position_ids=position_ids,
        )
        positions = positions.to(inputs_embeds.device)

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        return hidden_states

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        # We don't need image info when `past_key_values_length` > 0 (decoding steps after the first one)
        if past_key_values_length > 0:
            image_embeds = None
            image_embeds_position_mask = None

        hidden_states = self.forward_embedding(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            img_input_mask=image_embeds_position_mask,
            past_key_values_length=past_key_values_length,
            position_ids=position_ids,
        )

        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, hidden_states, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _expand_mask(encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1])

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None and attn_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {attn_mask.size()[0]}."
                )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add final layer norm
        hidden_states = self.layer_norm(hidden_states)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring
class Kosmos2PreTrainedModel(PreTrainedModel):
    config: Kosmos2Config
    supports_gradient_checkpointing = True
    _no_split_modules = ["Kosmos2VisionEncoderLayer", "Kosmos2TextBlock"]
    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True

    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        if isinstance(self, Kosmos2VisionModel):
            factor = self.config.initializer_factor
        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
            factor = self.config.vision_config.initializer_factor
        if isinstance(self, (Kosmos2TextModel, Kosmos2TextForCausalLM)):
            std = self.config.init_std
        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
            std = self.config.text_config.init_std

        if isinstance(module, Kosmos2VisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, Kosmos2VisionAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, Kosmos2VisionMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, KosmosTextAttention):
            nn.init.normal_(module.q_proj.weight, std=std)
            nn.init.normal_(module.k_proj.weight, std=std)
            nn.init.normal_(module.v_proj.weight, std=std)
            nn.init.normal_(module.out_proj.weight, std=std)
        elif isinstance(module, Kosmos2TextFFN):
            nn.init.normal_(module.fc1.weight, std=std)
            nn.init.normal_(module.fc2.weight, std=std)
        elif isinstance(module, Kosmos2TextForCausalLM):
            nn.init.normal_(module.lm_head.weight, std=std)
        elif isinstance(module, Kosmos2ImageToTextProjection):
            nn.init.normal_(module.dense.weight, std=std)
            nn.init.normal_(module.latent_query, std=std)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class Kosmos2VisionModel(Kosmos2PreTrainedModel):
    config: Kosmos2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__(config)
        self.model = Kosmos2VisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        return self.model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


class Kosmos2TextModel(Kosmos2PreTrainedModel):
    config: Kosmos2TextConfig

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)
        self.model = Kosmos2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
rK   r   rW   r  rt  r  r  r  rT   r1  r   r  r   r,  r-  rw  rp   r   )re   rK   r   rW   r  rt  r  r  r  rT   r1  r   r  r   r,  r-  rw  r   s                     r2   r   Kosmos2TextModel.forward  s    J zz 

)
 &
 (B	

 #8
 $:
  
 "6
 ,
 (
 &
  
 0
 "6
 $
  *#
 	
r4   r   r  )rq   rr   rs   rt   r   rw   r   r   r  r  r   r   r   r*   r   r  rv   r+   r   r   r   rk   r   r   rx   r   r   s   @r2   r  r    s   0 'bii '  -115/3=A8<9=,07;=A04/3$(,0/3&*15#5
ELL)5
 !.5
 u||,	5

 %-U\\$:5
  (55
 !) 65
 ELL)5
 'u||45
 "$u'8'8"9:5
  -5
 u||,5
 D>5
 $D>5
 'tn5
  d^!5
" !.#5
$ -.%5
& 
u??	@'5
  5
r4   r  z
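# Illustrative sketch (not part of the original modeling code): builds a toy
# `image_embeds_position_mask` with the layout described in the docstring above, i.e. True/1 at the
# positions where the projected image features are inserted and False/0 for ordinary text tokens.
# The concrete sizes and the "image slots right after the first token" placement are assumptions
# made for this example only; in practice `Kosmos2Processor` builds this mask for you.
def _example_image_embeds_position_mask(num_image_tokens: int = 64, num_text_tokens: int = 16) -> torch.Tensor:
    seq_len = 1 + num_image_tokens + num_text_tokens
    mask = torch.zeros(1, seq_len, dtype=torch.bool)
    mask[0, 1 : 1 + num_image_tokens] = True  # image feature slots; everything else stays text
    return mask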
@auto_docstring(
    custom_intro="""
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
    config: Kosmos2TextConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)
        self.model = Kosmos2TextTransformer(config)
        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )
        lm_logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        image_embeds=None,
        image_embeds_position_mask=None,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        use_cache=None,
        cache_position=None,
        **model_kwargs,
    ):
        # Overwritten -- Kosmos-2 has to keep `image_embeds_position_mask` in sync with the growing sequence

        if cache_position is not None and cache_position[0] != 0:
            # the image information is already encoded into the past key/values, only new text tokens follow
            image_embeds = None
            image_embeds_position_mask = None
        elif image_embeds_position_mask is not None:
            # append `False` to `image_embeds_position_mask` (because `input_ids` grows during generation)
            batch_size, seq_len = inputs_embeds.size()[:-1] if inputs_embeds is not None else input_ids.size()
            mask_len = image_embeds_position_mask.size()[-1]
            image_embeds_position_mask = torch.cat(
                (
                    image_embeds_position_mask,
                    torch.zeros(size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=input_ids.device),
                ),
                dim=1,
            )

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **model_kwargs,
        )

        return model_inputs


class Kosmos2ImageToTextProjection(nn.Module):
    """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""

    def __init__(self, config: Kosmos2Config):
        super().__init__()
        self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)
        self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))

        self.x_attn = Kosmos2TextAttention(
            config.text_config,
            config.text_config.embed_dim,
            config.text_config.attention_heads,
            dropout=config.text_config.attention_dropout,
            is_decoder=False,
            add_inner_attn_layernorm=False,
        )

    def forward(self, features):
        hidden_states = self.dense(features)

        # shape = [batch, latent_query_num, h_dim]
        latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
        key_value_states = torch.cat([hidden_states, latent_query], dim=1)

        hidden_states, attn_weights = self.x_attn(
            hidden_states=latent_query,
            encoder_hidden_states=key_value_states,
            past_key_value=None,
            attention_mask=None,
            output_attentions=None,
        )

        return hidden_states, attn_weights
@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    """
)
class Kosmos2Model(Kosmos2PreTrainedModel):
    config: Kosmos2Config
    main_input_name = "pixel_values"

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)
        self.text_model = Kosmos2TextModel(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        return_attentions: Optional[bool] = False,
        interpolate_pos_encoding: Optional[bool] = False,
    ):
        r"""
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        """
        vision_model_output = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
        image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
        # normalized features
        image_embeds = nn.functional.normalize(image_embeds, dim=-1)
        image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        if return_attentions:
            return image_embeds, projection_attentions
        return image_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, Kosmos2ModelOutput]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
            image_embeds, projection_attentions = self.get_image_features(
                pixel_values,
                return_attentions=True,
                interpolate_pos_encoding=interpolate_pos_encoding,
            )

        outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ModelOutput(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )
@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    """
)
class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
    config: Kosmos2Config
    main_input_name = "pixel_values"
    _tied_weights_keys = ["text_model.lm_head.weight"]

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)
        self.text_model = Kosmos2TextForCausalLM(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.text_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_model.set_output_embeddings(new_embeddings)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Kosmos2ForConditionalGenerationModelOutput]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")

            vision_model_output = self.vision_model(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        lm_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ForConditionalGenerationModelOutput(
            loss=lm_outputs.loss,
            logits=lm_outputs.logits,
            past_key_values=lm_outputs.past_key_values,
            hidden_states=lm_outputs.hidden_states,
            attentions=lm_outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )

    def generate(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        # in order to allow `inputs` argument (as in `GenerationMixin`)
        inputs = kwargs.pop("inputs", None)
        if pixel_values is not None and inputs is not None:
            raise ValueError(
                f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed."
                "Make sure to either pass `inputs` or pixel_values=..."
            )
        if pixel_values is None and inputs is not None:
            pixel_values = inputs

        if image_embeds is None:
            vision_model_output = self.vision_model(pixel_values)
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        output = self.text_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            **kwargs,
        )

        return output


__all__ = ["Kosmos2ForConditionalGeneration", "Kosmos2Model", "Kosmos2PreTrainedModel"]
   cache_utilsr   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   configuration_kosmos2r   r   r    
get_loggerrq   r   r   r"   rG   r3   Sizer6   rD   rN   rQ   rz   r  r   rX  r   r   r  r  r$  r9  rF  rm  r  r  r  r  r  r  r  r  r  r  __all__rp   r4   r2   <module>rk     s$     ! 1 1    ! 5 ) B 9  G & j j X X 
		H	%[u|| [EKK [(3- [ jk\ZZ\(-\=B\\\cf\$4  
#
 #
 #
L 
(
 (
 (
XPbii Pv %II%<<% 
% <<	%
 U\\*% % %,L)RYY L)`ryy  / : /fT
299 T
p3
ryy 3
nUcryy Ucpz)")) z)zRYY .c1 cLb
RYY b
J :%_ :% :%z
/ 
BC
- C
L S3_ SSl +299  +F 
V
) V

V
r z&<o zzz Xr4   