
    h                     ,   d dl Z d dlmZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$ e ed       G d de                    Z%e ed       G d de                    Z& ed       G d dejN                               Z( G d dejN                        Z) G d d ejN                        Z*	 dDd!ejN                  d"ejV                  d#ejV                  d$ejV                  d%eejV                     d&e,d'e,fd(Z- G d) d*ejN                        Z. G d+ d,ejN                        Z/ G d- d.ejN                        Z0 G d/ d0e      Z1 G d1 d2ejN                        Z2 G d3 d4ejN                        Z3 G d5 d6ejh                        Z5 G d7 d8e      Z6d9ejV                  d:e7fd;Z8 G d< d=e6      Z9 ed>       G d? d@e6             Z:e G dA dBe6e             Z;g dCZ<y)E    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple   )	AutoModel   )Ovis2ConfigOvis2VisionConfigzJ
    Base class for Llava outputs, with hidden states and attentions.
    )custom_introc                   :    e Zd ZU dZdZeej                     ed<   y)Ovis2ModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__     g/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/models/ovis2/modeling_ovis2.pyr   r   *   s    	 8<%"3"34;r(   r   zQ
    Base class for Ovis2 causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
ee   ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	Ovis2CausalLMOutputWithPastaA  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r    r!   r"   r#   r,   r   r$   r%   r&   r-   r.   r
   r/   tupler0   r   r'   r(   r)   r+   r+   ?   s     )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju001297;%"3"34;r(   r+   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Ovis2RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        Ovis2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr$   onesweightvariance_epsilon)selfhidden_sizeeps	__class__s      r)   r7   zOvis2RMSNorm.__init___   s1     	ll5::k#:; #r(   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   Tkeepdim)	dtypetor$   float32powmeanrsqrtr;   r:   )r<   r/   input_dtypevariances       r)   forwardzOvis2RMSNorm.forwardg   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r(   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r1   r:   shaper;   r<   s    r)   
extra_reprzOvis2RMSNorm.extra_reprn   s*    ))*+6$2G2G1HIIr(   )gư>)r    r!   r"   r7   rL   rP   __classcell__r?   s   @r)   r4   r4   ]   s    $;Jr(   r4   c                   $     e Zd Z fdZd Z xZS )Ovis2VisionMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y Nbiasr6   r7   configr=   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr<   rZ   r?   s     r)   r7   zOvis2VisionMLP.__init__s       !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r(   c                     | j                  | j                  | j                  |            | j                  |      z        }|S Nr`   rb   r^   r_   r<   xr`   s      r)   rL   zOvis2VisionMLP.forward}   6    NN4;;t~~a/@#ADLLQRO#ST	r(   r    r!   r"   r7   rL   rQ   rR   s   @r)   rT   rT   r       0r(   rT   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZ	S )Ovis2VisionEmbeddingsrZ   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  | j                  | j                  | j                  d      | _
        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        | j                  dt!        j"                  | j                        j%                  d      d       t'        |j                  |j(                        | _        y )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   rA   F)
persistent)r6   r7   rZ   r=   	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr$   arangeexpandr4   rms_norm_epsrms_normrc   s     r)   r7   zOvis2VisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jop$V%7%79L9LMr(   pixel_valuesreturnc                 (   | j                   j                  j                  }| j                  |j                  |            }|j	                  d      j                  dd      }| j                  |      }|| j                  | j                        z   }|S )NrD   r   r   )	r}   r:   rD   rE   flatten	transposer   r   rv   )r<   r   target_dtypepatch_embeds
embeddingss        r)   rL   zOvis2VisionEmbeddings.forward   s    ++2288++LOO,O,OP!))!,66q!<
]]:.
$"9"9$:K:K"LL
r(   )
r    r!   r"   r   r7   r$   r%   TensorrL   rQ   rR   s   @r)   rn   rn      s/    N0 N*E$5$5 %,, r(   rn   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrA   )dimrD   )ptrainingr   r   )r$   matmulr   r   
functionalsoftmaxrF   rE   rD   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r)   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r(   c            
            e Zd ZdZ fdZ	 ddej                  deej                     deej                  eej                     f   fdZ	 xZ
S )Ovis2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                 x   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        y Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      FrW   r6   r7   rZ   r=   rx   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutr   	is_causalr   r\   qkv_biask_projv_projq_projout_projrc   s     r)   r7   zOvis2VisionAttention.__init__   2   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr(   r/   r   r   c           
      :   |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|| j                  | j                  | j                  sdn| j                        \  }}|j!                  |||      j#                         }| j%                  |      }||fS z#Input shape: Batch x Time x Channelr   r   eager        )r   r   r   rN   r   r   r   viewr   r   r   r   rZ   _attn_implementationr   r   r   r   r   reshaper   r   r<   r/   r   r   
batch_size
seq_lengthrx   querieskeysvaluesattention_interfacer   r   s                r)   rL   zOvis2VisionAttention.forward   a    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r(   rf   r    r!   r"   r#   r7   r$   r   r   r1   rL   rQ   rR   s   @r)   r   r      V    GX, 26$)||$) !.$)
 
u||Xell33	4$)r(   r   c                   $     e Zd Z fdZd Z xZS )Ovis2MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y rV   rY   rc   s     r)   r7   zOvis2MLP.__init__   rd   r(   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rf   rg   rh   s      r)   rL   zOvis2MLP.forward  rj   r(   rk   rR   s   @r)   r   r      rl   r(   r   c            
            e Zd ZdZ fdZ	 ddej                  deej                     deej                  eej                     f   fdZ	 xZ
S )Ovis2Attentionr   c                 x   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        y r   r   rc   s     r)   r7   zOvis2Attention.__init__
  r   r(   r/   r   r   c           
      :   |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|| j                  | j                  | j                  sdn| j                        \  }}|j!                  |||      j#                         }| j%                  |      }||fS r   r   r   s                r)   rL   zOvis2Attention.forward  r   r(   rf   r   rR   s   @r)   r   r     r   r(   r   c            	            e Zd Zdef fdZ	 ddej                  deej                     dee	   dej                  fdZ
 xZS )	Ovis2VisionEncoderLayerrZ   c                     t         |           t        |      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y rf   )r6   r7   r   	attentionr   ffnr4   r=   r   	rms_norm1	rms_norm2rc   s     r)   r7   z Ovis2VisionEncoderLayer.__init__E  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr(   r/   r   r   r   c                     | j                  |      } | j                  d||d|\  }}||z   }| j                  |      }| j                  |      }||z   }|S )N)r/   r   r'   )r   r   r   r   )r<   r/   r   r   norm_hidden_statesr   _
mlp_outputs           r)   rL   zOvis2VisionEncoderLayer.forwardL  sl     "^^M:'r6HYgrkqrQ%3!^^M:XX01
%
2r(   rf   )r    r!   r"   r   r7   r$   r   r   r   r   rL   rQ   rR   s   @r)   r   r   D  sY    O0 O 26|| !. +,	
 
r(   r   c            	       t     e Zd ZdZdef fdZee	 ddee	j                     dee   defd              Z xZS )	Ovis2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Ovis2VisionEncoderLayer`].

    Args:
        config: Ovis2VisionConfig
    rZ   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r6   r7   rZ   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)r<   rZ   r   r?   s      r)   r7   zOvis2VisionEncoder.__init__f  sV    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A#r   r   r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )Nlast_hidden_state)r   r   )r<   inputs_embedsr   r   r/   encoder_layers         r)   rL   zOvis2VisionEncoder.forwardm  s5     &![[M)-R6RM ) ??r(   rf   )r    r!   r"   r#   r   r7   r   r   r   r$   r   r   r   r   rL   rQ   rR   s   @r)   r   r   ]  sh    ,0 ,  26
@ !.
@ +,	
@
 

@  
@r(   r   c                   X     e Zd Zdef fdZe	 ddeej                     fd       Z	 xZ
S )Ovis2VisionTransformerrZ   c                     t         |           || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        d| _        y r   )r6   r7   rZ   rn   r   r   encoderr4   r=   r   r   r   rc   s     r)   r7   zOvis2VisionTransformer.__init__}  sO    /7)&1$V%7%79L9LM&+#r(   r   c                     | j                  |      } | j                  d||d|}|j                  }| j                  |      }t	        |      S )N)r   r   r   r'   )r   r   r   r   r   )r<   r   r   r   r/   encoder_outputsr   s          r)   rL   zOvis2VisionTransformer.forward  sa     5+74<< ,
'),
 ,
 ,== MM*;<1BCCr(   rf   )r    r!   r"   r   r7   r   r   r$   r   rL   rQ   rR   s   @r)   r   r   |  s?    ,0 ,  26D !.D Dr(   r   c                   P     e Zd Zdej                  dej                  f fdZ xZS )Ovis2VisualEmbeddingTablevisual_tokensr   c                    |j                   t        j                  t        j                  t        j                  t        j
                  t        j                  fv rt        | !  |      S t        j                  || j                        S rf   )rD   r$   int8int16int32int64longr6   rL   r   r:   )r<   r   r?   s     r)   rL   z!Ovis2VisualEmbeddingTable.forward  sW    5::u{{EKKV[V`V`"aa7?=11||M4;;77r(   )r    r!   r"   r$   r   rL   rQ   rR   s   @r)   r   r     s#    8U\\ 8ell 8 8r(   r   c                   B    e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZdZdZy)Ovis2PreTrainedModelrZ   modelTr   r.   N)r    r!   r"   r   r&   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr'   r(   r)   r   r     sF    &*#/0"3 N!"&r(   r   r-   r   c                     | j                  |      }|j                  |d      d   }t        j                  | t        j                        j                  ||d      }||j                         z
  |z   }|S )NTrB   r   )memory_formatg      ?)r   maxr$   
zeros_likelegacy_contiguous_formatscatter_detach)r-   r   y_softindexy_hardrets         r)   hard_softmaxr    sk    ^^C FJJsDJ)!,EfE4R4RS\\]`bgilmF
6==?
"V
+CJr(   c                        e Zd ZU eed<   def fdZdej                  deej                  ej                  f   fdZ
 xZS )Ovis2VisionModelrZ   c                    t         |   |       || _        t        |      | _        |j
                  | _        |j                  | _        t        j                  |j                  |j                  z  |j                  z  | j                  | j
                  z
  d      | _        t        j                  | j                  | j
                  z
        | _        y NFrW   )r6   r7   rZ   r   transformernum_visual_indicator_tokens
vocab_sizer   r\   r=   hidden_stridehead_linear	LayerNorm	head_normrc   s     r)   r7   zOvis2VisionModel.__init__  s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr(   r   r   c           	          | j                   |fi |}|d   }| j                  j                  dkD  r|j                  \  }}}| j                  j                  }t	        t        j                  |            }	|	|	z  |k7  rt        d      ||	|z  z
  |z  }
t        j                  j                  |ddd|
d|
fdd      }|	|
z  }	|j                  ||	|z  ||	|z  ||      }|j                  dddddd      }|j                  |d	||z  |z        }| j                  |      }| j                  |      }| j                  j                  d
k(  r$t        j                  j!                  |d	d      }|S | j                  j                  dk(  rt#        |d	      }|S | j                  j                  dk(  r!t        j                  j%                  |d	      }S )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         rA   gumbel_argmaxT)r   hard	st_argmaxr   r   )r  rZ   r  rN   intmathsqrtr   r   r   padr   permuter  r  tokenize_functiongumbel_softmaxr  r   )r<   r   r   outputsr   
num_imagesseq_len
hidden_dimr  sqrt_lpad_sizer-   
prob_tokens                r)   rL   zOvis2VisionModel.forward  s   "$""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uhF 1 9 9Fm3]FmD[]jlv! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ  [[**k9%f"5J  [[**i7..v2.>Jr(   )r    r!   r"   r   r&   r7   r$   r%   r1   r   rL   rQ   rR   s   @r)   r  r    sF    Z0 Z!E$5$5 !E%,,X]XdXdJdDe !r(   r  zu
    The Ovis2 model which consists of a vision backbone and a language model, without a language modeling head.
    c            !       4    e Zd Zi Zdef fdZd Zd Zd Zd Z	de
j                  de
j                  fd	Zd
e
j                  de
j                  de
j                  fdZee	 	 	 	 	 	 	 	 	 	 	 	 	 dd
ee
j                     dee
j                     dee
j$                     dee
j                     dee   dee
j                     dee
j                     dee   dee   dee   dee   dee
j                     deee
j$                  f   deeef   fd              Z xZS )
Ovis2ModelrZ   c                    t         |   |       t        |j                        | _        t        j                  |j                        | _        t        |j                  j                  |j                        | _        |j                  j                  | _        |j                  | _
        |j                  | _        | j                          y rf   )r6   r7   r  vision_configvision_towerr   from_configtext_configlanguage_modelr   r  r=   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_ids	post_initrc   s     r)   r7   zOvis2Model.__init__  s     ,V-A-AB'33F4F4FG'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K'r(   c                 6    | j                   j                         S rf   )r;  get_input_embeddingsrO   s    r)   rA  zOvis2Model.get_input_embeddings  s    ""7799r(   c                 :    | j                   j                  |       y rf   )r;  set_input_embeddingsr<   r   s     r)   rC  zOvis2Model.set_input_embeddings  s    007r(   c                     || _         y rf   r;  r<   decoders     r)   set_decoderzOvis2Model.set_decoder  s
    %r(   c                     | j                   S rf   rF  rO   s    r)   get_decoderzOvis2Model.get_decoder
  s    """r(   r   r   c                 4   | j                  |      }|j                  \  }}}t        j                  ||| j                   j                  f|j
                  |j                  d|j                        }t        j                  ||gd      }| j                  |      }t        j                  | j                  | j                   j                  z
  | j                  t        j                        j                  |j                        }| j                  |      }||fS )a  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`, *optional*):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`, *optional*):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        F)rD   devicerequires_gradlayoutr   r%  r   )r8  rN   r$   zerosr  rD   rM  rO  catr<  r   r=  r   rE   )	r<   r   image_featuresr   img_seq_lenr   padding_tensorvisual_indicatorvisual_indicator_featuress	            r)   get_image_featureszOvis2Model.get_image_features  s    ( **<8%3%9%9"
Kd&7&7&S&ST &&!((!((
 NN#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 %)$@$@AQ$R!888r(   	input_idsr   rR  c                 P   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }||   j                         |j                         k7  rt        d| d|       |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        rD   rM  rA   r   r   z6Image features and image tokens do not match: tokens: z, features )rA  r$   tensorrZ   image_token_idr   rM  allsum	unsqueeze	expand_asrE   rN   numelr   )r<   rX  r   rR  special_image_maskn_image_tokensn_image_featuress          r)   get_placeholder_maskzOvis2Model.get_placeholder_mask6  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL+,2248L8L8NNHHXXcdtcuv  "!r(   r   rv   r.   labels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionlogits_to_keepc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
|d u |d uz  rt        d      | | j	                         |      }| | j                  |      \  }}| j                  |||      }|j                  ||      }t        | j                        D ]  \  }}|Y| | j	                         t        j                  |t        j                  |j                              k(  }|j                  d      }n||k(  j                  |j                        }|j!                         s||   j#                  ||         j                  |j                  |j$                        ||<     | j&                  d	||||||	|
d||d
|}t)        |j*                  |j,                  |j.                  |j0                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )r   rR  rZ  rA   T)
r   rv   r.   r   rg  rh  ri  rj  rk  rl  )r   r.   r/   r0   r   r'   )rZ   rh  ri  r   rA  rW  re  masked_scatter	enumerater>  r$   r[  r   rM  r]  rE   anyr`  rD   r;  r   r   r.   r/   r0   )r<   rX  r   r   rv   r.   r   rf  rg  rh  ri  rj  rk  rl  r   rR  rV  rb  ivisual_indicator_idmaskr-  s                         r)   rL   zOvis2Model.forwardN  s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<YZZ 7D557	BM#8<8O8O]i8O8j5N5!%!:!:+- "; "
 *889K^\M*3D4S4S*T&&$(,GD,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88:1!4"=#67M00-2E2EF "$' +U  &$%% 
)%+'/!5))
 
 (%77#33!//))2>2J
 	

 QU
 	
r(   NNNNNNNNNNNNr   )r    r!   r"   _checkpoint_conversion_mappingr   r7   rA  rC  rI  rK  r$   r%   rW  
LongTensorre  r   r   r   r   r
   boolr   r&  r1   r   rL   rQ   rR   s   @r)   r5  r5    s    &("	{ 	:8&#'9'''9 
		'9R"))":?:K:K"]b]n]n"0  15481537+/59-1$(,0/3&*5934J
E,,-J
 u001J
 !.	J

 u//0J
 "%J
   1 12J
 ))*J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
 c5<</0J
  
u..	/!J
  J
r(   r5  c            !       :    e Zd Zi ZdgZdef fdZd Zd Zde	j                  fdZd Zd	 Zd
ej                  fdZed        Zed        Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej0                     d
eej                     deej2                     deej0                     dee   deej                     deej0                     dee   dee   dee   dee   deej0                     deeej2                  f   deeef   fd              Z 	 	 	 	 	 	 d fd	Z! xZ"S )Ovis2ForConditionalGenerationzlm_head.weightrZ   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y r  )
r6   r7   r5  r   r   r\   r=   r  lm_headr?  rc   s     r)   r7   z&Ovis2ForConditionalGeneration.__init__  sF     '
yy!3!3V5F5FUSr(   c                 6    | j                   j                         S rf   )r   rA  rO   s    r)   rA  z2Ovis2ForConditionalGeneration.get_input_embeddings  s    zz..00r(   c                 :    | j                   j                  |       y rf   )r   rC  rD  s     r)   rC  z2Ovis2ForConditionalGeneration.set_input_embeddings  s    

''.r(   r   c                     | j                   S rf   )r|  rO   s    r)   get_output_embeddingsz3Ovis2ForConditionalGeneration.get_output_embeddings  s    ||r(   c                 :    | j                   j                  |       y rf   )r   rI  rG  s     r)   rI  z)Ovis2ForConditionalGeneration.set_decoder  s    

w'r(   c                 6    | j                   j                         S rf   )r   rK  rO   s    r)   rK  z)Ovis2ForConditionalGeneration.get_decoder  s    zz%%''r(   r   c                 :    | j                   j                  |      S )Nrn  )r   rW  )r<   r   s     r)   rW  z0Ovis2ForConditionalGeneration.get_image_features  s    zz,,,,GGr(   c                 .    | j                   j                  S rf   )r   r;  rO   s    r)   r;  z,Ovis2ForConditionalGeneration.language_model  s    zz(((r(   c                 .    | j                   j                  S rf   )r   r8  rO   s    r)   r8  z*Ovis2ForConditionalGeneration.vision_tower  s    zz&&&r(   c                     t        d      )NzNot needed for Ovis2)AttributeErrorrO   s    r)   multi_modal_projectorz3Ovis2ForConditionalGeneration.multi_modal_projector  s    344r(   rX  r   rv   r.   r   rf  rg  rh  ri  rj  rk  rl  c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
 | j                  d||||||||	|
d|d|}|d   }t	        |t
              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```NT)rX  r   r   rv   r.   r   rg  rh  ri  rj  rk  r   )r-   rf  r  )r,   r-   r.   r/   r0   r   r'   )rZ   rh  ri  r   
isinstancer&  slicer|  loss_functionr:  r  r+   r.   r/   r0   r   )r<   rX  r   r   rv   r.   r   rf  rg  rh  ri  rj  rk  rl  r   r-  r/   slice_indicesr-   r,   s                       r)   rL   z%Ovis2ForConditionalGeneration.forward  s7   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)%+'/!5)
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r(   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r.   r   r   rk  rl  r   r   )r6   prepare_inputs_for_generation)r<   rX  r.   r   r   r   rk  rl  r   model_inputsr?   s             r)   r  z;Ovis2ForConditionalGeneration.prepare_inputs_for_generation  sV     w<
+')))
 
 !! ,8L(r(   ru  )NNNNNN)#r    r!   r"   rv  _tied_weights_keysr   r7   rA  rC  r   Moduler  rI  rK  r$   r%   rW  propertyr;  r8  r  r   r   r   rw  r   r
   rx  r   r&  r1   r+   rL   r  rQ   rR   s   @r)   rz  rz    s   %'"*+{ 1/ryy ((Hu/@/@ H ) ) ' ' 5 5  15481537+/59-1$(,0/3&*5934R
E,,-R
 u001R
 !.	R

 u//0R
 "%R
   1 12R
 ))*R
 D>R
 $D>R
 'tnR
 d^R
 !!1!12R
 c5<</0R
  
u11	2!R
  R
n  r(   rz  )r   r5  rz  )r   )=r'  dataclassesr   typingr   r   r   r$   r   activationsr	   cache_utilsr
   
generationr   integrationsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   autor   configuration_ovis2r   r   r   r+   r  r4   rT   rn   r   floatr   r   r   r   r   r   r   r   r   r   r&  r  r  r5  rz  __all__r'   r(   r)   <module>r     s=  ,  ! , ,   !   ) 7 9 H F & V V  ? 
<6 < < 
<+ < <0 Y'J299 J (J(RYY  BII P %II%<<% 
% <<	%
 U\\*% % %.:)299 :)zryy  :)RYY :)z8 2@ @>DRYY D<8 8'? ' C 1+ 1h 
g
% g

g
T [$8/ [ [| Rr(   