
import math
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn.init import _calculate_fan_in_and_fan_out

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple
from .configuration_siglip2 import Siglip2Config, Siglip2TextConfig, Siglip2VisionConfig


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class Siglip2VisionOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class Siglip2TextOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring
class Siglip2Output(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Siglip2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Siglip2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: Optional[BaseModelOutputWithPooling] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Siglip2VisionEmbeddings(nn.Module):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Linear(
            in_features=config.num_channels * self.patch_size * self.patch_size,
            out_features=self.embed_dim,
        )

        self.num_patches = config.num_patches
        self.position_embedding_size = int(self.num_patches**0.5)
        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)

    @staticmethod
    def resize_positional_embeddings(
        positional_embeddings: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        max_length: int,
    ) -> torch.Tensor:
        """
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        """
        batch_size = spatial_shapes.shape[0]
        embed_dim = positional_embeddings.shape[-1]
        source_dtype = positional_embeddings.dtype

        resulted_positional_embeddings = torch.empty(
            (batch_size, max_length, embed_dim),
            device=positional_embeddings.device,
            dtype=source_dtype,
        )

        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)

        # Upcast to float32 on CPU because antialiased interpolation is not supported for half precision there
        if positional_embeddings.device.type == "cpu":
            positional_embeddings = positional_embeddings.to(torch.float32)

        for i in range(batch_size):
            # (1, dim, height, width) -> (1, dim, target_height, target_width)
            height, width = spatial_shapes[i]
            resized_embeddings = F.interpolate(
                positional_embeddings,
                size=(height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )

            # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
            resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)

            # Cast back to the original dtype
            resized_embeddings = resized_embeddings.to(source_dtype)

            resulted_positional_embeddings[i, : height * width] = resized_embeddings
            resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]

        return resulted_positional_embeddings

    def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`list[tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        """
        # Apply patch embeddings to the already patchified pixel values
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))

        # Resize and pad the positional embeddings to each image's spatial shape
        positional_embeddings = self.position_embedding.weight.reshape(
            self.position_embedding_size, self.position_embedding_size, -1
        )
        resized_positional_embeddings = self.resize_positional_embeddings(
            positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
        )

        # Add positional embeddings to patch embeddings
        embeddings = patch_embeds + resized_positional_embeddings
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Siglip2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class Siglip2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Siglip2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = Siglip2Attention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Siglip2MLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class Siglip2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Siglip2EncoderLayer`].

    Args:
        config: Siglip2Config
    """

    def __init__(self, config: Siglip2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )


class Siglip2VisionTransformer(nn.Module):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = Siglip2VisionEmbeddings(config)
        self.encoder = Siglip2Encoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = Siglip2MultiheadAttentionPoolingHead(config)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values, spatial_shapes)

        if attention_mask is not None and not self._use_flash_attention_2:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
        else:
            encoder_attention_mask = attention_mask

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state, attention_mask) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Siglip2TextEmbeddings(nn.Module):
    def __init__(self, config: Siglip2TextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def _trunc_normal_(tensor, mean, std, a, b):
    def norm_cdf(x):
        # Computes the standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [2l-1, 2u-1]
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for the normal distribution to get a truncated standard normal
    tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")


class Siglip2TextTransformer(nn.Module):
    def __init__(self, config: Siglip2TextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = Siglip2TextEmbeddings(config)
        self.encoder = Siglip2Encoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.head = nn.Linear(embed_dim, config.projection_size)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # note: Siglip2's text model does not use a causal mask, unlike the original CLIP model.
        # expand attention_mask
        if attention_mask is not None and not self._use_flash_attention_2:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # Assuming "sticky" EOS tokenization, the last token is always EOS.
        pooled_output = last_hidden_state[:, -1, :]
        pooled_output = self.head(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class Siglip2PreTrainedModel(PreTrainedModel):
    config: Siglip2Config
    base_model_prefix = "siglip2"
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "Siglip2TextEmbeddings",
        "Siglip2VisionEmbeddings",
        "Siglip2EncoderLayer",
        "Siglip2MultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, Siglip2VisionEmbeddings):
            width = (
                self.config.vision_config.hidden_size
                if isinstance(self.config, Siglip2Config)
                else self.config.hidden_size
            )
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, Siglip2Attention):
            nn.init.xavier_uniform_(module.q_proj.weight)
            nn.init.xavier_uniform_(module.k_proj.weight)
            nn.init.xavier_uniform_(module.v_proj.weight)
            nn.init.xavier_uniform_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, Siglip2MLP):
            nn.init.xavier_uniform_(module.fc1.weight)
            nn.init.xavier_uniform_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, Siglip2MultiheadAttentionPoolingHead):
            nn.init.xavier_uniform_(module.probe.data)
            nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
            nn.init.zeros_(module.attention.in_proj_bias.data)
        elif isinstance(module, Siglip2Model):
            logit_scale_init = torch.log(torch.tensor(1.0))
            module.logit_scale.data.fill_(logit_scale_init)
            module.logit_bias.data.zero_()
        elif isinstance(module, Siglip2ForImageClassification):
            nn.init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring(
    custom_intro="""
    The text model from Siglip2 without any head or projection on top.
    """
)
class Siglip2TextModel(Siglip2PreTrainedModel):
    config: Siglip2TextConfig

    def __init__(self, config: Siglip2TextConfig):
        super().__init__(config)
        self.text_model = Siglip2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, Siglip2TextModel

        >>> model = Siglip2TextModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class Siglip2MultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = Siglip2MLP(config)
        self.num_heads = config.num_attention_heads

    def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        if attention_mask is not None:
            target_len, source_len = probe.shape[1], hidden_state.shape[1]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
            attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
            attention_mask = attention_mask.reshape(-1, target_len, source_len)

        hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


@auto_docstring(
    custom_intro="""
    The vision model from Siglip2 without any head or projection on top.
    """
)
class Siglip2VisionModel(Siglip2PreTrainedModel):
    config: Siglip2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)

        self.vision_model = Siglip2VisionTransformer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


@auto_docstring
class Siglip2Model(Siglip2PreTrainedModel):
    config: Siglip2Config

    def __init__(self, config: Siglip2Config):
        super().__init__(config)

        if not isinstance(config.text_config, Siglip2TextConfig):
            raise TypeError(
                "config.text_config is expected to be of type Siglip2TextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, Siglip2VisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type Siglip2VisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # First, initialize the text and vision models with the proper attention implementation
        text_model = Siglip2TextModel._from_config(text_config)
        vision_model = Siglip2VisionModel._from_config(vision_config)

        self.text_model = text_model.text_model
        self.vision_model = vision_model.vision_model

        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        # Initialize weights and apply final processing
        self.post_init()

    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2TextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        # Use Siglip2Model's config for some fields (if specified) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = text_outputs.pooler_output

        return pooled_output

    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2VisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        # Use Siglip2Model's config for some fields (if specified) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = vision_outputs.pooler_output

        return pooled_output

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Siglip2Output:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```"""
        # Use Siglip2Model's config for some fields (if specified) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # Sigmoid (pairwise) contrastive loss: targets are +1 on the diagonal and -1 elsewhere
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return Siglip2Output(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


@auto_docstring(
    custom_intro="""
    Siglip2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class Siglip2ForImageClassification(Siglip2PreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: Siglip2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels

        # Create the vision tower and keep only the inner Siglip2VisionTransformer submodule
        vision_model = Siglip2VisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> ImageClassifierOutput:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `Siglip2Model` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
        >>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens
        if pixel_attention_mask is not None:
            pool_mask = pixel_attention_mask[..., None].to(sequence_output.device)
            sequence_output = torch.sum(sequence_output * pool_mask, dim=1) / torch.sum(pool_mask, dim=1)
        else:
            sequence_output = torch.mean(sequence_output, dim=1)

        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "Siglip2Model",
    "Siglip2PreTrainedModel",
    "Siglip2TextModel",
    "Siglip2VisionModel",
    "Siglip2ForImageClassification",
]
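
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream transformers module).
# It mirrors the doctest examples above and assumes the
# `google/siglip2-base-patch16-224` checkpoint is reachable and that Pillow and
# requests are installed. The processor flattens each image into patches and
# returns `pixel_values`, `pixel_attention_mask` and `spatial_shapes`, which
# Siglip2Model consumes for zero-shot image-text matching.
if __name__ == "__main__":
    import requests
    from PIL import Image

    from transformers import AutoModel, AutoProcessor

    checkpoint = "google/siglip2-base-patch16-224"
    model = AutoModel.from_pretrained(checkpoint)
    processor = AutoProcessor.from_pretrained(checkpoint)

    image = Image.open(
        requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
    )
    texts = ["a photo of 2 cats", "a photo of 2 dogs"]

    # padding="max_length" matches how the text tower was trained (see docstrings above)
    inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # Sigmoid (not softmax) turns the pairwise logits into independent match probabilities
    probs = torch.sigmoid(outputs.logits_per_image)
    for text, prob in zip(texts, probs[0].tolist()):
        print(f"{prob:.1%} that the image matches '{text}'")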