
"""PyTorch Siglip model."""

import math
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn.init import _calculate_fan_in_and_fan_out

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, torch_int
from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig


def _trunc_normal_(tensor, mean, std, a, b):
    def norm_cdf(x):
        # Computes the standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [2l-1, 2u-1], then use the
    # inverse CDF transform to get a truncated standard normal
    tensor.uniform_(2 * l - 1, 2 * u - 1)
    tensor.erfinv_()

    # Transform to the requested mean and std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure the values are in the proper range
    tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
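
# Illustrative sketch (editorial addition, not part of the upstream module): with "fan_in"
# scaling, `lecun_normal_` initializes a weight of shape (out_features, in_features) so that
# its standard deviation is roughly sqrt(1 / fan_in); for a 128x256 weight that is ~0.0625.
def _example_lecun_normal_std() -> float:
    weight = torch.empty(128, 256)
    lecun_normal_(weight)
    return weight.std().item()  # expected to be close to (1 / 256) ** 0.5
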

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class SiglipVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class SiglipTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring
class SiglipOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: Optional[BaseModelOutputWithPooling] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1]
        num_positions = self.position_embedding.weight.shape[0]

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class SiglipTextEmbeddings(nn.Module):
    def __init__(self, config: SiglipTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class SiglipEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Union[SiglipVisionConfig, SiglipTextConfig]):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = SiglipAttention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class SiglipPreTrainedModel(PreTrainedModel):
    config: SiglipConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "SiglipTextEmbeddings",
        "SiglipEncoderLayer",
        "SiglipVisionEmbeddings",
        "SiglipMultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SiglipVisionEmbeddings):
            width = (
                self.config.vision_config.hidden_size
                if isinstance(self.config, SiglipConfig)
                else self.config.hidden_size
            )
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.xavier_uniform_(module.q_proj.weight)
            nn.init.xavier_uniform_(module.k_proj.weight)
            nn.init.xavier_uniform_(module.v_proj.weight)
            nn.init.xavier_uniform_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.xavier_uniform_(module.fc1.weight)
            nn.init.xavier_uniform_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
            nn.init.xavier_uniform_(module.probe.data)
            nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
            nn.init.zeros_(module.attention.in_proj_bias.data)
        elif isinstance(module, SiglipModel):
            logit_scale_init = torch.log(torch.tensor(1.0))
            module.logit_scale.data.fill_(logit_scale_init)
            module.logit_bias.data.zero_()
        elif isinstance(module, SiglipForImageClassification):
            nn.init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    """

    def __init__(self, config: SiglipConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class SiglipTextTransformer(nn.Module):
    def __init__(self, config: SiglipTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = SiglipTextEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.head = nn.Linear(embed_dim, config.projection_size)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
        # expand attention_mask
        if attention_mask is not None and not self._use_flash_attention_2:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # The model uses the last token's hidden state, which may be padding.
        pooled_output = last_hidden_state[:, -1, :]
        pooled_output = self.head(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The text model from SigLIP without any head or projection on top.
    """
)
class SiglipTextModel(SiglipPreTrainedModel):
    config: SiglipTextConfig

    def __init__(self, config: SiglipTextConfig):
        super().__init__(config)
        self.text_model = SiglipTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, SiglipTextModel

        >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class SiglipVisionTransformer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = SiglipMultiheadAttentionPoolingHead(config)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_state):
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


@auto_docstring(
    custom_intro="""
    The vision model from SigLIP without any head or projection on top.
    """
)
class SiglipVisionModel(SiglipPreTrainedModel):
    config: SiglipVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)

        self.vision_model = SiglipVisionTransformer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, SiglipVisionModel

        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )


@auto_docstring
class SiglipModel(SiglipPreTrainedModel):
    config: SiglipConfig

    def __init__(self, config: SiglipConfig):
        super().__init__(config)

        if not isinstance(config.text_config, SiglipTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type SiglipTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, SiglipVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # First, initialize the text and vision models with the proper attention implementation
        text_model = SiglipTextModel._from_config(text_config)
        vision_model = SiglipVisionModel._from_config(vision_config)

        self.text_model = text_model.text_model
        self.vision_model = vision_model.vision_model

        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = text_outputs.pooler_output

        return pooled_output

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        pooled_output = vision_outputs.pooler_output

        return pooled_output

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> SiglipOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image)  # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # pairwise sigmoid loss: +1 on the diagonal (matching pairs), -1 everywhere else
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return SiglipOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


@auto_docstring(
    custom_intro="""
    SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class SiglipForImageClassification(SiglipPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: SiglipConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels

        # Create the vision backbone and keep only the vision_model submodule
        vision_model = SiglipVisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `SiglipModel` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens
        sequence_output = torch.mean(sequence_output, dim=1)
        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["SiglipModel", "SiglipPreTrainedModel", "SiglipTextModel", "SiglipVisionModel", "SiglipForImageClassification"]
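

# Illustrative sketch (editorial addition, not part of the upstream module): the contrastive
# loss returned by `SiglipModel.forward(..., return_loss=True)` above is a pairwise sigmoid
# loss over `logits_per_text`, where diagonal entries are the matching image-text pairs
# (label +1) and all off-diagonal entries are negatives (label -1). The helper below mirrors
# that computation for a square logits matrix, e.g. `_example_sigmoid_loss(torch.randn(4, 4))`.
def _example_sigmoid_loss(logits_per_text: torch.Tensor) -> torch.Tensor:
    eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
    # +1 on the diagonal (positive pairs), -1 everywhere else (negative pairs)
    m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
    loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
    nll = -torch.sum(loglik, dim=-1)
    return nll.mean()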