
"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...utils import ModelOutput, can_return_tuple, logging
from .configuration_idefics import IdeficsVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    r"""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


class IdeficsVisionEmbeddings(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.embed_dim
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
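
    # Illustrative arithmetic only (the concrete numbers are an assumption, not read from any
    # checkpoint): with a config where image_size=224 and patch_size=14, the Conv2d above yields
    # (224 // 14) ** 2 = 256 patch embeddings, so num_positions = 256 + 1 = 257 once the CLS slot
    # is counted. With `interpolate_pos_encoding=True` and, say, 448x448 inputs, the method below
    # resizes the 16x16 grid of position vectors to 32x32 bicubically, giving 1024 + 1 = 1025
    # positions to match the longer patch sequence.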
    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        num_patches = embeddings.shape[1] - 1
        pos_embed = self.position_embedding(self.position_ids)
        num_positions = pos_embed.shape[1] - 1
        if num_patches == num_positions and height == width:
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        embed_dim = embeddings.shape[-1]
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size
        # add a small number to avoid a floating point error in the interpolation
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
        sqrt_num_positions = math.sqrt(num_positions)
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in "
                "nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead."
            )
            patch_pos_embed = patch_pos_embed.to(torch.float)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
            mode="bicubic",
            align_corners=False,
        )
        if fp32_upcasting:
            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size or width != self.image_size:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
                    " You should try to set `interpolate_pos_encoding=True`"
                )

        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Plain softmax(Q K^T * scaling + mask) V attention, used when no fused kernel is selected.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.embed_dim
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # merge the padding and causal masks unless a flash-attention kernel handles causality itself
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class IdeficsVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.embed_dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states
class IdeficsVisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.embed_dim
        self.self_attn = IdeficsVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = IdeficsVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
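        # Dataflow sketch (comments only, summarizing the code below): this is a pre-LayerNorm
        # residual block, i.e.
        #   hidden_states = hidden_states + self_attn(layer_norm1(hidden_states))
        #   hidden_states = hidden_states + mlp(layer_norm2(hidden_states))
        # so both sub-blocks see normalized inputs while the residual stream itself stays un-normalized.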
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class IdeficsVisionTransformer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.embed_dim

        self.embeddings = IdeficsVisionEmbeddings(config)
        # NOTE: "pre_layrnorm" (sic) is kept as-is so the attribute name matches released checkpoints.
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
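

# Minimal smoke-test sketch, not part of the library module: it assumes `IdeficsVisionConfig`
# accepts the size overrides used below (the width argument may be named `hidden_size` instead of
# `embed_dim` depending on the transformers version) and that this file is importable from within
# the transformers package. The shapes are illustrative, not taken from a released checkpoint.
if __name__ == "__main__":
    config = IdeficsVisionConfig(
        embed_dim=32,
        intermediate_size=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        image_size=28,
        patch_size=14,
    )
    model = IdeficsVisionTransformer(config).eval()
    pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
    with torch.no_grad():
        outputs = model(pixel_values, return_dict=True)
    # (28 // 14) ** 2 = 4 patches + 1 CLS token -> sequence length 5
    print(outputs.last_hidden_state.shape)  # expected: torch.Size([1, 5, 32])
    print(outputs.pooler_output.shape)  # expected: torch.Size([1, 32])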