
    <h                    .   S r SSKJr  SSKJrJrJrJr  SSKrSSK	J
s  Jr  SSKrSSKJ
r
  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*  SSK+J,r,  SSK-J.r.J/r/  \'" 5       (       a  SSK0J1r1  SSK2J3r3  \(Rh                  " \55      r6\\%" SS9 " S S\5      5       5       r7\\%" SS9 " S S\5      5       5       r8    SAS jr9/ 4S jr: " S  S!\
Rv                  5      r< " S" S#\
Rz                  5      r> " S$ S%\
R~                  5      r@ " S& S'\R                  R~                  5      rAS( rBSBS) jrC " S* S+\
R~                  5      rD SCS,\
R~                  S-\R                  S.\R                  S/\R                  S0\\R                     S1\FS2\F4S3 jjrG " S4 S5\
R~                  5      rH " S6 S7\5      rI " S8 S9\5      rJ\% " S: S;\ 5      5       rK\% " S< S=\K5      5       rL " S> S?\K\5      rM/ S@QrNg)DzPyTorch Idefics model.    )	dataclass)AnyCallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformer)	BlockMask)make_flex_block_causal_maskz{
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\\R                           \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
IdeficsBaseModelOutputWithPast6   a(  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.

    If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
    hidden_size)` is output.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
    `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
    encoder_sequence_length, embed_size_per_head)`.

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r&   r   torchFloatTensor__annotations__r'   tupler(   r)   r*   __static_attributes__r+       d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/idefics/modeling_idefics.pyr$   r$   6   s    , 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju00129>B%(9(9":;Br6   r$   zS
    Base class for Idefics causal language model (or autoregressive) outputs.
    c                   "   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   S
rg)IdeficsCausalLMOutputWithPastZ   aV  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlosslogitsr'   r(   r)   r*   r+   )r,   r-   r.   r/   r0   r;   r   r1   r2   r3   r<   r'   listr(   r4   r)   r*   r5   r+   r6   r7   r9   r9   Z   s    $ )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju00129>B%(9(9":;Br6   r9   c                    [         R                  " U R                  S   5      R                  SS5      R	                  SU5      R                  S5      R                  U R                  5      nU R                  SU5      n UR                  S5      US'   UR                  S5      US'   UR                  S5      US'   UR                  S5      US'   SU;   a  US   nUR                  SU5      US'   Ub  UR                  SU5      US	'   US   b  US   R                  SU5      US'   US   b  US   R                  SU5      US'   X4$ US   b  US   R                  SU5      US'   X4$ US   b  US   R                  SU5      US'   X4$ )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r1   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderrE   encoder_outputsmodel_kwargsexpanded_return_idxrD   s           r7   expand_inputs_for_generationrT   {   s    	Y__Q'(--b!4;;A{KPPQSTWWXaXhXhi  &&q*=>I#/#3#3N#CL /;/?/?@Z/[L+,+7+;+;<R+SL'(+7+;+;<R+SL'(<'%&67)7)D)DQH[)\%&!)7)D)DQH[)\%&*+7/;<R/S/`/`"0
+, N#/'3N'C'P'PQRTg'h^$ "" 
0	1	=3?@Z3[3h3h"4
/0 "" 
,	-	9/;<R/S/`/`"0
+, ""r6   c                 P  ^ [         R                  [         R                  [         R                  S.nU Vs/ sH  o2U   PM	     nnU R	                  5        HH  mU(       a-  [        U4S jU 5       5      (       a  TR                  S5        M7  TR                  S5        MJ     U $ s  snf )N)	LayerNormLinear	Embeddingc              3   :   >#    U H  n[        TU5      v   M     g 7fN)
isinstance).0tmodules     r7   	<genexpr>freeze_model.<locals>.<genexpr>   s     $]D\qZ%:%:D\s   TF)r   rV   rW   rX   modulesanyrequires_grad_)modelmodule_exceptionsmappingmmodule_exceptions_mappedr^   s        @r7   freeze_modelri      s    \\))\\G
 5FF4Eq
4EF--/$]D\$]!]!]!!$'!!%(	 "
 L  Gs   B#c                   ^   ^  \ rS rSrSr    S
S\\   SS4U 4S jjjrS rS\	4S jr
S	rU =r$ )IdeficsDecoupledEmbedding   a  
Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
then it will create `num_additional_embeddings` additional parameters that are always trained. If
`num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
Npartially_freezereturnc           	      F  > Ub  Xq:  a  [        SU SU 35      e[        T	U ]  " SUUUUUS.UD6  Xl        Xpl        X l        X@l        U(       a  U R                  R                  S5        U R
                  S:  a'  [        R                  " U R
                  UUUS9U l        gg)	a  
Args:
    num_embeddings (`int`):
        Size of the dictionary of embeddings
    num_additional_embeddings (`int`):
        Number of additional embeddings. Only useful when you `partially_freeze=True`.
    embedding_dim (`int`):
        The size of each embedding vector
    partially_freeze: (`bool`, *optional*, defaults to `False`):
        If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
    padding_idx (`int`, *optional*):
        The padding index (needs to be less than num_embeddings)

Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
`max_norm` or `norm_type`. We are not supporting these.
Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimrK   dtypepadding_idxFr   )rp   rq   rK   rr   r+   )
ValueErrorsuper__init__rp   rs   num_additional_embeddingsrm   weightrc   r   rX   additional_embedding)
selfrp   rw   rq   rm   rK   rr   rs   kwargs	__class__s
            r7   rv   "IdeficsDecoupledEmbedding.__init__   s    6 "{'CN{m[`ao`pqrr 	
)'#	
 	
 -&)B& 0KK&&u-))A-(*#==+	)D% .r6   c                 \   U R                   S:X  a   [        R                  " XR                  5      $ UR	                  5       n[
        R                  " XR                  :  5      nX   nU R                  X0R                  -
  5      nSX'   [        R                  " XR                  5      nXEU'   U$ )a{  
we have 2 embeddings, with different indices - one pretrained self.weight and another
self.additional_embedding.weight that is being trained.

in order to make a lookup of the input ids, we:
1. find out the indices of the entries belonging to the 2nd embedding
2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
   embedding starts from 0 and not num_embeddings
3. perform the 2nd embedding lookup
4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
5. perform the 1st embedding lookup
6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
measure.

r   )	rw   F	embeddingrx   cloner1   whererp   ry   )rz   rN   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectors         r7   forward!IdeficsDecoupledEmbedding.forward   s    * ))Q.;;y++66 OO%	#(;;y<O<O/O#P %.%H" $ 9 9:TWjWj:j k /0	+kk)[[9 1F,-r6   c                 n    SU R                    SU R                   SU R                   SU R                   3$ )Nznum_embeddings=z, num_additional_embeddings=z, embedding_dim=, partially_freeze=)rp   rw   rq   rm   rz   s    r7   
extra_repr$IdeficsDecoupledEmbedding.extra_repr  sq     !4!4 55QRVRpRpQq  rB  CG  CU  CU  BV  Vi  jn  j  j  i@  A  	Ar6   )ry   rw   rp   rs   rm   )FNNN)r,   r-   r.   r/   r0   r   boolrv   r   strr   r5   __classcell__r|   s   @r7   rk   rk      sS     ,13
 #4.3 
3 3j%NAC A Ar6   rk   c                      ^  \ rS rSrSr     SS\S\S\S\S\S	S4U 4S
 jjjrS\R                  S	\R                  4S jr
S	\4S jrSrU =r$ )IdeficsDecoupledLineari   a  
Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
then it will create `out_additional_features * in_features` additional parameters that are always trained. If
`out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
Nin_featuresout_featuresout_additional_featuresbiasrm   rn   c                 (  > [         TU ]  XXFU5        X0l        XPl        Xl        X l        U(       a=  U R                  R                  S5        U(       a  U R                  R                  S5        US:  a  [        R                  " UUUUUS9U l        gg)a'  
out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
`partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
Fr   )r   r   r   rK   rr   N)ru   rv   r   rm   r   r   rx   rc   r   r   rW   additional_fc)	rz   r   r   r   r   rm   rK   rr   r|   s	           r7   rv   IdeficsDecoupledLinear.__init__)  s     	D%H'>$ 0&(KK&&u-		((/"Q&!#'4"D 'r6   inputc                     [         R                  " XR                  U R                  5      nU R                  S:  a)  U R                  U5      n[        R                  " X#4S5      nU$ )Nr   r?   )r   linearrx   r   r   r   r1   cat)rz   r   outputadditional_featuress       r7   r   IdeficsDecoupledLinear.forwardM  sQ    %dii8''!+"&"4"4U";YY<bAFr6   c           
          SU R                    SU R                   SU R                   SU R                  SL SU R                   3
$ )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zin_features=z, out_features=z, out_additional_features=z, bias=Nr   r   r   r   r   rm   r   s    r7   r   !IdeficsDecoupledLinear.extra_reprV  s    d../t?P?P>QQklp  mI  mI  lJ  JQ  RV  R[  R[  cg  Rg  Qh  h{  |@  |Q  |Q  {R  S  	Sr6   )r   r   r   r   rm   )r   TTNN)r,   r-   r.   r/   r0   intr   rv   r1   Tensorr   r   r   r5   r   r   s   @r7   r   r      s     ()!%"" " "%	"
 " " 
" "HU\\ ell SC S Sr6   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )IdeficsRMSNormi\  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z-
IdeficsRMSNorm is equivalent to T5LayerNorm
N)ru   rv   r   	Parameterr1   onesrx   variance_epsilon)rz   hidden_sizeepsr|   s      r7   rv   IdeficsRMSNorm.__init__]  s/     	ll5::k#:; #r6   c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   r?   T)keepdim)rJ   r1   float32powmeanrsqrtr   rx   rr   float16bfloat16)rz   r(   variances      r7   r   IdeficsRMSNorm.forwarde  s     ##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r6   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r4   rx   rG   r   r   s    r7   r   IdeficsRMSNorm.extra_repro  s*    ))*+6$2G2G1HIIr6   )r   rx   )gư>)	r,   r-   r.   r/   rv   r   r   r5   r   r   s   @r7   r   r   \  s    $+J Jr6   r   c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )IdeficsEmbeddingit  c           	        > [         TU ]  5         Xl        X l        X0l        SU R                  [
        R                  " SU R                  S[
        R                  S9R                  U[
        R                  S9U R                  -  -  -  nU R                  SUSS9  U R                  X R                  R                  [
        R                  " 5       S	9  g )
N      ?r   r   rr   rK   rr   inv_freqF
persistentseq_lenrK   rr   )ru   rv   dimmax_position_embeddingsbaser1   rF   int64rJ   floatregister_buffer_set_cos_sin_cacher   rK   get_default_dtype)rz   r   r   r   rK   r   r|   s         r7   rv   IdeficsEmbedding.__init__u  s    '>$	IIQ!5;;?BB&X]XcXcBdgkgogooq
 	ZeD 	+MM4H4HPUPgPgPi 	  	
r6   c                    Xl         [        R                  " U R                   U[        R                  S9R	                  U R
                  5      n[        R                  " SX@R
                  5      n[        R                  " XU4SS9nU R                  SUR                  5       R                  U5      SS9  U R                  SUR                  5       R                  U5      SS9  g )	Nr   zi,j->ijr?   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr1   rF   r   type_asr   einsumr   r   cosrJ   sin)rz   r   rK   rr   r]   freqsembs          r7   r   #IdeficsEmbedding._set_cos_sin_cache  s    ")LL00u{{S[[\`\i\ijY==9iiB/\3779<<+>5Q\3779<<+>5Qr6   c                     X R                   :  a$  U R                  X!R                  UR                  S9  U R                  S U R                  UR                  S9U R                  S U R                  UR                  S94$ )Nr   r   )r   r   rK   rr   r   rJ   r   )rz   xr   s      r7   r   IdeficsEmbedding.forward  su    ,,,##GHHAGG#T OOHW%((qww(7OOHW%((qww(7
 	
r6   )r   r   r   r   )i   i'  NrZ   )	r,   r-   r.   r/   rv   r   r   r5   r   r   s   @r7   r   r   t  s    
"R
 
r6   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr?   r   r   )rG   r1   r   )r   x1x2s      r7   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r6   c                     X$   R                  U5      nX4   R                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a&  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`):
        The position indices of the tokens corresponding to the query and key tensors. For example, this can be
        used to pass offsetted position ids when working with a KV-cache.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embeds           r7   apply_rotary_pos_embr     s]    * 

%
%m
4C


%
%m
4Cw;q>C/0Gw;q>C/0Gr6   c                   >   ^  \ rS rSrS\S\S\4U 4S jjrS rSrU =r	$ )
IdeficsMLPi  r   intermediate_size
hidden_actc                    > [         TU ]  5         [        R                  " XSS9U l        [        R                  " X!SS9U l        [        R                  " XSS9U l        [        U   U l        g )NFr   )	ru   rv   r   rW   	gate_proj	down_projup_projr
   act_fn)rz   r   r   r   r|   s       r7   rv   IdeficsMLP.__init__  sS     	;N#4NyyeLZ(r6   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      $ rZ   )r   r   r   r   )rz   r   s     r7   r   IdeficsMLP.forward  s0    ~~dkk$..*;<t||ANOOr6   )r   r   r   r   )
r,   r-   r.   r/   r   r   rv   r   r5   r   r   s   @r7   r   r     s0    
)
) 
) 	
)P Pr6   r   r^   querykeyvaluerE   scalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr?   )r   rr   ptrainingr   r   )r1   matmul	transposer   
functionalsoftmaxr   rJ   rr   r   r  
contiguous)
r^   r   r   r   rE   r   r   r{   attn_weightsattn_outputs
             r7   eager_attention_forwardr    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r6   c                     ^  \ rS rSrSr     SS\S\S\S\S\S\S	\	\   4U 4S
 jjjr
S\R                  S\S\4S jr       SS\R                  S\	\R                     S\	\R                     S\	\R                     S\	\\R                        S\S\S\	\R                     S\\R                  \	\R                     \	\\R                        4   4S jjrSrU =r$ )IdeficsAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc                   > [         T	U ]  5         XPl        Xl        X l        X-  U l        X0l        SU l        U R
                  S-  U l        Xpl	        Uc-  [        R                  SU R                  R                   S35        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eX@l        [!        ["        R$                  S5      (       d  [        S	5      eU R                  (       a  [!        UR&                  S
5      (       d  U R                  OUR&                  R(                  n["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " XU R
                  -  SS9U l        ["        R*                  " UX R
                  -  SS9U l        O["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " X R
                  -  USS9U l        [5        U R
                  5      U l        X`l        U R8                  (       aG  [;        U R
                  UR<                  S9U l        [;        U R
                  UR<                  S9U l         g g )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!ru   rv   r  r   r  head_dimr   	is_causalr   r  loggerwarning_oncer|   r,   rt   r  hasattrr   r  vision_configr  rW   q_projk_projv_projo_projr   
rotary_embr  r   rms_norm_epsq_layer_normk_layer_norm)
rz   r   r  r   r  r  r  r  kv_input_dimr|   s
            r7   rv   IdeficsAttention.__init__  st    	&"#0}}d*" !8!8 9 :, , MMI%$*:*::QRVRbRbQc$YKr3 
 #5r}}&DEEHII""(/0D0Dk(R(R  X^XlXlXvXv  ))  MM)DK
 ))Ldmm2KRWXDK))MM)DK ))  MM)DK
 ))  MM)DK
 ))  MM)DK
 ii%

 +4==9, .t}}&BUBU VD .t}}&BUBU VD r6   tensorr   bszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   r   )rH   r  r  r  r  )rz   r'  r   r(  s       r7   _shapeIdeficsAttention._shape=  s5    {{3GQQRSUVWbbddr6   r(   key_value_statesrE   r   past_key_valueoutput_attentions	use_cachecache_positionrn   c	                 x   U R                   =(       d    US Ln
UR                  5       u  pnU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nU
(       d  U R                  U5      R                  XU R                  U R
                  5      R                  SS5      nU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nOUR                  5       u  nnnU R                  U5      R                  UUU R                  U R
                  5      R                  SS5      nU R                  U5      R                  UUU R                  U R
                  5      R                  SS5      nUR                  S   nUb  UUS   -  nU
(       d-  U R                  U[        UU5      S9u  nn[        XUUU5      u  pUb%  SU0nUR                  UUU R                  U5      u  nnU R                  (       a"  U R!                  U5      nU R#                  U5      n[$        nU R&                  R(                  S:w  aT  U R&                  R(                  S:X  a  U(       a  [*        R-                  S	5        O[.        U R&                  R(                     nU" U UUUU4U R0                  (       d  S
OU R2                  U R4                  S.U	D6u  nnUR7                  XS5      R9                  5       nU R;                  U5      nU(       a  S nUU4$ )Nr   r   r   r   )r   r0  eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r?   )r  sizer  rH   r  r  r  r  r  rG   r!  maxr   updater  r  r#  r$  r  r  _attn_implementationr  r  r   r  r   r   reshaper  r   )rz   r(   r,  rE   r   r-  r.  r/  r0  r{   r  r(  q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer
  r	  s                            r7   r   IdeficsAttention.forward@  s    "44T8HPT8T%**,A{{=166s4>>SWS`S`akklmopq!]388T^^UYUbUbcmmnoqrsJ;;}5::3t~~W[WdWdeoopqstuL+002LAvq%56;;CY]YfYfgqqrsuvwJ,-223PTP]P]^hhijlmn   %%b)
%.++J!|SU=STHC';LVY[^`l'm$L %,n=L'5'<'<ZW[WeWegs't$J,,\:L**:6J(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,LL	%
 	%
!\ "))#b9DDFkk+.LL((r6   )r  r   r  r   r  r  r$  r  r  r  r   r#  r  r  r!  r   r  )r4  FNFNNNNNFFN)r,   r-   r.   r/   r0   r   r   r   r   r   rv   r1   r   r*  
LongTensorr4   r   r5   r   r   s   @r7   r  r    s   G #(#'$#'OWOW OW 	OW
 !OW !OW OW C=OW OWbeU\\ eC ec e 4815378<"'59J)||J) #5<<0J) !.	J)
 u//0J) !u||!45J)  J) J) !!1!12J) 
u||Xell3XeELL>Q5RR	SJ) J)r6   r  c                   \  ^  \ rS rSrSS\S\\   4U 4S jjjr      SS\R                  S\\R                     S\\R                     S\\\R                        S	\\   S
\\   S\\R                     S\\R                  \\\R                  \R                  4      4   4S jjrSrU =r$ )IdeficsDecoderLayeri  r  r  c                   > [         TU ]  5         UR                  U l        [        U R                  UR                  UR
                  UUS9U l        [        U R                  UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        UR
                  U l        g )N)r   r  r   r  r  r   r   r   r  )ru   rv   r   r  num_attention_headsr   	self_attnr   r   r   mlpr   r"  input_layernormpost_attention_layernormrz   r  r  r|   s      r7   rv   IdeficsDecoderLayer.__init__  s    !--)((00NN
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%~~r6   r(   rE   r   r-  r.  r/  r0  rn   c                    Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUn	U R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
)r(   rE   r   r-  r.  r/  r0  r  r+   )rM  rK  r   r  r   r  rN  rL  )rz   r(   rE   r   r-  r.  r/  r0  r{   residualself_attn_weightsoutputss               r7   r   IdeficsDecoderLayer.forward  s    2 !,,]; ,0>> 	,
')%)/)	,
 	,
( --m||VZVcVc-d 0 !55mD/--m||VZVcVc-d 0 "++Gr6   )r   r   rM  rL  rN  rK  rZ   )NNNFFN)r,   r-   r.   r/   r   r   r   rv   r1   r   rE  r4   r   r2   r   r5   r   r   s   @r7   rG  rG    s    &} &# & &, 26378<,1$)597||7 !.7 u//0	7
 !u||!457 $D>7 D>7 !!1!127 
u  (51B1BEDUDU1U+V"WW	X7 7r6   rG  c                   |  ^  \ rS rSrSS\S\\   4U 4S jjjr       SS\R                  S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\\R                        S\\R                  \\\R                  \R                  4      4   4S jjrSrU =r$ )IdeficsGatedCrossAttentionLayeri  r  r  c           
      	  > [         TU ]  5         UR                  U l        [        U R                  UR                  SUR
                  UUR                  US9U l        [        U R                  UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        UR
                  U l        ["        R$                  " 5       U l        ["        R$                  " 5       U l        UR*                  S:X  Ga  UR,                  S:X  ax  ["        R.                  " [0        R2                  " SSU R                  5      5      U l        ["        R.                  " [0        R2                  " SSU R                  5      5      U l        GOUR,                  S:X  a`  ["        R.                  " [0        R2                  " S5      5      U l        ["        R.                  " [0        R2                  " S5      5      U l        GO[9        S	UR,                   S
35      eUR*                  S:X  Ga  UR,                  S:X  ax  ["        R.                  " [0        R:                  " SSU R                  5      5      U l        ["        R.                  " [0        R:                  " SSU R                  5      5      U l        GOUR,                  S:X  a`  ["        R.                  " [0        R:                  " S5      5      U l        ["        R.                  " [0        R:                  " S5      5      U l        GO|[9        S	UR,                   S
35      eUR*                  S;   Ga9  UR,                  S:X  a  ["        R.                  " [0        R<                  " SUR>                  SSU R                  4S95      U l        ["        R.                  " [0        R<                  " SUR>                  SSU R                  4S95      U l        OUR,                  S:X  as  ["        R.                  " [0        R<                  " SUR>                  SS95      U l        ["        R.                  " [0        R<                  " SUR>                  SS95      U l        O2[9        S	UR,                   S
35      e[A        SUR*                   S35      e[C        U S5      (       a  [C        U S5      (       d  [9        S5      eg )NT)r   r  r  r   r  r  r  rI  r  zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr4  )r   stdr5  zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"ru   rv   r   r  rJ  r   r  
cross_attnr   r   r   rL  r   r"  rM  rN  r  r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r1   rY  r`  ra  rt   r   r\  alphas_initializer_rangeNotImplementedErrorr  rO  s      r7   rv   (IdeficsGatedCrossAttentionLayer.__init__  s   !--*((00#NN!00
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%nn ggi##w.  H,(*U[[AtGWGW5X(Y%#%<<Aq$BRBR0S#T ""g-(*U[[^(D%#%<<A#?  #CFDUDUCVVW!XYY%%/  H,(*UZZ1dFVFV5W(X%#%<<

1aAQAQ0R#S ""g-(*UZZ](C%#%<<

1#>  #CFDUDUCVVW!XYY%%)II  H,(*LLcv/N/NVWYZ\`\l\lUmn)% $&<<LLcv/N/NVWYZ\`\l\lUmn$  ""g-(*LLcv/N/NVWY)% $&<<#6KjKjrs0u#v  #CFDUDUCVVW!XYY &(DVE]E]D^^s&tuu011gdM6R6RJKK 7Sr6   r(   rE   r*   rC   cross_attention_gater.  r/  r-  rn   c	                    Uc  [        S5      eUc  [        S5      eUb  [        S5      eUn
U R                  U5      nU R                  " S	UUUUS.U	D6u  p[        R
                  R                  XR                  U R                  S9nUR                  US:H  SS2SS2S4   S5      nXR                  U R                  5      U-  -   nUn
U R                  U5      nU R                  U5      n[        R
                  R                  XR                  U R                  S9nXR                  U R                  5      U-  -   nU4nU(       a  X4-  nU$ )
am  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    cross_attention_gate (`torch.FloatTensor`, *optional*):
        gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r(   r,  rE   r.  r  r   r4  r+   )rt   ri  rM  rb  r   r  r   r  r  masked_fillrd  r`  rN  rL  re  ra  )rz   r(   rE   r*   rC   rk  r.  r/  r-  r{   rR  rS  rT  s                r7   r   'IdeficsGatedCrossAttentionLayer.forward  sy   : &# 
  ' ^  %%&uvv ,,]; ,0?? ,
'0//	,

 ,
( --m{{UYUbUb-c%113G13LaQRTXj2Y[^_ #6#6t7L7L#MP]#]] !55mD/--m{{UYUbUb-c >>$2B2B#Cm#SS "++Gr6   )
rd  re  r`  ra  r  rb  r   rM  rL  rN  rZ   rD  )r,   r-   r.   r/   r   r   r   rv   r1   r   r   r4   r2   r   r5   r   r   s   @r7   rW  rW    s   @L} @L# @L @LJ 266:7;7;,1$)8<H||H !.H &ell3	H
 'u||4H 'u||4H $D>H D>H !u||!45H 
u  (51B1BEDUDU1U+V"WW	XH Hr6   rW  c                   F    \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrS rS	rg
)IdeficsPreTrainedModelij  r  rd   TrG  rW  Fc                 F   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[         5      (       a%  UR"                  R                  R                  5         g [        U[$        5      (       GaT  U R                   R&                  S:X  aI  UR(                  R                  R                  5         UR*                  R                  R                  5         g U R                   R&                  S:X  aK  UR(                  R                  R                  S5        UR*                  R                  R                  S5        g U R                   R&                  S;   aq  UR(                  R                  R                  SU R                   R,                  S9  UR*                  R                  R                  SU R                   R,                  S9  g g [        U[.        5      (       a%  UR0                  R                  R                  5         g g )Nr4  )r   r_  r   rY  r   >   r\  r]  r^  )r  initializer_ranger[   r   rW   Conv2drx   datanormal_r   zero_rX   rs   rV   fill_r   r   class_embeddingrW  rf  r`  ra  rh  r   latents)rz   r^   r_  s      r7   _init_weights$IdeficsPreTrainedModel._init_weightsv  s    kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)KK""$//MM$$S) 788""''//1 ?@@{{,,7'',,224""''--/..&8'',,2237""''--c2..2RR'',,44#4;;CgCg4h""''//Sdkk>b>b/c S  9::NN'') ;r6   r+   N)r,   r-   r.   r/   r   r3   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_can_compile_fullgraph_supports_attention_backendrz  r5   r+   r6   r7   rp  rp  j  s<    &*#.0QRN""&*r6   rp  c            '         ^  \ rS rSrSrS\4U 4S jjrS#S jr/ 4S jr/ 4S jr	\
\               S$S\\R                     S	\\R                     S
\\R                     S\\   S\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\R                     S\\   S\\\4   4"S jj5       5       r S%S	\\R                  S4   S\R                  S\R                  S\S\4
S jjr\S	\R                  S\S\S\R8                  S\R                  S \4S! j5       rS"rU =r$ )&IdeficsModeli  z
Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

Args:
    config: IdeficsConfig
r  c           
        > [         TU ]  U5        Xl        UR                  U l        UR
                  U l        [        UR
                  UR                  UR                  UR                  U R                  S9U l
        UR                  R                  U l        UR                  U l        UR                  U R                  l        [        UR                  5      U l        UR                   (       a]  UR"                  n[%        UUR                  R&                  UR(                  UR*                  UR,                  UR.                  5      U l        [2        R4                  " [7        UR8                  5       Vs/ sH  n[;        XS9PM     sn5      U l        UR>                  U l        UR8                  U R>                  -  n[2        R4                  " [7        U5       Vs/ sH  n[A        XS9PM     sn5      U l!        SU l"        [G        UR                  URH                  S9U l%        U RM                  5         U RO                  U5        g s  snf s  snf )N)rp   rw   rq   rm   rs   )r  Fr  )(ru   rv   r  pad_token_idrs   
vocab_sizerk   additional_vocab_sizer   freeze_text_layersembed_tokensr  
image_sizer8  r   vision_modeluse_resamplerperceiver_configr   r  resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layersrG  layerscross_layer_intervalrW  gated_cross_attn_layersgradient_checkpointingr   r"  norm	post_initfreeze_relevant_params)rz   r  r  inum_cross_layersr|   s        r7   rv   IdeficsModel.__init__  s    !.. ++5!,,&,&B&B ,,#66((
 !..99#11282M2M/4V5I5IJ %66'@$$.. 00 22 33 44(D$ mm?DVE]E]?^_?^! 5?^_
 %+$?$?!!33t7P7PP')}}KPQaKbcKba,VAKbc(
$ ',#"6#5#56;N;NO	 	##F+ ` ds   3IIc                     Uc  U R                   nUR                  (       a  U R                  UR                  5        UR                  (       a  [	        U R
                  UR                  S9  g g N)re   )r  r  freeze_text_module_exceptionsfreeze_vision_layersri   r  freeze_vision_module_exceptions)rz   r  s     r7   r  #IdeficsModel.freeze_relevant_params  sQ    >[[F$$##F$H$HI&&**f>d>de 'r6   c                 T    U R                   U R                  4 H  n[        X!S9  M     g r  )r  r  ri   )rz   re   r^   s      r7   r  IdeficsModel.freeze_text_layers  s!    {{DII.FE /r6   c                 ,    [        U R                  US9  g r  )ri   r  )rz   re   s     r7   r  !IdeficsModel.freeze_vision_layers  s    T&&:KLr6   rN   rE   r   r'   inputs_embedsr@   rA   rB   rC   r/  r.  output_hidden_statesinterpolate_pos_encodingreturn_dictr0  r{   rn   c                 D   Ub  UR                   OUR                   nUb  UOU R                  R                  nUb  UOU R                  R                  nU
b  U
OU R                  R                  n
Ub  UOU R                  R
                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U
(       a  [        R                  S5        Sn
Uc  U R                  U5      n[        U[        S5      [        45      (       d  [        S5      eU
(       a  Uc
  [        5       nUR                   u  nnnUb  UR#                  5       OSnUU-   nUc0  [$        R&                  " UUUR                   S   -   UR                   S9nUbG  UcD  UR)                  5       R+                  S	5      S-
  nUR-                  US:H  S5        USS2U* S24   nOUc  UR/                  S5      n[1        XgU4 Vs/ sH  nUSL PM	     sn5      S
:w  a  [        S5      eUbw  UR3                  U R4                  US9nUR                   SS
 u  nnUR7                  5       R8                  " UU-  /UR                   S
S Q76 nU R;                  XmS9R<                  nOHUbE  UR?                  5       u  nnnnUR3                  U R4                  US9nUR9                  UU-  UU5      nU R                  R@                  (       aO  Uc4  U RC                  W5      nUR?                  S5      UR?                  S
5      nnOUR?                  5       u  nnnnUnO1Uc#  WR?                  S5      UR?                  S
5      nnO[        S5      eUR9                  UWU-  U5      nU	R?                  S5      nU	R/                  S	5      n	U	RE                  SSSU5      n	U	R9                  UUUU-  5      n	UbB  UR?                  5       u  nnnUU4nU	c  [$        RF                  " UUS9n	U RI                  U	5      n	OSn	U	S:H  RK                  S	S9R3                  U R4                  S9RM                  SS9R3                  U5      n Uc0  [$        RF                  " UU4[$        RN                  UR                   S9nU RQ                  X%XU5      nUn!U(       a  SOSn"U(       a  SOSn#[S        U RT                  5       H  u  n$n%U(       a  U"U!4-  n"U$U RV                  -  S:X  a2  U RX                  U$U RV                  -     n&U&" U!UU4U	U UU
SS.UD6n'U'S   n!U%" U!4UUUUU
US.UD6n(U(S   n!U(       d  Mv  U#U(S   4-  n#M     U R[                  U!5      n!U(       a  U"U!4-  n"UR9                  UUUU5      n[]        U!UU"U#US9$ s  snf )a*  
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
    The output of the image encoder.
perceiver_embeddings (`torch.FloatTensor`, *optional*):
    The output of the perceiver resampler.
image_attention_mask (`torch.LongTensor`, *optional*):
    The attention mask for the image encoder.
Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r   rK   r?   r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)rr   rK   )r@   r  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer4  r   r   r+   )rC   rk  r.  r/  r-  )rE   r   r-  r.  r/  r0  )r&   r'   r(   r)   r*   )/rK   r  r.  r  r/  use_return_dictrt   r  r  r  r  r  r[   typer   r   rG   get_seq_lengthr1   rF   longcumsummasked_fill_r   sumrJ   rr   r  rH   r  r&   r5  r  r  rI   r   invert_attention_maskrb   squeezer   _update_causal_mask	enumerater  r  r  r  r$   ))rz   rN   rE   r   r'   r  r@   rA   rB   rC   r/  r.  r  r  r  r0  r{   rK   
batch_size
seq_lengthr;  past_key_values_lengthseq_length_with_pastr   
num_imagesr*   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaperk  r(   all_hidden_statesall_self_attnsidxdecoder_layercross_attn_blockrT  layer_outputss)                                            r7   r   IdeficsModel.forward  s_   : &/%:!!@T@T1B1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M /DJ+>??abb0*nO$1$7$7!
JETE`!?!?!Afg),BB!"\\&(>ATATUVAW(W`m`t`tN %,*>)..077;a?L%%n&91='J;<8L!)33A6LLL`#ab#aaT	#abcghhq  %'??F?KL%1%7%7%;"J
'22499*z:QkT`TfTfghgiTjkL #'"3"3) #4 #   &1G_GdGdGfDJ
M3D":"="=DJJW]"="^"5":"::
;RTact"u;;$$#+'+'?'?@S'T$3G3L3LQ3OQeQjQjklQm00K_KdKdKfH
J7H"6!)/B/G/G/JL_LdLdefLg,M,abb166z:P]C]_pq ,0033==bA3::1aMR388\S]`mSmn*9L9Q9Q9S63Q"24I!J#+',zz2DV'T$#'#=#=>R#S #'  $83#>"C"C"C"K!O!OVZV`V`!O!a j jop j quu 

 !"ZZ12%**]MaMaN 11>L]
 & #7BD0d"+DKK"8C#!m%55! T...!3#'#?#?tG`G`@`#a *!"'
 *>)=&7'#'
 
 !(
)	-)."3#-	 	M *!,M  =#3"55C #9F 		-0  -!11166z:}^op-+++% 3
 	
m cs   :Vr    input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r4  flex_attentionr   Fr3  )r  r  is_trainingr   r?   )sequence_lengthtarget_lengthrr   r0  r  )cudaxpunpu)r  r8  rb   r[   r1   r   r!   r  is_compileabler   _ignore_causal_mask_sdpar  rr   rG   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrK   r  finfomin_unmask_unattended)rz   rE   r  r0  r'   r.  past_seen_tokensusing_compilable_cacherr   r  r  causal_mask	min_dtypes                r7   r   IdeficsModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr6   r  r  rr   r  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuerr   rK   r   )diagonalr  r?   r   )r   r1   r  r  fullrK   triurF   r9  expandr   rG   rJ   rm  )rE   r  r  rr   r0  r  r{   r  r  mask_lengthpadding_masks              r7   r  BIdeficsModel._prepare_4d_causal_attention_mask_with_cache_position  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r6   )r  r  r  r  r  r  r  r  rs   r  r  r  r  rZ   )NNNNNNNNNNNNFNNF) r,   r-   r.   r/   r0   r   rv   r  r  r  r   r   r   r1   rE  r   r   r2   r   r   r   r   r4   r$   r   r  staticmethodr   rr   r  r5   r   r   s   @r7   r  r    so   0,} 0,df 46 F 68 M  151537+/5948@D<@7;$(,0/338&*59!E
E,,-E
 !.E
 u//0	E

 "%E
   1 12E
 u001E
 #+5+<+<"=E
 'u'8'89E
 'u||4E
 D>E
 $D>E
 'tnE
 #+4.E
 d^E
  !!1!12!E
" -.#E
$ 
u44	5%E
  E
\ #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r6   r  c            )       d  ^  \ rS rSrSS/rS!U 4S jjrS rS rS r\	\
                S"S\\R                     S	\\R                     S
\\R                     S\\   S\\R                      S\\R                      S\\R                      S\\R                      S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\R                     S\\   S\\\4   4$S jj5       5       r         S#U 4S jjr S$S\S\\\4   S\S\\\4   4U 4S jjjrS rU =r$ )%IdeficsForVisionText2Texti+  zmodel.embed_tokens.weightzlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [	        UR
                  UR                  UR                  SUR                  S9U l	        U R                  5         g )NFr   )ru   rv   r  rd   r   r   r  r  freeze_lm_headlm_headr  )rz   r  r  r|   s      r7   rv   "IdeficsForVisionText2Text.__init__.  s[     !&)
-****$*$@$@#22
 	r6   c                     Xl         g rZ   rd   )rz   decoders     r7   set_decoder%IdeficsForVisionText2Text.set_decoder=  s    
r6   c                     U R                   $ rZ   r  r   s    r7   get_decoder%IdeficsForVisionText2Text.get_decoder@  s    zzr6   c                    U R                  5       nU R                  5       n[        U R                  SS5      (       ab  UR                  Ul        UR
                  S:  aA  UR                  UR
                  :X  d   eUR                  R                  UR                  l        [        US5      (       aY  [        US5      (       aG  UR                  Ul        [        US5      (       a$  [        US5      (       a  UR
                  Ul        ggggg)	z
Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
tie_word_embeddingsTr   r   rp   r   rw   N)get_output_embeddingsget_input_embeddingsgetattrr  rx   rw   r   ry   r   r  rp   r   )rz   output_embeddingsinput_embeddingss      r7   tie_weights%IdeficsForVisionText2Text.tie_weightsC  s    
 !6684464;; 5t<<'7'>'>$99A=(@@DTDnDnnnn9I9^9^9e9e!//6$n55'BRTd:e:e-=-L-L*(*CDD "=J J =M<f<f!9JD ;f5r6   rN   rE   r   r'   r  r@   rA   rB   rC   labelsr/  r.  r  r  r  r0  r{   rn   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUUU	UUUUSUS.UD6nUS   nU R                  U5      nSnU
b)  U R                  " SUXR                   R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )aK  
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
    The output of the image encoder.
perceiver_embeddings (`torch.FloatTensor`, *optional*):
    The output of the perceiver resampler.
image_attention_mask (`torch.LongTensor`, *optional*):
    The attention mask for the image encoder.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoProcessor, IdeficsForVisionText2Text

>>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
>>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

>>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
>>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

>>> prompts = [
...     [
...         "User:",
...         dogs_image_url_1,
...         "Describe this image.\nAssistant: An image of two dogs.\n",
...         "User:",
...         dogs_image_url_2,
...         "Describe this image.\nAssistant:",
...     ]
... ]
>>> inputs = processor(prompts, return_tensors="pt")
>>> generate_ids = model.generate(**inputs, max_new_tokens=6)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)
```NT)rN   rE   r   r'   r  r@   rA   rB   rC   r/  r.  r  r  r  r0  r   )r<   r  r  )r;   r<   r'   r(   r)   r*   r+   )r  r.  r  r  rd   r  loss_functionr  r9   r'   r(   r)   r*   )rz   rN   rE   r   r'   r  r@   rA   rB   rC   r  r/  r.  r  r  r  r0  r{   rT  r(   r<   r;   s                         r7   r   !IdeficsForVisionText2Text.forwardX  s   x 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ** 
)%+'%%=!5!5/!5%=)
  !
&  
m,%%pVF{{OeOepiopD,#33!//)) ' ; ;
 	
r6   c                   > 0 nUb%  U R                   R                  (       a  XS'   O	XS'   OX|S'   UR                  SS5      US'   [        TU ]  " U4UUUUUU
U	S.UDUD6nU	b$  Uc!  US   R
                  S   nU	S S 2U* S 24   US	'   U$ )
NrB   rA   r@   r  F)r'   rE   r  r0  r   r/  rC   rN   r   rC   )r  r  popru   prepare_inputs_for_generationrG   )rz   rN   rE   r   r  r'   r0  r@   r*   rC   r/  r{   images_kwargsmodel_inputsr  r|   s                  r7   r  7IdeficsForVisionText2Text.prepare_inputs_for_generation  s      *{{((8K45<O89,8.)4:JJ?Y[`4a01w<
+)')%!5
 
 
  +0E%k288;J3GJ;<3XL/0r6   rT  rR   rP   c                   > [         TU ]  " UUU40 UD6nSU;   aU  US   nUS S 2SS S 24   R                  S5      nUR                  SS5      (       a  XbS'   O[        R
                  " XV/SS9US'   UR                  US'   U$ )NrC   r?   r   r/  Tr   r*   )ru   #_update_model_kwargs_for_generationr   rM   r1   r   r*   )rz   rT  rR   rP   r{   rC   	last_maskr|   s          r7   r  =IdeficsForVisionText2Text._update_model_kwargs_for_generation  s     wB
 	
 "\1#/0F#G ,QAX6@@CIT227@347<yyBVAbhi7j34 /6.I.I*+r6   )r  rd   rZ   )NNNNNNNNNNNNNFNN)	NNNNNNNNNr  ) r,   r-   r.   r/   _tied_weights_keysrv   r  r  r  r   r   r   r1   rE  r   r   r2   r   r   r   r   r4   r9   r   r  r   dictr   r   r  r5   r   r   s   @r7   r  r  +  s4   57GHg*  151537+/5948@D<@7;-1$(,0/338&*59#b
E,,-b
 !.b
 u//0	b

 "%b
   1 12b
 u001b
 #+5+<+<"=b
 'u'8'89b
 'u||4b
 ))*b
 D>b
 $D>b
 'tnb
 #+4.b
  d^!b
" !!1!12#b
$ +,%b
& 
u33	4'b
  b
N  !+b $)	 38n !	 
c3h r6   r  )r  r  rp  )r   FNN)r   )r4  )Or0   dataclassesr   typingr   r   r   r   r1   torch.nn.functionalr   r  r   torch.utils.checkpointactivationsr
   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   utilsr   r   r   r   r   configuration_ideficsr   	perceiverr   visionr   r   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   
get_loggerr,   r  r$   r9   rT   ri   rX   rk   rW   r   Moduler   r   r   r   r   r   r   r  r  rG  rW  rp  r  r  __all__r+   r6   r7   <module>r-     sI  (  ! 1 1      ! . ) > B 9 + X X & p p 0 0 E  !!;J 
		H	% 
C[ C C< 
CK C C: *#Z +- fA fAR8SRYY 8SxJRYY J0$
uxx $
N(:P P2 %II%<<% 
% <<	%
 U\\*% % %0a)ryy a)JK4 K\K&@ K\ **_ ** **Z O) O OdX 6 Xv Rr6   