
    <h$                         S r SSKJr  SSKrSSKJr  SSKJr   " S S\R                  5      r	 " S S	\R                  5      r
 " S
 S\R                  5      rg)a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

    )OptionalN   )IdeficsConfigc                      ^  \ rS rSrS\S\S\S\S\S\SS	4U 4S
 jjrS\R                  S\R                  4S jr	Sr
U =r$ )IdeficsPerceiverResampler0   config	embed_dimdepthn_headshead_dim	n_latentsreturnNc                    > [         TU ]  5         X$XV4u  U l        U l        U l        U l        UR                  R                  U l        [        R                  " [        R                  " U R
                  U R                  5      SS9U l        [        UR                  S5      (       d  U R                  S-  OUR                  R                  S-  U l        [        R"                  " [%        U5       Vs/ sHc  n[        R"                  " ['        U R                  U R                  U R                  U R                  5      [)        U R                   U5      /5      PMe     sn5      U l        [        R,                  " U R                  5      U l        gs  snf )a  
Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
Could be e.g., VIT embed_dim, ResNet pool dim, and so on.

Args:
    config (`IdeficsConfig`): config object
    embed_dim (`int`): The size of each embedding vector
    depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
    n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
    head_dim (`int`): Dimensionality of each head projection in the Transformer block.
    n_latents (`int`):
        Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

T)requires_gradr
      N)super__init__r
   r   r   r   perceiver_configqk_layer_norms_perceiverqk_layer_normsnn	Parametertorchrandnlatentshasattrvision_configintermediate_dim
ModuleListrangeIdeficsPerceiverAttention
IdeficsMLPblocks	LayerNorm
layer_norm)	selfr	   r
   r   r   r   r   _	__class__s	           ]/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/idefics/perceiver.pyr   "IdeficsPerceiverResampler.__init__1   s;   ( 	FOZbFmCdmT^$55NN ||EKK$O_cd 6//== NNQ%%//!3 	 mm u &A 1$..$,,PTP]P]_c_r_rs"4#8#8&A &

 ,,t~~6s   0A)Fcontextc                     U R                   R                  UR                  S   SS5      nU R                   H  u  p4U" X5      U-   nU" U5      U-   nM     U R	                  U5      $ )zWResample arbitrary length context & *compress* down to self.n_latents latent embeddingsr   r   )r   repeatshaper$   r&   )r'   r,   r   attnffs        r*   forward!IdeficsPerceiverResampler.forward_   se     ,,%%gmmA&61= HD7,w6GkG+G $ w''    )	r$   r
   r   r   r   r&   r   r   r   )__name__
__module____qualname____firstlineno__r   intr   r   Tensorr2   __static_attributes____classcell__r)   s   @r*   r   r   0   sa    ,7#,703,7<?,7JM,7Y\,7il,7	,7\
(u|| 
( 
( 
(r4   r   c            
          ^  \ rS rSrS\S\S\S\SS4
U 4S jjrS	\R                  S
\R                  S\R                  4S jr	Sr
U =r$ )r"   l   r
   r   r   r   r   Nc                   > [         TU ]  5         XUsU l        U l        U l        X@l        [        R                  " U R                  5      U l        [        R                  " U R                  5      U l	        U R
                  (       aJ  [        R                  " U R                  5      U l
        [        R                  " U R                  5      U l        U R                  S-  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  USS9U l        g)ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`g      FbiasN)r   r   r
   r   r   r   r   r%   context_layer_normlatents_layer_normq_layer_normk_layer_normqk_scaleLinearq_projk_projv_projoutput_proj)r'   r
   r   r   r   r)   s        r*   r   "IdeficsPerceiverAttention.__init__m   s,   6?(3dm,"$,,t~~">"$,,t~~"> "T]] ;D "T]] ;Dt+ iit}}0LSXYiit}}0LSXYiit}}0LSXY99T\\DMM%A9SXYr4   r,   r   c           	      t   U R                  U5      nU R                  U5      nUR                  SS u  p4nU R                  U5      nU R	                  [
        R                  " X/SS95      nU R                  [
        R                  " X/SS95      nXgU4 V	s/ sHF  oR                  X9R                  S   U R                  U R                  5      R                  SS5      PMH     sn	u  pgnU R                  (       a"  U R                  U5      nU R                  U5      n[
        R                  " SX`R                   -  U5      n
XR#                  SS	S
9R%                  5       -
  nUR'                  SS9n[
        R                  " SX5      nU R)                  UR                  SS5      R+                  S5      5      $ s  sn	f )a  
Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

Args:
    context (`torch.Tensor`):
        Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
    latents (`torch.Tensor`):
        Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

Returns:
    `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
    from context.
N   )dimr      z... i d, ... j d -> ... i jT)rQ   keepdimz... i j, ... j d -> ... i d)rC   rD   r/   rI   rJ   r   catrK   reshaper   r   	transposer   rE   rF   einsumrG   amaxdetachsoftmaxrL   flatten)r'   r,   r   
batch_size
seq_lengthr
   qkvxscoresstabilized_scoresr0   	resampleds                 r*   r2   !IdeficsPerceiverAttention.forward   s    ))'2))'2,3MM"1,=)
	 KK KK		7"4"=>KK		7"4"=>
 mnrsktuktfg99ZT\\4==Q[[\]_`aktua!!!$A!!!$A;Q=NPQR"kkb$k&G&N&N&PQ ((R(0 LL!>H		 3 3Aq 9 A A" EFF vs   AF5)rC   r
   r   rF   rJ   rD   r   rL   rE   rI   r   rG   rK   )r5   r6   r7   r8   r9   boolr   r   r:   r2   r;   r<   r=   s   @r*   r"   r"   l   sc    Z# Z Zs ZTX Z]a Z*(Gu|| (Gell (Gu|| (G (Gr4   r"   c                   v   ^  \ rS rSrS\4U 4S jjrS\\\R                        S\R                  4S jr
SrU =r$ )r#      r	   c                 h  > [         TU ]  5         UR                  R                  U l        [        R
                  " U R                  5      U l        [        R                  " U R                  USS9U l        [        R                  " 5       U l
        [        R                  " XR                  SS9U l        g)z:Simple MLP block with intermediate_size and embedding sizeFrA   N)r   r   r   r
   r   r%   lnrH   fcReLUactc_proj)r'   intermediate_sizer	   r)   s      r*   r   IdeficsMLP.__init__   sr    --77,,t~~.))DNN,=EJ779ii 1>>Nr4   hidden_statesr   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ )N)rk   rl   rn   ro   )r'   rr   s     r*   r2   IdeficsMLP.forward   s@    ../M2r4   )rn   ro   r
   rl   rk   )r5   r6   r7   r8   r   r   r   tupler   FloatTensorr2   r;   r<   r=   s   @r*   r#   r#      s?    O- OXeE4E4E.F%G EL]L]  r4   r#   )__doc__typingr   r   torch.nnr   configuration_ideficsr   Moduler   r"   r#    r4   r*   <module>r}      sL   4    09(		 9(x>G		 >GB r4   