
    <h'                     *   S r SSKJr  SSKrSSKJr  SSKJr   " S S	\R                  R                  R                  5      r " S
 S\R                  R                  R                  5      r " S S\R                  R                  R                  5      rg)a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

"""

from typing import Optional

import tensorflow as tf

from ...modeling_tf_utils import shape_list
from .configuration_idefics import IdeficsConfig


class TFIdeficsPerceiverResampler(tf.keras.layers.Layer):
    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int, **kwargs
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. `embed_dim` is the dimensionality of the embeddings
        being fed to the Perceiver Resampler (and also of the latent embeddings it *returns*); it could be e.g. the
        ViT embed_dim, the ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        """
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # MLP hidden size: 4x the vision embedding dim when one is configured, else 4x our own embed_dim.
        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
        # Each block is a (cross-attention, feed-forward) pair; both are applied with residuals in `call`.
        self.blocks = []
        for i in range(depth):
            self.blocks.append(
                [
                    TFIdeficsPerceiverAttention(
                        self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms, name=f"blocks.{i}.0"
                    ),
                    TFIdeficsMLP(self.intermediate_dim, config, name=f"blocks.{i}.1"),
                ]
            )
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    def build(self, input_shape):
        # The latent queries are a learned [n_latents, embed_dim] matrix, trained end-to-end.
        self.latents = self.add_weight(
            shape=(self.n_latents, self.embed_dim), initializer="random_normal", trainable=True, name="latents"
        )
        super().build(input_shape)

    def call(self, context: tf.Tensor) -> tf.Tensor:
        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
        # Broadcast the learned latents to [bsz, n_latents, embed_dim].
        latents = tf.expand_dims(self.latents, axis=0)
        latents = tf.tile(latents, [tf.shape(context)[0], 1, 1])
        for attn, ff in self.blocks:
            latents = attn(context, latents) + latents
            latents = ff(latents) + latents
        return self.layer_norm(latents)


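# A quick shape sketch (illustrative values, not from the original module): with embed_dim=64
# and n_latents=8, a [2, 100, 64] context tensor is compressed to [2, 8, 64] -- the output
# length depends only on n_latents, never on the input sequence length, which is what lets
# downstream cross-attention run at a fixed cost however many visual tokens come in.
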
U =r$ )r&   o   r   r   r   r   r   Nc                   > [         TU ]  " S0 UD6  XUsU l        U l        U l        X@l        [        R                  R                  R                  SSS9U l
        [        R                  R                  R                  SSS9U l        U R
                  (       aZ  [        R                  R                  R                  SSS9U l        [        R                  R                  R                  SSS9U l        U R                  S-  U l        [        R                  R                  R                  U R                  U R                  -  SS	S
9U l        [        R                  R                  R                  U R                  U R                  -  SSS
9U l        [        R                  R                  R                  U R                  U R                  -  SSS
9U l        [        R                  R                  R                  USSS
9U l        g)ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`r   context_layer_normr   latents_layer_normq_layer_normk_layer_normg      Fq_projuse_biasr   k_projv_projoutput_projNr   )r   r   r   r   r   r   r(   r)   r*   r+   rS   rT   rU   rV   qk_scaleDenserW   rZ   r[   r\   )r,   r   r   r   r   r-   r/   s         r0   r   $TFIdeficsPerceiverAttention.__init__p   s   "6"6?(3dm,"$((//"D"DTXl"D"m"$((//"D"DTXl"D"m " B B4Vd B eD " B B4Vd B eDt+ hhoo++DLL4==,HSX_g+hhhoo++DLL4==,HSX_g+hhhoo++DLL4==,HSX_g+h88??00UQ^0_r2   r=   r5   c                    U R                  U5      nU R                  U5      n[        U5      u  p4nU R                  U5      nU R	                  [
        R                  " X/SS95      nU R                  [
        R                  " X/SS95      nXgU4 V	s/ sHS  n	[
        R                  " [
        R                  " XU	R                  S   U R                  U R                  45      / SQS9PMU     sn	u  pgnU R                  (       a"  U R                  U5      nU R                  U5      n[
        R                   " SX`R"                  -  U5      n
U
[
        R$                  " U
SSS	9-
  n[
        R&                  R)                  USS9n[
        R                   " S
X5      nU R+                  [
        R                  " [
        R                  " U/ SQS9USU R                  U R                  -  45      5      $ s  sn	f )a  
        Runs Perceiver cross-attention, with the (context, latents) concatenation serving as keys and values along
        the `seq` dimension.

        Args:
            context (`tf.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`tf.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `tf.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
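        # Flamingo detail: queries come from the latents alone, while keys/values are computed over the
        # concatenation [context; latents]. Queries therefore have seq = n_latents, and keys/values have
        # seq = seq_length + n_latents, so each latent can attend to the full context *and* its peers.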
r?   r   )r      r   r   )permz... i d, ... j d -> ... i jT)r@   keepdimsz... i j, ... j d -> ... i d)rS   rT   r   rW   rZ   r(   concatr[   	transposereshaper6   r   r   r   rU   rV   einsumr]   
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = shape_list(context)

        q = self.q_proj(latents)
        k = self.k_proj(tf.concat([context, latents], axis=-2))
        v = self.v_proj(tf.concat([context, latents], axis=-2))

        # Split heads: [bsz, seq, n_heads * head_dim] -> [bsz, n_heads, seq, head_dim]
        q, k, v = [
            tf.transpose(tf.reshape(x, (batch_size, x.shape[1], self.n_heads, self.head_dim)), perm=[0, 2, 1, 3])
            for x in (q, k, v)
        ]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # Scaled dot-product attention with a numerically stable softmax (subtract the row-wise max first).
        scores = tf.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - tf.reduce_max(scores, axis=-1, keepdims=True)
        attn = tf.nn.softmax(stabilized_scores, axis=-1)

        # Attend, merge the heads back together, and project down to embed_dim.
        resampled = tf.einsum("... i j, ... j d -> ... i d", attn, v)
        return self.output_proj(
            tf.reshape(tf.transpose(resampled, perm=[0, 2, 1, 3]), (batch_size, -1, self.n_heads * self.head_dim))
        )


SrU =r$ )r'      r   c                   > [         TU ]  " S0 UD6  UR                  R                  U l        [        R
                  R                  R                  SSS9U l        [        R
                  R                  R                  USSS9U l
        [        R
                  R                  R                  SS9U l        [        R
                  R                  R                  U R                  SS	S9U l        g
)z:Simple MLP block with intermediate_size and embedding sizer   lnr   FfcrX   actr   c_projNr   )r   r   r!   r   r(   r)   r*   r+   r{   r^   r|   ReLUr}   r~   )r,   intermediate_sizer   r-   r/   s       r0   r   TFIdeficsMLP.__init__   s    "6"--77((//44T4M((//''(9EPT'U88??''U'3hhoo++DNNUQY+Zr2   hidden_statesr   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ )N)r{   r|   r}   r~   )r,   r   s     r0   rE   TFIdeficsMLP.call   s@    ../M2r2   )r}   r~   r   r|   r{   )rG   rH   rI   rJ   r   r   r   tupler(   rL   rE   rM   rN   rO   s   @r0   r'   r'      s;    [- [(5+;"<   r2   r'   )__doc__typingr   
tensorflowr(   modeling_tf_utilsr   configuration_ideficsr   r)   r*   Layerr	   r&   r'   r   r2   r0   <module>r      sj   4   + 0<("((//"7"7 <(~A
"((//"7"7 A
H288??(( r2   