
    <h'                     *   S r SSKJr  SSKrSSKJr  SSKJr   " S S	\R                  R                  R                  5      r " S
 S\R                  R                  R                  5      r " S S\R                  R                  R                  5      rg)a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

"""

from typing import Optional

import tensorflow as tf

from ...modeling_tf_utils import shape_list
from .configuration_idefics import IdeficsConfig


class TFIdeficsPerceiverResampler(tf.keras.layers.Layer):
    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int, **kwargs
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. `embed_dim` is the dimensionality of the embeddings
        being fed to the Perceiver Resampler (and also of the latent embeddings it *returns*); it could be e.g. the
        ViT embed_dim, the ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        """
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # MLP hidden size: 4x the vision embedding dim when one is configured, else 4x our own embed_dim.
        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
        # Each block is a (cross-attention, feed-forward) pair; both are applied with residuals in `call`.
        self.blocks = []
        for i in range(depth):
            self.blocks.append(
                [
                    TFIdeficsPerceiverAttention(
                        self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms, name=f"blocks.{i}.0"
                    ),
                    TFIdeficsMLP(self.intermediate_dim, config, name=f"blocks.{i}.1"),
                ]
            )
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    def build(self, input_shape):
        # The latent queries are a learned [n_latents, embed_dim] matrix, trained end-to-end.
        self.latents = self.add_weight(
            shape=(self.n_latents, self.embed_dim), initializer="random_normal", trainable=True, name="latents"
        )
        super().build(input_shape)

    def call(self, context: tf.Tensor) -> tf.Tensor:
        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
        # Broadcast the learned latents to [bsz, n_latents, embed_dim].
        latents = tf.expand_dims(self.latents, axis=0)
        latents = tf.tile(latents, [tf.shape(context)[0], 1, 1])
        for attn, ff in self.blocks:
            latents = attn(context, latents) + latents
            latents = ff(latents) + latents
        return self.layer_norm(latents)


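# A quick shape sketch (illustrative values, not from the original module): with embed_dim=64
# and n_latents=8, a [2, 100, 64] context tensor is compressed to [2, 8, 64] -- the output
# length depends only on n_latents, never on the input sequence length, which is what lets
# downstream cross-attention run at a fixed cost however many visual tokens come in.
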
U =r$ )r&   o   r   r   r   r   r   Nc                   > [         TU ]  " S0 UD6  XUsU l        U l        U l        X@l        [        R                  R                  R                  SSS9U l
        [        R                  R                  R                  SSS9U l        U R
                  (       aZ  [        R                  R                  R                  SSS9U l        [        R                  R                  R                  SSS9U l        U R                  S-  U l        [        R                  R                  R                  U R                  U R                  -  SS	S
9U l        [        R                  R                  R                  U R                  U R                  -  SSS
9U l        [        R                  R                  R                  U R                  U R                  -  SSS
9U l        [        R                  R                  R                  USSS
9U l        g)ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`r   context_layer_normr   latents_layer_normq_layer_normk_layer_normg      Fq_projuse_biasr   k_projv_projoutput_projNr   )r   r   r   r   r   r   r(   r)   r*   r+   rS   rT   rU   rV   qk_scaleDenserW   rZ   r[   r\   )r,   r   r   r   r   r-   r/   s         r0   r   $TFIdeficsPerceiverAttention.__init__p   s   "6"6?(3dm,"$((//"D"DTXl"D"m"$((//"D"DTXl"D"m " B B4Vd B eD " B B4Vd B eDt+ hhoo++DLL4==,HSX_g+hhhoo++DLL4==,HSX_g+hhhoo++DLL4==,HSX_g+h88??00UQ^0_r2   r=   r5   c                    U R                  U5      nU R                  U5      n[        U5      u  p4nU R                  U5      nU R	                  [
        R                  " X/SS95      nU R                  [
        R                  " X/SS95      nXgU4 V	s/ sHS  n	[
        R                  " [
        R                  " XU	R                  S   U R                  U R                  45      / SQS9PMU     sn	u  pgnU R                  (       a"  U R                  U5      nU R                  U5      n[
        R                   " SX`R"                  -  U5      n
U
[
        R$                  " U
SSS	9-
  n[
        R&                  R)                  USS9n[
        R                   " S
X5      nU R+                  [
        R                  " [
        R                  " U/ SQS9USU R                  U R                  -  45      5      $ s  sn	f )a  
        Runs Perceiver cross-attention, with the (context, latents) concatenation serving as keys and values along
        the `seq` dimension.

        Args:
            context (`tf.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`tf.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `tf.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
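        # Flamingo detail: queries come from the latents alone, while keys/values are computed over the
        # concatenation [context; latents]. Queries therefore have seq = n_latents, and keys/values have
        # seq = seq_length + n_latents, so each latent can attend to the full context *and* its peers.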
r?   r   )r      r   r   )permz... i d, ... j d -> ... i jT)r@   keepdimsz... i j, ... j d -> ... i d)rS   rT   r   rW   rZ   r(   concatr[   	transposereshaper6   r   r   r   rU   rV   einsumr]   
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = shape_list(context)

        q = self.q_proj(latents)
        k = self.k_proj(tf.concat([context, latents], axis=-2))
        v = self.v_proj(tf.concat([context, latents], axis=-2))

        # Split heads: [bsz, seq, n_heads * head_dim] -> [bsz, n_heads, seq, head_dim]
        q, k, v = [
            tf.transpose(tf.reshape(x, (batch_size, x.shape[1], self.n_heads, self.head_dim)), perm=[0, 2, 1, 3])
            for x in (q, k, v)
        ]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # Scaled dot-product attention with a numerically stable softmax (subtract the row-wise max first).
        scores = tf.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - tf.reduce_max(scores, axis=-1, keepdims=True)
        attn = tf.nn.softmax(stabilized_scores, axis=-1)

        # Attend, merge the heads back together, and project down to embed_dim.
        resampled = tf.einsum("... i j, ... j d -> ... i d", attn, v)
        return self.output_proj(
            tf.reshape(tf.transpose(resampled, perm=[0, 2, 1, 3]), (batch_size, -1, self.n_heads * self.head_dim))
        )


SrU =r$ )r'      r   c                   > [         TU ]  " S0 UD6  UR                  R                  U l        [        R
                  R                  R                  SSS9U l        [        R
                  R                  R                  USSS9U l
        [        R
                  R                  R                  SS9U l        [        R
                  R                  R                  U R                  SS	S9U l        g
)z:Simple MLP block with intermediate_size and embedding sizer   lnr   FfcrX   actr   c_projNr   )r   r   r!   r   r(   r)   r*   r+   r{   r^   r|   ReLUr}   r~   )r,   intermediate_sizer   r-   r/   s       r0   r   TFIdeficsMLP.__init__   s    "6"--77((//44T4M((//''(9EPT'U88??''U'3hhoo++DNNUQY+Zr2   hidden_statesr   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ )N)r{   r|   r}   r~   )r,   r   s     r0   rE   TFIdeficsMLP.call   s@    ../M2r2   )r}   r~   r   r|   r{   )rG   rH   rI   rJ   r   r   r   tupler(   rL   rE   rM   rN   rO   s   @r0   r'   r'      s;    [- [(5+;"<   r2   r'   )__doc__typingr   
tensorflowr(   modeling_tf_utilsr   configuration_ideficsr   r)   r*   Layerr	   r&   r'   r   r2   r0   <module>r      sj   4   + 0<("((//"7"7 <(~A
"((//"7"7 A
H288??(( r2   