
    <hu                        S SK r S SKJr  S SKJrJrJr  S SKrS SKJ	r	  S SK
J	s  Jr  S SKJr  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/J0r0  SSK1J2r2  SSK3J4r4J5r5  \0Rl                  " \75      r8 " S S\	Rr                  5      r: " S S\	Rr                  5      r; " S S\R                  Rr                  5      r< " S S\	Rr                  5      r= " S S\	R|                  5      r?\" S5       " S  S\	Rr                  5      5       r@ " S! S"\	Rr                  5      rAS#\R                  S$\R                  S%\R                  S&\C\R                  \R                  4   4S' jrDS(\R                  S)\ES&\R                  4S* jrF S]S+\	Rr                  S,\R                  S-\R                  S.\R                  S/\\R                     S0\GS1\G4S2 jjrH S]S+\	Rr                  S,\R                  S-\R                  S.\R                  S/\\R                     S0\GS1\G4S3 jjrI " S4 S5\	Rr                  5      rJ " S6 S7\5      rK\. " S8 S9\)5      5       rL\. " S: S;\L5      5       rM " S< S=\L\5      rN\\." S>S?9 " S@ SA\#5      5       5       rO " SB SC\R                  Rr                  5      rP " SD SE\	Rr                  5      rQSF rR " SG SH\	Rr                  5      rSSI\R                  S,\R                  4SJ jrTS,\R                  S-\R                  SI\R                  S&\C\R                  \R                  4   4SK jrU " SL SM\	Rr                  5      rV " SN SO\	Rr                  5      rW " SP SQ\5      rX " SR SS\	Rr                  5      rY " ST SU\	Rr                  5      rZ " SV SW\	Rr                  5      r[ " SX SY\L5      r\ " SZ S[\L\5      r]/ S\Qr^g)^    N)	dataclass)CallableOptionalUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)check_model_inputs   )Llama4ConfigLlama4TextConfigc                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Llama4TextExperts-   configc                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U l        [        R                  " [        R                  " U R                  U R
                  SU R                  -  5      5      U l        [        R                  " [        R                  " U R                  U R                  U R
                  45      5      U l        [        UR                     U l        g N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr	   
hidden_actact_fnselfr&   	__class__s     b/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/llama4/modeling_llama4.pyr+   Llama4TextExperts.__init__.   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 n   UR                  U R                  R                  S   SU R                  5      n[        R
                  " XR                  5      nUR                  SSS9u  p4[        R
                  " X@R                  U5      -  U R                  5      nUR                  SU R                  5      nU$ )a  
This should really not be run on a single machine, as we are reaching compute bound:
- the inputs are expected to be "sorted" per expert already.
- the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

Args:
    hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
    selected_experts (torch.Tensor): (batch_size * token_num, top_k)
    routing_weights (torch.Tensor): (batch_size * token_num, top_k)
Returns:
    torch.Tensor
r   r)   dim)	viewr5   shaper/   r3   bmmchunkr8   r6   )r:   r?   gate_upgateupnext_statess         r<   forwardLlama4TextExperts.forward8   s     &**4+<+<+B+B1+Er4K[K[\))M+<+<====+iikk$&7!7$..I!&&r4+;+;<r>   )r8   r6   r0   r5   r/   r.   r-   )__name__
__module____qualname____firstlineno__r"   r+   r3   TensorrM   __static_attributes____classcell__r;   s   @r<   r$   r$   -   s0    0/ 0U\\ ell  r>   r$   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Llama4TextMLPN   c                 X  > [         TU ]  5         Uc  UR                  nXl        [        R
                  " UR                  USS9U l        [        R
                  " UR                  USS9U l        [        R
                  " X!R                  SS9U l	        [        UR                     U l        g NFbias)r*   r+   r.   r&   r1   Linearr/   	gate_projup_projr6   r	   r7   activation_fn)r:   r&   r.   r;   s      r<   r+   Llama4TextMLP.__init__O   s    $ & 8 86#5#57HuUyy!3!35FUS#46H6HuU#F$5$56r>   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      $ N)ra   r_   r`   r6   )r:   xr6   s      r<   rM   Llama4TextMLP.forward[   s7    &&t~~a'89DLLOK	~~i((r>   )ra   r&   r6   r_   r`   rd   rO   rP   rQ   rR   r+   rM   rT   rU   rV   s   @r<   rX   rX   N   s    
7) )r>   rX   c                   F   ^  \ rS rSrSS\4U 4S jjjrS rS rS rSr	U =r
$ )	Llama4TextL2Norm`   epsc                 .   > [         TU ]  5         Xl        g rd   )r*   r+   rk   )r:   rk   r;   s     r<   r+   Llama4TextL2Norm.__init__a   s    r>   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ Nr)   rB   T)keepdimr3   rsqrtpowmeanrk   r:   re   s     r<   _normLlama4TextL2Norm._norme   4    5;;quuQx}}R}>IJJJr>   c                 ^    U R                  UR                  5       5      R                  U5      $ rd   )rv   floattype_asru   s     r<   rM   Llama4TextL2Norm.forwardh   s"    zz!'')$,,Q//r>   c                      SU R                    3$ )Nzeps=rk   r:   s    r<   
extra_reprLlama4TextL2Norm.extra_reprk   s    dhhZ  r>   r~   )gư>)rO   rP   rQ   rR   rz   r+   rv   rM   r   rT   rU   rV   s   @r<   ri   ri   `   s)    E  K0! !r>   ri   c                   >   ^  \ rS rSrSU 4S jjrS rS rS rSrU =r	$ )Llama4TextRMSNormo   c                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g)z,
Llama4RMSNorm is equivalent to T5LayerNorm
N)r*   r+   rk   r1   r2   r3   onesweight)r:   r/   rk   r;   s      r<   r+   Llama4TextRMSNorm.__init__p   s.     	ll5::k#:;r>   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ ro   rq   ru   s     r<   rv   Llama4TextRMSNorm._normx   rx   r>   c                 z    U R                  UR                  5       5      R                  U5      nX R                  -  $ rd   )rv   rz   r{   r   )r:   re   outputs      r<   rM   Llama4TextRMSNorm.forward{   s.    AGGI&..q1##r>   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   rF   rk   r   s    r<   r   Llama4TextRMSNorm.extra_repr   s'    ))*+6$((<<r>   )rk   r   )gh㈵>)
rO   rP   rQ   rR   r+   rv   rM   r   rT   rU   rV   s   @r<   r   r   o   s    <K$= =r>   r   c                   4   ^  \ rS rSrU 4S jrU 4S jrSrU =r$ )Llama4Router   c                    > [         TU ]  UR                  UR                  SS9  UR                  U l        UR
                  U l        g r[   )r*   r+   r/   r,   r-   num_experts_per_toktop_kr9   s     r<   r+   Llama4Router.__init__   s>    ++V-E-EER!33//
r>   c                 j  > [         TU ]  U5      n[        R                  " X R                  SS9u  p4[        R
                  " U[        S5      5      R                  SXC5      n[        R                  R                  R                  UR                  5       5      R                  UR                  5      nXR4$ )Nr    rC   z-inf)r*   rM   r3   topkr   	full_likerz   scatter_r1   
functionalsigmoidtodtype)r:   r?   router_logitsrouter_top_valuerouter_indicesrouter_scoresr;   s         r<   rM   Llama4Router.forward   s    6+0::mZZUV+W(uV}ENNqR`s++33M4G4G4IJMMmNaNab++r>   )r-   r   rg   rV   s   @r<   r   r      s    0
, ,r>   r   Llama4TextMoec                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r      c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        U5      U l	        [        U5      U l        [        U5      U l        g rd   )r*   r+   r   r   r/   
hidden_dimr,   r-   r$   expertsr   routerrX   shared_expertr9   s     r<   r+   Llama4TextMoe.__init__   s[    //
 ,,!33(0"6**62r>   c                    UR                  SU R                  5      nU R                  U5      u  p#UR                  UR                  S   S5      nXBR                  SS5      -  nU R                  U5      nU R                  U5      nUR                  UR                  UR                  S   SUR                  S   5      R                  SS95        Xc4$ )NrB   r    r   rC   )	reshaper   r   repeatrF   r   r   add_sum)r:   r?   r   r   	routed_in
routed_outouts          r<   rM   Llama4TextMoe.forward   s    %--b$//B'+{{='A$!(()<)<Q)?C	 5 5b! <<	\\),
  /##M$7$7$:B
@P@PQS@TUYY^_Y`a!!r>   )r   r   r-   r   r   r   rg   rV   s   @r<   r   r      s    3" "r>   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )Llama4TextRotaryEmbedding   r&   c                 X  > [         TU ]  5         UR                  b  SOSU l        UR                  U l        UR                  U l        Xl        [        U R                     U l	        U R                  U R                  U5      u  o0l
        U R                  SUSS9  U R                  U l        g )Nllama3defaultinv_freqF)
persistent)r*   r+   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr&   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r:   r&   devicer   r;   s       r<   r+   "Llama4TextRotaryEmbedding.__init__   s    %+%8%8%D)"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r>   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       n[	        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  UR
                  5      U-  R                  SS5      n[        R                  " [        R                  " U5      U5      nXpR                  -  nS S S 5        U$ ! , (       d  f       W$ = f)	Nr   rB   r    mpscpuF)device_typeenabledr)   )r   rz   expandrF   
isinstancer   typestrr3   autocastr   	transposepolar	ones_liker   )r:   re   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r<   rM   !Llama4TextRotaryEmbedding.forward   s    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&))!((36KKVVWXZ[\EEOOE$:EBI!$:$::I D
  DC
 s   A(D==
E)r   r&   r   r   r   r   r   rd   )rO   rP   rQ   rR   r"   r+   r3   no_gradr   rM   rT   rU   rV   s   @r<   r   r      s7    // / / ]]_
  
r>   r   xqxkr   r@   c           	      *   [         R                  " U R                  5       R                  " / U R                  S S QSPSP76 5      n[         R                  " UR                  5       R                  " / UR                  S S QSPSP76 5      n[         R
                  " X2S S 2S S 2S S S 24   -  5      R                  S5      n[         R
                  " XBS S 2S S 2S S S 24   -  5      R                  S5      nUR                  U 5      UR                  U5      4$ )NrB   r)   r   )r3   view_as_complexrz   r   rF   view_as_realflattenr{   )r   r   r   xq_xk_xq_outxk_outs          r<   apply_rotary_embr      s    
 


 2 2 IBHHSbM I2 Iq I
JC



 2 2 IBHHSbM I2 Iq I
JC1dA&> >?GGJF1dA&> >?GGJF>>"v~~b111r>   r?   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r    N)rF   r   r   )r?   r   batchnum_key_value_headsslenhead_dims         r<   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr>   modulequerykeyvalueattention_maskscalingdropoutc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
SS9n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr)   r   rB   rC   ptrainingr    )r   num_key_value_groupsr3   matmulr   rF   r1   r   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r<   eager_attention_forwardr     s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2(>L==((6??([L,,|:K''1-88:K$$r>   c                 
   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U R
                  S-  -  n
Ub"  US S 2S S 2S S 2S UR                  S   24   nX-   n
[        R                  R                  U
SS9n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )	Nr)   r         r   rB   rC   r   r    )r   r   r3   r   r   r   rF   r1   r   r   r   r   r   r   s                r<   vision_eager_attention_forwardr
     s     3 ; ;<JU$?$?@L<<';';Aq'ABV__VZEZZL!$Q1.D
0@0@0D.D%DE#1==((2(>L==((6??([L,,|:K''1-88:K$$r>   c                   B  ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\   S	\
\R                     S
\\   S\	\R                  \
\R                     \
\	\R                        4   4S jjrSrU =r$ )Llama4TextAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperr&   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  U l        UR                  UR                  -  U l	        UR                  U l        U R                  S-  U l
        UR                  U l        UR                  U l        UR                  U l        UR                  U l        SU l        UR                   U   U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  U R                  -  UR
                  UR(                  S9U l        U R                  R2                  (       a-  U R"                  (       a  [5        UR6                  5      U l        g g g )Nr   r	  Tr\   )r*   r+   r&   	layer_idxgetattrr/   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper1   r^   attention_biasq_projk_projv_projo_projuse_qk_normri   rms_norm_epsqk_normr:   r&   r  r;   s      r<   r+   Llama4TextAttention.__init__  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"r>   r?   position_embeddingsr   past_key_valuecache_positionr  r@   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      n	U R	                  U5      R                  " / UQSPU R                  P76 n
U R                  U5      R                  U5      R                  SS5      nU R                  (       a'  [        XUR                  U	R                  5      5      u  p[        U S5      (       a"  U R                  U	5      n	U R                  U
5      n
U R                  (       a  U R                  (       d  [        R                  " [        R                   " UR#                  5       S-   U R$                  -  5      S-   5      U R&                  -  S-   nUR                  SUS   SS45      R)                  / UQSPSP75      nX-  R                  U	R*                  5      n	U	R                  SS5      n	U
R                  SS5      n
Ub#  SU0nUR-                  XU R.                  U5      u  p[0        nU R2                  R4                  S:w  a  [6        U R2                  R4                     nU" U U	U
UU4U R8                  (       d  SOU R:                  U R<                  S	.UD6u  nnUR>                  " / UQSP76 RA                  5       nU RC                  U5      nUU4$ )
NrB   r    r)   r        ?r$  eager        )r   r   )"rF   r   r  rE   r  r  r   r  r   r   r   hasattrr  r  r3   logfloorrz   r  r  r   r   updater  r  r&   _attn_implementationr   r   r  r   r   r   r  )r:   r?   r"  r   r#  r$  r  input_shapehidden_shapequery_statesr  r  attn_scalescache_kwargsattention_interfacer  r  s                    r<   rM   Llama4TextAttention.forward5  s    $))#2.88b8$--8{{=166|D[[/44UkU2Ut}}U
{{=166|DNNqRST=='7*=*@*@ATAT*U($L 4##<<5Lj1J ''		%++~';';'='CtGWGW&WX[^^_bfbqbqqtww  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(6::<;M;MNL#--a3))!Q/
%,n=L'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r>   )r  r  r  r&   r  r   r  r  r  r  r   r   r  r  r  r   r  r  NN)rO   rP   rQ   rR   __doc__r"   r+   r3   rS   r   r   r
   
LongTensorr   r   rM   rT   rU   rV   s   @r<   r  r    s    GA/ AF +/599)||9) #5<<#=>9) !.	9)
 !9) !!1!129) -.9) 
u||Xell3XeELL>Q5RR	S9) 9)r>   r  c                     ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\	\R                        S\\
   S\\R                     S	\\	\R                  \R                  4      S
\\   S\	\R                  \\	\R                  \R                  4      4   4S jjrSrU =r$ )Llama4TextDecoderLayeriq  c                   > [         TU ]  5         UR                  U l        X l        UR                  U   U l        [        X5      U l        X!R                  ;   U l	        U R                  (       a  [        U5      U l        O[        XR                  S9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )N)r.   r~   )r*   r+   r/   r  layer_typesattention_typer  	self_attn
moe_layersis_moe_layerr   feed_forwardrX   intermediate_size_mlpr   r  input_layernormpost_attention_layernormr   s      r<   r+   Llama4TextDecoderLayer.__init__r  s    !--"$00;,V?%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%r>   r?   r   r   r#  	use_cacher$  r"  r  r@   c           
         Un	U R                  U5      nU R                  " SUUUUUUS.UD6u  pX-   nUn	U R                  U5      nU R                  U5      nU R                  (       a  Uu  pXR                  U	R                  5      -   nU$ )N)r?   r"  r   r#  rE  r$   )rB  r=  rC  r@  r?  rE   rF   )r:   r?   r   r   r#  rE  r$  r"  r  residualattention_states_s               r<   rM   Llama4TextDecoderLayer.forward  s     !,,]; #nn 
' 3)))
 
 !3 !55mD))-8,M #5#5hnn#EEr>   )r<  r@  r/   rB  r?  r  rC  r=  )NNNFNN)rO   rP   rQ   rR   r+   r3   rS   r   r7  r   boolr   r   FloatTensorrM   rT   rU   rV   s   @r<   r9  r9  q  s    g$ 26378<$)59KO"||" !." u//0	"
 !u||!45" D>" !!1!12" &eELL%,,,F&GH" -." 
u  (51B1BEDUDU1U+V"WW	X" "r>   r9  c                   D    \ rS rSr% \\S'   SrS/rSrSr	Sr
SrSrS rSrg)	Llama4PreTrainedModeli  r&   Tpast_key_valuesFc                 |   [        U R                  S5      (       a  U R                  R                  OU R                  R                  R                  n[	        U[
        R                  5      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [	        U[
        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g [	        U[         5      (       a&  UR                  R                  R                  S5        g [	        U["        5      (       aI  UR$                  R                  R                  SUS9  UR&                  R                  R                  SUS9  g [	        U[(        5      (       a[  UR*                  R                  R                  UR,                  S9  UR.                  R                  R                  UR,                  S9  g g )Ninitializer_ranger(  )rt   stdr&  )rS  )r)  r&   rR  text_configr   r1   r^   r   datanormal_r]   zero_	Embeddingpadding_idx	LayerNormfill_r   r$   r5   r6   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r:   r   rS  s      r<   _init_weights#Llama4PreTrainedModel._init_weights  s    t{{$788 KK))((:: 	
 fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)KK""$ 122MM$$S) 122$$,,#3,?!!))s)< 122""''//FLL/A++0088V\\8J 3r>   rG  N)rO   rP   rQ   rR   r!   __annotations__supports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr`  rT   rG  r>   r<   rO  rO    s:    &*##4"5 N!"&Kr>   rO  c                   >  ^  \ rS rSr% S/rSr\\S'   \\	\
S.rS\4U 4S jjr\\       SS\R                   S\\R$                     S	\\R                      S
\\   S\\R(                     S\\   S\\R                      S\\   S\\\4   4S jj5       5       rSrU =r$ )Llama4TextModeli  r9  modelr&   )
attentionsr?   r   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr~   )r&   F)r*   r+   pad_token_idrY  
vocab_sizer1   rX  r/   embed_tokens
ModuleListrangenum_hidden_layersr9  layersr   r  normr   
rotary_embgradient_checkpointing	post_initr   s      r<   r+   Llama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHghHg9#F6Hgh
 &f&8&8f>Q>QR	36B&+# 	 is   C>	input_idsr   r   rP  inputs_embedsrE  r$  r  r@   c                    US L US L-  (       a  [        S5      eUc>  U R                  UR                  U R                  R                  R                  5      5      nU(       a  Uc
  [        5       nUcD  Ub  UR                  5       OSn	[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       d*  U R                  UUUUUS.n[        S	0 UD6[        S	0 UD6S.n
UnU R!                  X5      nU R"                  S U R                  R$                    H  nU" U4XR&                     UUUUUS.UD6nM!     U R)                  U5      n[+        UU(       a  US9$ S S9$ )
N:You must specify exactly one of input_ids or inputs_embedsr   r    )r   )r&   input_embedsr   r$  rP  r   )full_attentionchunked_attention)r   r   r#  rE  r$  r"  )last_hidden_staterP  rG  )
ValueErrorrq  r   r   r   r   get_seq_lengthr3   arangerF   	unsqueezer   dictr&   r   r   rw  ru  rt  r<  rv  r   )r:   r{  r   r   rP  r|  rE  r$  r  past_seen_tokenscausal_mask_mappingmask_kwargsr?   freq_cisdecoder_layers                  r<   rM   Llama4TextModel.forward  s    -t";<YZZ  --ill4;L;L;S;S;Z;Z.[\M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L ?-FF ++ -"0"0#2 ,K #5"C{"C%?%N+%N#
 & ??=?![[)H4;;+H+HIM)	23O3OP).#-$,	 	M J 		-0&+/8O
 	
>B
 	
r>   )rq  rx  ru  rv  rY  rw  rp  )NNNNNNN)rO   rP   rQ   rR   _no_split_modulesbase_model_prefixr"   rb  r  r9  r   _can_record_outputsr+   r   r   r3   r7  r   rS   r
   rM  rL  r   r   r   r   r   rM   rT   rU   rV   s   @r<   rk  rk    s   12)/&/    '+1537+/59$(59C
##C
 !.C
 u//0	C

 "%C
   1 12C
 D>C
 !!1!12C
 +,C
 
u--	.C
  C
r>   rk  c                     ^  \ rS rSr% S/rSrS/rSS0r\\	S'   S\4U 4S jjr
S	 rS
 r\\         SS\R                   S\\R$                     S\\R                      S\\\\\R,                     4      S\\R,                     S\\R                      S\\   S\\R                      S\\\R$                  4   S\\   S\\\4   4S jj5       5       rSrU =r$ )Llama4ForCausalLMi0  r9  language_modelzlm_head.weightlm_headcolwise_repr&   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r[   )
r*   r+   rk  rl  rp  r1   r^   r/   r  ry  r9   s     r<   r+   Llama4ForCausalLM.__init__7  sU     $V,
 ++yy!3!3V5F5FUS 	r>   c                     Xl         g rd   rl  r:   decoders     r<   set_decoderLlama4ForCausalLM.set_decoder@  s    
r>   c                     U R                   $ rd   r  r   s    r<   get_decoderLlama4ForCausalLM.get_decoderC  s    zzr>   r{  r   r   rP  r|  labelsrE  r$  logits_to_keepr  r@   c
                 p   U R                   " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [        U	* S5      OU	nU R	                  USS2USS24   5      nSnUb)  U R
                  " SXU R                  R                  S.U
D6n[        UUUR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Llama4ForCausalLM

>>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r{  r   r   rP  r|  rE  r$  r   N)logitsr  rp  )lossr  rP  r?   rm  rG  )rl  r   intslicer  loss_functionr&   rp  r   rP  r?   rm  )r:   r{  r   r   rP  r|  r  rE  r$  r  r  outputsr?   slice_indicesr  r  s                   r<   rM   Llama4ForCausalLM.forwardF  s    J ** 	
)%+')	
 	
  
8B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD%#33!//))
 	
r>   )r  rl  rp  )	NNNNNNNNr   ) rO   rP   rQ   rR   r  r  _tied_weights_keys_tp_planr"   rb  r+   r  r  r   r   r3   r7  r   rS   r   r
   listrM  rL  r  r   r   r   r   rM   rT   rU   rV   s   @r<   r  r  0  s_   12(*+=)H/   '+1537KO59-1$(5934<
##<
 !.<
 u//0	<

 "%tE4E4E/F(F"GH<
   1 12<
 ))*<
 D><
 !!1!12<
 c5<</0<
 +,<
 
u,,	-<
  <
r>   r  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    )custom_introc                      \ rS rSr% SrSr\\R                     \	S'   Sr
\R                  \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)Llama4CausalLMOutputWithPasti  a(  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nr  r  rP  r?   rm  image_hidden_statesrG  )rO   rP   rQ   rR   r6  r  r   r3   rM  rb  r  rP  r  r?   r   rm  r  rT   rG  r>   r<   r  r    s      )-D(5$$
%, $FE$9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r>   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4VisionMLP2i  c                 x  > [         TU ]  5         UR                  U l        UR                  U l        [        R
                  " U R                  UR                  SS9U l        [        R
                  " UR                  UR                  SS9U l	        [        R                  " 5       U l        UR                  U l        g r[   )r*   r+   r/   r.   r1   r^   projector_input_dimfc1projector_output_dimfc2GELUra   projector_dropoutr   r9   s     r<   r+   Llama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//r>   c                     U R                  U5      nU R                  U5      n[        R                  " XR                  U R                  S9nU R                  U R                  U5      5      $ )Nr   )r  ra   Fr   r   r  r:   r?   s     r<   rM   Llama4VisionMLP2.forward  sR    /**=9		-<<$--X!!$((="9::r>   )ra   r   r  r  r/   r.   rg   rV   s   @r<   r  r    s    0; ;r>   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4MultiModalProjectori  c                    > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  SS9U l        g r[   )	r*   r+   r1   r^   vision_configvision_output_dimrT  r/   linear_1r9   s     r<   r+   "Llama4MultiModalProjector.__init__  s?    		  22**
r>   c                 (    U R                  U5      nU$ rd   r  )r:   image_featuresr?   s      r<   rM   !Llama4MultiModalProjector.forward  s    n5r>   r  rg   rV   s   @r<   r  r    s    
 r>   r  c           
      8   U R                   u  p#n[        [        R                  " U5      5      nU R	                  X%US5      n U R                  5       u  p&ptU R	                  X&[        Xq-  5      [        XA-  5      5      nUR                  SSSS5      R                  5       nUR	                  U[        Xa-  5      [        Xq-  5      [        XAS-  -  5      5      nUR                  SSSS5      R                  5       nUR	                  USUR                   S   5      n	U	$ )NrB   r   r)   r    r   )rF   r  mathsqrtrE   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r<   pixel_shuffler    s   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='J"''
C@U<VX[\d\tXuvO%--aAq9DDFO%**C./U5J1KSQYlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMr>   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4VisionPixelShuffleMLPi  c                    > [         TU ]  5         UR                  U l        [        UR                  U R                  S-  -  5      U l        UR                  U l        [        U5      U l	        g r(   )
r*   r+   pixel_shuffle_ratior  r  	inner_dimr  
output_dimr  mlpr9   s     r<   r+   $Llama4VisionPixelShuffleMLP.__init__  sX    #)#=#= V77D<T<TVW<WXY 55#F+r>   encoded_patchesr@   c                 N    [        XR                  5      nU R                  U5      $ rd   )r  r  r  )r:   r  s     r<   rM   #Llama4VisionPixelShuffleMLP.forward  s!    '9Q9QRxx((r>   )r  r  r  r  
rO   rP   rQ   rR   r+   r3   rS   rM   rT   rU   rV   s   @r<   r  r    s(    ,)u|| ) ) )r>   r  freqs_cic                     UR                   n[        UR                  5       VVs/ sH  u  p4US:X  d  X2S-
  :X  a  UOSPM     nnnU R                  " U6 $ s  snnf )Nr    )ndim	enumeraterF   rE   )r  r   r  idrF   s         r<   reshape_for_broadcastr    sT    ::D=Fu{{=ST=STQ!q&AMQq0=SET==%   Us   Ac                 >   [         R                  " U R                  5       R                  " / U R                  S S QSPSP76 5      n[         R                  " UR                  5       R                  " / UR                  S S QSPSP76 5      n[        X#S9nUR                  UR                  5      n[         R                  " X2-  5      R                  S5      n[         R                  " XB-  5      R                  S5      nUR                  U 5      UR                  U5      4$ )NrB   r)   )r  r   r   )r3   r   rz   r   rF   r  r   r   r   r   r{   )r   r   r  query_key_	query_outkey_outs          r<   vision_apply_rotary_embr    s    
 ""5;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!Lciin!Lb!L!!LMD$hEH{{6==)H""6#45==a@I  199!<GU#W__S%999r>   c                     ^  \ rS rSrS\4U 4S jjr  SS\R                  S\R                  S\\R                     S\\	   S\
\   S	\\R                  \\R                     \\\R                        4   4S
 jjrSrU =r$ )Llama4VisionAttentioni  r&   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  UR
                  -  U l        SU l        UR                  U l	        U R                  S-  U l
        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )Nr    r	  Tr\   )r*   r+   r&   r/   	embed_dimr  	num_headsr   r   r  r   r1   r^   r  r  r  r  r9   s     r<   r+   Llama4VisionAttention.__init__  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZr>   r?   r  r   r#  r  r@   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      nU R	                  U5      R                  U5      n	U R                  U5      R                  U5      n
[        XUS9u  pUR                  SS5      nU	R                  SS5      n	U
R                  SS5      n
[        nU R                  R                  S;  a  [        U R                  R                     nU" U UU	U
S 4U R                  (       d  SOU R                  S SS.UD6u  pUR                  " / UQSP76 R                  5       nU R!                  U5      nX4$ )	NrB   r  r    r)   )r'  flex_attentionr(  F)r   r   r  )rF   r   r  rE   r  r  r  r   r
  r&   r-  r   r   r  r   r   r  )r:   r?   r  r   r#  r  r.  r/  r0  r  r  r3  r  r  s                 r<   rM   Llama4VisionAttention.forward  sk    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g #--a3))!Q/
#--a3(F;;++3NN"9$++:Z:Z"[$7
%
  $}}C$2H2H
%
 
%
! "));;;;FFHkk+.((r>   )r  r&   r  r   r  r  r   r  r  r   r  r5  )rO   rP   rQ   rR   r   r+   r3   rS   r   r
   r   r   r   rM   rT   rU   rV   s   @r<   r  r    s    [1 [& 26*.()||() ,,() !.	()
 !() -.() 
u||Xell3XeELL>Q5RR	S() ()r>   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4VisionMLPi6  c                   > [         TU ]  5         Xl        [        R                  " 5       U l        [        R                  " UR                  UR                  SS9U l	        [        R                  " UR                  UR                  SS9U l
        g )NTr\   )r*   r+   r&   r1   r  ra   r^   r/   r.   r  r  r9   s     r<   r+   Llama4VisionMLP.__init__7  sc    WWY99V//1I1IPTU99V55v7I7IPTUr>   r?   r@   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rd   )r  ra   r  r  s     r<   rM   Llama4VisionMLP.forward>  s4    /**=9/r>   )ra   r&   r  r  r  rV   s   @r<   r   r   6  s)    VU\\ ell  r>   r   c            
          ^  \ rS rSrS\4U 4S jjr  S
S\R                  S\R                  S\\R                     S\\	   4S jjr
S	rU =r$ )Llama4VisionEncoderLayeriE  r&   c                   > [         TU ]  5         UR                  U l        [        U5      U l        [        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  5      U l
        g rd   )r*   r+   r/   r  r=  r   r  r1   rZ  rB  rC  r9   s     r<   r+   !Llama4VisionEncoderLayer.__init__F  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%r>   hidden_stater  r   output_attentionsc                     UnU R                  U5      nU R                  UUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )N)r  r   )rB  r=  rC  r  )r:   r	  r  r   r
  rH  r  r  s           r<   rM    Llama4VisionEncoderLayer.forwardP  s      ++L9%)^^) &4 &
"
  .  44\Bxx-./&Gr>   )r/   rB  r  rC  r=  r5  )rO   rP   rQ   rR   r   r+   r3   rS   r   rL  rM   rT   rU   rV   s   @r<   r  r  E  s_    I1 I 26,0ll ,, !.	
 $D> r>   r  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S\	\R                     S\	\
   S	\	\
   S
\	\
   S\\\4   4S jjrSrU =r$ )Llama4VisionEncoderiq  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Llama4VisionEncoderLayer`].

Args:
    config: Llama4VisionConfig
r&   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        Xl        g s  snf )NF)
r*   r+   r&   r1   rr  rs  rt  r  ru  rx  )r:   r&   rJ  r;   s      r<   r+   Llama4VisionEncoder.__init__z  sY    mmuU[UmUmOn$oOn!%=f%EOn$op&+# %ps   A+r?   r  r   r
  output_hidden_statesreturn_dictr@   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnU R                   H,  n	U(       a  Xq4-   nU	" UUUUS9n
U(       a  XS   4-   nU
S   nM.     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrG  )r	  r   r
  r  r    r   c              3   ,   #    U H  oc  M  Uv   M     g 7frd   rG  .0vs     r<   	<genexpr>.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     e$Sq$S   	r  r?   rm  )r&   r
  r  use_return_dictru  r   r   )r:   r?   r  r   r
  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              r<   rM   Llama4VisionEncoder.forward  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[M#!/2B!B)*-"3!	M !!/3C2E!E)!,M )   +.>>Ne]N$Seee+Vd
 	
r>   )r&   rx  ru  NNNN)rO   rP   rQ   rR   r6  r   r+   r3   rS   r   rL  r   r   r   rM   rT   rU   rV   s   @r<   r  r  q  s    1  26,0/3&*?
||?
 ,,?
 !.	?

 $D>?
 'tn?
 d^?
 
uo%	&?
 ?
r>   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4UnfoldConvolutioni  c                 8  > [         TU ]  5         UR                  n[        U[        5      (       a  X"4n[
        R                  R                  X!R                  S9U l        [        R                  " UR                  US   -  US   -  UR                  SS9U l        g )N)kernel_sizestrider   r    Fr\   )r*   r+   r  r   r  r3   r1   Unfoldunfoldr^   num_channelsr/   linear)r:   r&   r&  r;   s      r<   r+    Llama4UnfoldConvolution.__init__  s    ''k3''&4Khhoo+FWFWoXii+a.0;q>A
r>   r?   r@   c                 p    U R                  U5      nUR                  SSS5      nU R                  U5      nU$ )Nr   r)   r    )r)  r  r+  r  s     r<   rM   Llama4UnfoldConvolution.forward  s8    M2%--aA6M2r>   )r+  r)  r  rV   s   @r<   r$  r$    s(    

U\\ ell  r>   r$  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4VisionRotaryEmbeddingi  c                   > [         TU ]  5         UR                  UR                  -  n[        R
                  " US-  [        R                  S9R                  US-  S5      n[        R                  " X3S S /SS9nSUS'   X2-  nX2-  nUR                  UR                  -  S-  nSUR                  [        R
                  " SUS5      S US-   R                  5       U-  -  -  nUS-   S	   US S S S 24   -  R                  SS
S9nUS-   S	   US S S S 24   -  R                  SS
S9n	[        R                  " X/S
S9R                  5       R                  5       SS S S24   n
U
R                  UR                  S
SS5      S:  S5      n
[        R                   " [        R"                  " [        R$                  " U
5      [        R&                  " U
5      /S
S95      nXl        g )Nr)   )r   r    r   rC   r   )rB   rB   r&  ).NrB   .)r*   r+   
image_sizer  r3   r  int32r   catr/   r  
rope_thetarz   repeat_interleaver   masked_fillr   stackcossinr  )r:   r&   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   r  r;   s               r<   r+   $Llama4VisionRotaryEmbedding.__init__  s   6#4#44,,sAvU[[9AA#q&!L))Wbqk2:%%)C)CCqH6,,a11MN_QY]^Q^1`1f1f1hks1stu	!A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc r>   c                 L    U R                   R                  UR                  5      $ rd   )r  r   r   r  s     r<   rM   #Llama4VisionRotaryEmbedding.forward  s    }} 4 455r>   r  rg   rV   s   @r<   r0  r0    s    !"6 6r>   r0  c                      ^  \ rS rSr% SrS/r\\S'   S\4U 4S jjrS r	    SS\
R                  S\\
R                     S	\\   S
\\   S\\   S\\\\
R                  S4   4   4S jjrSrU =r$ )r\  i  vision_modelr  r&   c                 ~  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  -  S-  S-   U l        UR                  S-  U l        [        U5      U l	        [        R                  " U R                  [        R                  " U R                  5      -  5      U l        [        R                  " U R                  [        R                  " U R                  U R                  5      -  5      U l        [!        U5      U l        [        R$                  " U R                  5      U l        [        R$                  " U R                  5      U l        [+        U5      U l        [/        U5      U l        U R3                  5         g )Nr)   r    r	  )r*   r+   r2  r  r/   r*  r  r^  r$  patch_embeddingr1   r2   r3   randnr]  r_  r0  rotary_embeddingrZ  layernorm_prelayernorm_postr  rl  r  vision_adapterry  r9   s     r<   r+   Llama4VisionModel.__init__  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&Ar>   c                     U R                   $ )zW
This function is used to fetch the first embedding layer to activate grads on inputs.
)rI  r   s    r<   get_input_embeddings&Llama4VisionModel.get_input_embeddings  s     ###r>   pixel_valuesr   r
  r  r  r@   .c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUR                  u  pgpSn
SnU R                  U5      nUR                  u  pnUR                  Xj-  U-  X5      nU R                  R                  UR                  S   SUR                  S   5      n[        R                  " UU/SS9nUS-  nUR                  Xj-  XU5      nU R                  R                  UR                  UR                  S9nUU-   nU R                  U5      nUR!                  USU5      nU R#                  U5      nU R%                  USUUUS9nUR&                  nU R)                  U5      nUSS2SS2SS24   nU R+                  U5      nU(       a  UR,                  OSnU(       a  US   nOSnU(       d  [/        S	 UUU4 5       5      $ [1        UUUS
9$ )aN  

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, MllamaVisionModel

>>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
>>> model = MllamaVisionModel.from_pretrained(checkpoint)
>>> processor = AutoProcessor.from_pretrained(checkpoint)

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")

>>> output = model(**inputs)

>>> print(output.last_hidden_state.shape)
torch.Size([1, 1, 4, 1025, 7680])
```
Nr    r   rB   rC   r   r   )r   r  r
  r  r)   c              3   ,   #    U H  oc  M  Uv   M     g 7frd   rG  r  s     r<   r  ,Llama4VisionModel.forward.<locals>.<genexpr>j  s     _$Mq$Mr  r  )r&   r
  r  r  rF   rI  r   r]  r   r3   r4  r_  r   r   r   rL  rE   rK  rl  r  rM  rN  r?   r   r   )r:   rS  r   r
  r  r  batch_size_times_num_tilesr*  r  r  num_concurrent_media
num_chunksr	  rJ  r  r   r]  positional_embeddingr  r   r?   rm  s                         r<   rM   Llama4VisionModel.forward  sB   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"& 
++L9%1%7%7"
 $++&=
JK
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&=zXb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___*'!
 	
r>   )r]  r/   r2  rM  rL  rl  r*  r  rI  r  r_  rK  r^  rN  r"  )rO   rP   rQ   rR   r  r  r   rb  r+   rQ  r3   rS   r   rL  r   r   r   rM   rT   rU   rV   s   @r<   r\  r\    s    &341 2$ 26,0/3&*_
ll_
 !._
 $D>	_

 'tn_
 d^_
 
ellC&7 88	9_
 _
r>   r\  c            (         ^  \ rS rSr% SS/r0 rSr\\S'   S\4U 4S jjr	S r
S rS	 rS
 rS rS rS\R"                  S\\\\   4   S\4S jrS\R.                  S\R"                  S\R"                  4S jr\                S%S\R.                  S\R"                  S\\R6                     S\\R.                     S\\   S\\R"                     S\\\\\   4      S\\   S\\R.                     S\\   S\\   S\\   S\\   S\\R.                     S\\\R6                  4   S\R6                  S \\   S!\\ \!4   4$S" jj5       r"      S&S# jr#S$r$U =r%$ )'Llama4ForConditionalGenerationis  r9  r   r&   c                 j  > [         TU ]  U5        [        UR                  5      U l        [        U5      U l        [        UR                  5      U l	        UR                  R                  U l
        U R                  R                  b  U R                  R                  OSU l        U R                  5         g )NrB   )r*   r+   r\  r  rG  r  multi_modal_projectorr  rT  r  rp  r&   ro  ry  r9   s     r<   r+   'Llama4ForConditionalGeneration.__init__y  s     -f.B.BC%>v%F"/0B0BC ,,778<8P8P8\DKK44bdr>   c                 6    U R                   R                  5       $ rd   )r  rQ  r   s    r<   rQ  3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799r>   c                 :    U R                   R                  U5        g rd   )r  set_input_embeddings)r:   r   s     r<   rf  3Llama4ForConditionalGeneration.set_input_embeddings  s    007r>   c                 6    U R                   R                  5       $ rd   )r  get_output_embeddingsr   s    r<   ri  4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::r>   c                 :    U R                   R                  U5        g rd   )r  set_output_embeddings)r:   new_embeddingss     r<   rl  4Llama4ForConditionalGeneration.set_output_embeddings  s    11.Ar>   c                 :    U R                   R                  U5        g rd   )r  r  r  s     r<   r  *Llama4ForConditionalGeneration.set_decoder  s    ''0r>   c                 6    U R                   R                  5       $ rd   )r  r  r   s    r<   r  *Llama4ForConditionalGeneration.get_decoder  s    ""..00r>   rS  vision_feature_layervision_feature_select_strategyc                     US;  a  [        SU R                   35      eUR                  5        VVs0 sH  u  pVUc  M
  XV_M     nnnU R                  " U4SS0UD6nUR                  nU$ s  snnf )a  
Obtains image last hidden states from the vision tower and apply al projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_layer (`Union[int, list[int]]`):
        The index of the layer to select the vision feature. If multiple indices are provided,
        the vision feature of the corresponding indices will be concatenated to form the
        vision features.
    vision_feature_select_strategy (`str`):
        The feature selection strategy used to select the vision feature from the vision backbone.
        Can be one of `"default"` or `"full"`
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)r   fullz$Unexpected select feature strategy: r  F)r  rt  itemsrG  r  )	r:   rS  rs  rt  r  kr  image_outputsr	  s	            r<   get_image_features1Llama4ForConditionalGeneration.get_image_features  s{    . *1DDCDDgDgChijj#)<<>C>41Q$!$>C)),]U]V\]$66 Ds
   A*A*r{  r|  r  c           	      &   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nX$   R                  5       UR                  5       :w  a  [        SU SUR                  S    35      eU$ )z
Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
rU  rB   z6Image features and image tokens do not match: tokens: z, features r   )rQ  r3   tensorr&   image_token_idlongr   allr   r  	expand_asr   numelr  rF   )r:   r{  r|  r  special_image_maskn_image_tokenss         r<   get_placeholder_mask3Llama4ForConditionalGeneration.get_placeholder_mask  s     !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno,2248L8L8NNHHXXcdrdxdxyzd{c|}  "!r>   r   r   rP  r  rE  r
  r  r  r$  r  image_sizesr  r@   c                 @   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  R
                  nUb  UOU R                   R                  R                  nUSL USL-  (       a  [        S5      eUb  Ub  [        S5      eUc  U R                  5       " U5      nUb  U R                  UUUUS9nUR                  SUR                  S5      5      nU R                  U5      R                  UR                  UR                  5      nU R!                  XUS9nUR#                  UU5      nU R$                  " SUUUUU
UUUUUS.
UD6nUS   nSnU	Gb>  Ub  USS2UR&                  S	   S	-
  * S24   R                  UR                  5      nUS
SS2SS24   UR                  UR                  5      S:g     R)                  5       nU	S
S	S24   UR                  U	R                  5      S:g     R)                  5       nO1US
SS2SS24   R)                  5       nU	S
S	S24   R)                  5       n[*        R,                  " 5       nU" UR                  SUR                  S5      5      UR                  S5      R                  UR                  5      5      nU(       d  U4US	S -   nUb  U4U-   $ U$ [/        UUUR0                  UR2                  UR4                  Ub  WS9$ SS9$ )a   
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, LlavaForConditionalGeneration

>>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
>>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

>>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
```Nr~  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)rS  rs  rt  r  rB   )r|  r  )
r   r   rP  r|  rE  r
  r  r  r$  r  r   r    .)r  r  rP  r?   rm  r  rG  )r&   r
  r  r  r  rs  rt  r  rQ  rz  rE   r  ra  r   r   r   r  masked_scatterr  rF   r   r1   CrossEntropyLossr  rP  r?   rm  )r:   r{  rS  r   r   rP  r|  rs  rt  r  rE  r
  r  r  r$  r  r  r  r  vision_flatprojected_vision_flatr  r  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr   s                                 r<   rM   &Llama4ForConditionalGeneration.forward  s   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] $/ !**?? 	 .9 +**II 	' -t";<YZZ#(Av    557	BM#!44)%9/M'	 5 N )--b.2E2Eb2IJK$($>$>{$K$N$N$$m&9&9%! "&!:!:G\ "; " *889KMbcM%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
r>   c           	      f    U R                   R                  " U4UUUUUS.UD6n	US   S:X  a  XIS'   U	$ )N)rP  r|  r   r$  r  r   rS  )r  prepare_inputs_for_generation)
r:   r{  rP  r|  rS  r   r$  r  r  model_inputss
             r<   r  <Llama4ForConditionalGeneration.prepare_inputs_for_generationW  sZ     **HH
+')))
 
 !! ,8(r>   )r  ra  ro  rG  rp  )NNNNNNNNNNNNNNr   N)NNNNNN)&rO   rP   rQ   rR   r  r  r  r!   rb  r+   rQ  rf  ri  rl  r  r  r3   rM  r   r  r  r   rz  r7  r  r   r   rS   r
   rL  r   r   r   r  rM   r  rT   rU   rV   s   @r<   r^  r^  s  sg   13MNH	| 	:8;B11'' $CcN3 ),	<"))":?:K:K"]b]n]n".  '+*.1537+/59@D8<-1$(,0/3&*5934$(#I
##I
 ''I
 !.	I

 u//0I
 "%I
   1 12I
 'uS$s)^'<=I
 )1I
 ))*I
 D>I
 $D>I
 'tnI
 d^I
 !!1!12I
  c5<</0!I
" \\#I
$ +,%I
& 
u22	3'I
 I
\  r>   r^  )rO  rk  r\  r  r^  )r(  )_r  dataclassesr   typingr   r   r   r3   torch.nnr1   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   configuration_llama4r!   r"   
get_loggerrO   loggerModuler$   rX   ri   r   r^   r   r   r   rS   r   r   r  r   rz   r  r
  r  r9  rO  rk  r  r  r  r  r  r  r  r  r  r   r  r  r$  r0  r\  r^  __all__rG  r>   r<   <module>r     s     ! , ,     N ! . ) 7 K B 9 m m K F & R R / @ 
		H	%		 B)BII )$!uxx !=		 =(,299 , _-"BII " ."*		 >	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % %D %II%<<% 
% <<	%
 U\\*% % %4Z)")) Z)z27 2j #KO #K #KL _
+ _
 _
DT
- T
n 
<; < <2;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:8)BII 8)vbii )9 )XO
")) O
dbii (6")) 6,C
- C
L@%:O @Fr>   