
    h                     h   d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	c m
Z d dlm	Z	 ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/  e&j`                  e1      Z2e$ G d de             Z3e e$d       G d de                    Z4e e$d       G d de                    Z5e e$d       G d de                    Z6 G d  d!e	jn                        Z8d"ejr                  d#e:d$ejr                  fd%Z;	 d^d&e	jn                  d'ejr                  d(ejr                  d)ejr                  d*eejr                     d+e<d,e<d-e!e#   fd.Z= G d/ d0e	jn                        Z> G d1 d2e	jn                        Z? G d3 d4e      Z@ G d5 d6e	jn                        ZA G d7 d8e	jn                        ZB G d9 d:e	jn                        ZC G d; d<e      ZDe$ G d= d>e3             ZE G d? d@e	jn                        ZF G dA dBe	jn                        ZG G dC dDe	jn                        ZH G dE dFe	jn                        ZI G dG dHe	jn                        ZJ G dI dJe	jn                        ZK G dK dLe	jn                        ZL G dM dNe	jn                        ZM G dO dPe	jn                        ZN e$dQ       G dR dSe3             ZO G dT dUe	jn                        ZP G dV dWe	jn                        ZQ e$dX       G dY dZe3             ZR G d[ d\e3e      ZSg d]ZTy)_    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   @    e Zd ZU eed<   dZdZddgZddgZdZ	dZ
dZdZy	)
JanusPreTrainedModelconfigmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFN)__name__
__module____qualname__r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignment     g/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/models/janus/modeling_janus.pyr$   r$   /   sB    &*#,.GH#4m"DN!(-%r8   r$   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   b    e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   y)JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
r+   r,   r-   __doc__r=   r   torchFloatTensorr.   r>   r7   r8   r9   r<   r<   =   s4     9=(5#4#45<26NHU../6r8   r<   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                       e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   y)JanusBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_stater)   hidden_states
attentionsimage_hidden_states)r+   r,   r-   r?   rD   r   r@   rA   r.   r)   r
   rE   tuplerF   rG   r7   r8   r9   rC   rC   O   s|    & 6:x 1 129'+OXe_+8<M8E%"3"345<59Ju00129>B%(9(9":;Br8   rC   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
ee   ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   y)	JanusCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr)   rE   rF   rG   )r+   r,   r-   r?   rK   r   r@   rA   r.   rL   r)   r
   rE   rH   rF   rG   r7   r8   r9   rJ   rJ   p   s    " )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju00129>B%(9(9":;Br8   rJ   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e	dej                  fd
Z
 xZS )JanusVisionEmbeddingsr%   c                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  | j                  | j                  | j                  d      | _
        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        | j                  dt!        j"                  | j                        j%                  d      d       y )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   F)
persistent)super__init__r%   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr@   arangeexpandselfr%   	__class__s     r9   rZ   zJanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr8   
embeddingsheightwidthreturnc                    |j                   d   }| j                  j                  j                   d   }t        j                  j                         s%||k(  r ||k(  r| j                  | j                        S | j                  j                  j                  d      }|j                   d   }|| j                  z  }|| j                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   rW   g      ?r   r   bicubicF)sizemodealign_corners)shapere   weightr@   jit
is_tracingrV   	unsqueezer^   r   reshapepermuter   
functionalinterpolateview)rj   rl   rm   rn   rb   rc   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r9   interpolate_pos_encodingz.JanusVisionEmbeddings.interpolate_pos_encoding   sE    !&&q)//66<<Q? yy##%+*F6UZ?**4+<+<==1188BB1Er"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr8   pixel_valuesr   c                 X   |j                   \  }}}}| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }|r| j                  |||      }	n| j                  | j                        }	||	z   }|S )N)dtyper   r   )
ru   ra   rv   r   toflatten	transposer   re   rV   )
rj   r   r   _rm   rn   target_dtypepatch_embedsrl   
pos_embedss
             r9   forwardzJanusVisionEmbeddings.forward   s    *001fe++2288++LOO,O,OP!))!,66q!<
#66z65QJ001B1BCJ*,
r8   )F)r+   r,   r-   r!   rZ   r@   Tensorintr   boolr   __classcell__rk   s   @r9   rN   rN      se    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i r8   rN   rE   n_repro   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)ru   rh   rz   )rE   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr   r   rW   )r   r   )ptrainingr   )r   num_key_value_groupsr@   matmulr   ru   r   r|   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsr*   attn_outputs                r9   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                   t     e Zd ZdZdef fdZ	 ddej                  deej                     de	e
   fdZ xZS )	JanusVisionAttentionz(Attention Class for Janus Vision Encoderr%   c                 F   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        d| _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                        | _        |dkD  rt        j,                  |      nt        j.                         | _        |rt        j0                  | j                        nt        j.                         | _        |r%t        j0                  | j                        | _        y t        j.                         | _        y )	N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: ).      Fr   biasr   )rY   rZ   r%   r[   r\   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)rj   r%   proj_dropoutqk_normrk   s       r9   rZ   zJanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r8   rE   r   r   c                 >   |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  d| j
                  | j                        }| j                  |      }|j	                  d| j
                  | j                        }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|f| j                  sdn| j                   | j"                  | j$                  d|\  }}|j	                  ||| j&                        }| j)                  |      }| j+                  |      }||fS )NrW   r   r   eager        )r   r   r   )rr   r   r   r   rz   r   r   r   r   r   r~   r   r%   _attn_implementationr   r   r   r   r   r\   r   r   )rj   rE   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputs                 r9   r   zJanusVisionAttention.forward!  s    "/!3!3!5
GQ{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJnn
%
 
%
!\ "))*gt~~N&&{3((0|##r8   N)r+   r,   r-   r?   r!   rZ   r@   r   r   r   r   r   r   r   s   @r9   r   r     sO    2Q0 Q@ 26)$||)$ !.)$ +,	)$r8   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )JanusVisionMLPr%   c                    t         |           || _        t        |j                  |j
                  z        | _        t        |j                     | _	        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        y r   )rY   rZ   r%   r   r[   	mlp_ratiointermediate_sizer	   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2ri   s     r9   rZ   zJanusVisionMLP.__init__N  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r8   rE   ro   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r   )r   r   r   r   r   rj   rE   s     r9   r   zJanusVisionMLP.forwardX  sP    /**=9m4/m4r8   )	r+   r,   r-   r!   rZ   r@   r   r   r   r   s   @r9   r   r   M  s+    ?0 ?U\\ ell r8   r   c            	            e Zd Zdef fdZedej                  dej                  dee	   dej                  fd       Z xZS )r(   r%   c                 R   t         |           |j                  | _        t	        j
                  | j                  |j                        | _        t        |      | _	        t	        j
                  | j                  |j                        | _
        t        |      | _        || _        y N)eps)rY   rZ   r[   r\   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr%   ri   s     r9   rZ   z JanusVisionEncoderLayer.__init__b  st    ++<<F<Q<QR-f5<<F<Q<QR!&)r8   rE   r   r   ro   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rE   r   r7   r   r   r   r   rj   rE   r   r   residualr   s         r9   r   zJanusVisionEncoderLayer.forwardk  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r8   )r+   r,   r-   r!   rZ   r   r@   r   r   r   rA   r   r   r   s   @r9   r(   r(   a  s^    0  ||  +,	
 
		 r8   r(   c                   j     e Zd ZdZdef fdZe	 ddeej                     de
e   defd       Z xZS )	JanusVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    r%   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
rY   rZ   r%   r   
ModuleListrangenum_hidden_layersr(   layersgradient_checkpointingrj   r%   r   rk   s      r9   rZ   zJanusVisionEncoder.__init__  sV    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A#r   r   ro   c                 T    |}| j                   D ]  } |||fi |} t        |      S )N)rD   )r   r   )rj   inputs_embedsr   r   rE   encoder_layers         r9   r   zJanusVisionEncoder.forward  s>     &![[M) M ) ??r8   r   )r+   r,   r-   r?   r!   rZ   r   r   r@   r   r   r   r   r   r   r   s   @r9   r   r     s`    ,0 ,  26@ !.@ +,	@
 
@ @r8   r   c                        e Zd ZdZ fdZdej                  dedefdZ	 ddej                  de	ej                     d	e
ej                  e	ej                     e	e
ej                        f   fd
Z xZS )JanusAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        d| _
        |j                  | _        t        j                  | j                  d| j                  z  d      | _        |j                  ret        j                   t#        j$                  | j                              }t        j                   t#        j$                  | j                              }nd }d }|Qt#        j&                  |t#        j(                  |d      |f      }t        j                   |      | j                  _        t        j                  | j                  | j                        | _        y )	Nr   r   r   r   Fr   r   )requires_grad)rY   rZ   r%   r[   r\   r   r   r   r   r   r   r   r   r   qkvqkv_bias	Parameterr@   zeroscat
zeros_liker   
projection)rj   r%   q_biasv_biasr  rk   s        r9   rZ   zJanusAttention.__init__  su   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCr8   tensorr   bszc                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r   )r~   r   r   r   r   )rj   r  r   r  s       r9   _shapezJanusAttention._shape  s7    {{3GQQRSUVWbbddr8   rE   	head_maskro   c                 6   |j                         \  }}}| j                  |      }|j                  ||d| j                  || j                  z        j	                  ddddd      }|d   |d   |d   }
}	}t
        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
fd| j                  sdn| j                  | j                  d	|\  }}|j                  ||d
      j                         }| j                  |      }||fS )z#Input shape: Batch x Time x Channelr   r   r   r      r   Nr   )r   r   r   rW   )rr   r  rz   r   r{   r   r%   r   r   r   r   r   r   r  )rj   rE   r  r   r  tgt_lenr\   	mixed_qkvr   r   r   r   r   r   s                 r9   r   zJanusAttention.forward  s-    #0"4"4"6WiHH]+	%%c7At~~yTXTbTbGbckkq!Q
	 2;1y|YWX\,j(?;;++w6"9$++:Z:Z"[$7		%

  #}}C$2H2HJJ	%
 	%
!\ "))#w;FFHook2L((r8   r   )r+   r,   r-   r?   rZ   r@   r   r   r  r   rH   r   r   r   s   @r9   r  r    s    GD>eU\\ eC ec e -1$)||$) ELL)$)
 
u||Xell3XeELL>Q5RR	S$)r8   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )JanusMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )rY   rZ   r%   r	   r   r   r   r   r[   r   r   r   ri   s     r9   rZ   zJanusMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr8   rE   ro   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   s     r9   r   zJanusMLP.forward  s4    /**=9/r8   )r+   r,   r-   rZ   r@   r   r   r   r   s   @r9   r  r    s$    KU\\ ell r8   r  c            	            e Zd Zdef fdZedej                  dej                  dee	   dej                  fd       Z xZS )JanusEncoderLayerr%   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )rY   rZ   r[   r\   r  r   r   r   r   r   r  r   r   ri   s     r9   rZ   zJanusEncoderLayer.__init__  sm    ++'/<<F<Q<QRF#<<F<Q<QRr8   rE   r   r   ro   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rE   r  r7   r   r   s         r9   r   zJanusEncoderLayer.forward
  s     !((7)4>> 
'$
 
q
 &0 ((7/%0r8   )r+   r,   r-   r    rZ   r   r@   r   r   r   rA   r   r   r   s   @r9   r  r    s_    S{ S ||  +,	
 
		 r8   r  c                        e Zd ZU dZeed<   eedZdef fdZ	e
e	 	 d
deej                     dedee   deeef   fd              Zd	 Z xZS )JanusVisionModelr   r%   )rE   rF   c                     t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        | j                          y r   )rY   rZ   r%   r[   rN   rl   r   encoderr   r   r   post_layernorm	post_init)rj   r%   r\   rk   s      r9   rZ   zJanusVisionModel.__init__,  s]     &&	/7)&1 ll9&:O:OPr8   r   r   ro   c                     |t        d      | j                  ||      } | j                  dd|i|}|j                  }| j	                  |      }|d d dd d f   }| j	                  |      }t        ||      S )Nz You have to specify pixel_values)r   r   r   )rD   pooler_outputr7   )r   rl   r#  rD   r$  r   )rj   r   r   r   rE   encoder_outputsrD   pooled_outputs           r9   r   zJanusVisionModel.forward7  s     ?@@Ogh+74<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
r8   c                     | j                   S r   )rl   rj   s    r9   get_input_embeddingsz%JanusVisionModel.get_input_embeddingsT  s    r8   r   )r+   r,   r-   main_input_namer!   r.   r  r  _can_record_outputsrZ   r   r   r   r@   rA   r   r   r   r   rH   r   r   r,  r   r   s   @r9   r!  r!  #  s    $O*$
	0 	  59).
u001
 #'
 +,	

 
u00	1
  
6r8   r!  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr%   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w Nr   )rY   rZ   r   r   r[   projection_dimr   r   r   depthhidden_layersr	   r   r   r   s      r9   rZ   zJanusVisionAlignerMLP.__init__Y  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   &1B<c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r   r5  r   rj   rE   layers      r9   r   zJanusVisionAlignerMLP.forwardb  B    /''E ..}=M!-0M ( r8   )r+   r,   r-   r!   rZ   r   r   r   s   @r9   r0  r0  X  s    70 7r8   r0  c                        e Zd ZdZdef fdZdej                  fdZdej                  dej                  fdZ xZS )	JanusVQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r%   c                    t         |           |j                  | _        |j                  | _        t        |dd      | _        t        j                  | j                  | j                        | _	        |j                  gdz  | _        y )Nbetag      ?r   )rY   rZ   num_embeddingsr\   embedding_dimgetattrr?  r   rd   	embeddingrb   quant_state_dimsri   s     r9   rZ   z"JanusVQVAEVectorQuantizer.__init__u  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8r8   hidden_statec           
      L   |j                  dddd      j                         }|j                  d| j                        }t	        j
                  |dz  dd      t	        j
                  | j                  j                  dz  d      z   dt	        j                  d	|| j                  j                  j                  dd            z  z
  }t	        j                  |d      }| j                  |      j                  |j                        }t	        j                  |j                         |z
  dz        | j                  t	        j                  ||j                         z
  dz        z  z   }|||z
  j                         z   }|j                  dddd      j                         }|||fS )
Nr   r   r   r   rW   T)r   keepdimr   z	bd,dn->bn)r{   r   r~   rA  r@   sumrC  rv   einsumr   argminru   meandetachr?  )rj   rE  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrK   s          r9   r   z!JanusVQVAEVectorQuantizer.forward~  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BDNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e,"5"5"77A=\
 P
 

 *-?,-N,V,V,XX 0771aCNNP!4)===r8   image_tokensro   c                 B   |j                   d   }| j                  j                  j                   d   }| j                  |      }t        j                  |dd      }|j                  |g| j                  |      }|j                  dddd      j                         }|S )Nr   rW   r   )r   r   r   r   )	ru   rC  rv   F	normalizer~   rD  r{   r   )rj   rR  r   emb_dimrQ  s        r9   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entry  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r8   )r+   r,   r-   r?   r"   rZ   r@   r   r   
LongTensorrA   rW  r   r   s   @r9   r=  r=  j  sD    9/ 9>ELL >6"u/?/? "EDUDU "r8   r=  c                   *     e Zd Z	 	 d fd	Zd Z xZS )JanusVQVAEResnetBlockc                    t         |           || _        ||n|| _        || _        t
        j                  j                  d|dd      | _        t
        j                  j                  ||ddd      | _
        t
        j                  j                  d|dd      | _        t
        j                  j                  |j                        | _        t
        j                  j                  ||ddd      | _        | j                  | j                  k7  r`| j                  r*t
        j                  j                  ||ddd      | _        y t
        j                  j                  ||ddd      | _        y y )	N    ư>T
num_groupsr`   r   affiner   r   rS   rT   rU   r   )rY   rZ   rQ   rR   use_conv_shortcutr@   r   	GroupNormnorm1r_   conv1norm2r   r   conv2conv_shortcutnin_shortcut)rj   r%   rQ   rR   rh  rk   s        r9   rZ   zJanusVQVAEResnetBlock.__init__  s1    	&+7+?K\!.XX''2KUYbf'g
XX__[,AVWab_c
XX''2LVZcg'h
xx''7XX__\<QWXbc_d
t000%%%*XX__[,\]fgqr_%s"$)HHOOK[\efpqO$r!	 1r8   c                    |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  |      }| j                  | j                  k7  r3| j                  r| j                  |      }||z   S | j                  |      }||z   S r   )rd  r@   sigmoidre  rf  r   rg  rQ   rR   rb  rh  ri  )rj   rE   r   s      r9   r   zJanusVQVAEResnetBlock.forward  s     

=1}55

=1

=1}55]3

=1t000%%--h7 -''  ,,X6-''r8   r   r+   r,   r-   rZ   r   r   r   s   @r9   rZ  rZ    s    
 s.(r8   rZ  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEAttnBlockc                    t         |           || _        t        j                  j                  d|dd      | _        t        j                  j                  ||ddd      | _        t        j                  j                  ||ddd      | _	        t        j                  j                  ||ddd      | _
        t        j                  j                  ||ddd      | _        y )Nr\  r]  Tr^  r   r   ra  )rY   rZ   rQ   r@   r   rc  normr_   qkvproj_outrj   rQ   rk   s     r9   rZ   zJanusVQVAEAttnBlock.__init__  s    &HH&&";TXae&f	kqQR\]^kqQR\]^kqQR\]^[aXYcder8   c                 t   |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }|j                  \  }}}}	|j                  ||||	z        j                  ddd      }|j                  ||||	z        }t        j                  ||      }
|
t        |      dz  z  }
t        j                  |
d      }
|j                  ||||	z        }|
j                  ddd      }
t        j                  ||
      j                  ||||	      }| j                  |      }||z   S )Nr   r   r   r   rH  )rp  rq  rr  rs  ru   rz   r{   r@   bmmr   rT  r   rt  )rj   rE   r   r   r   r   r   channelsrm   rn   r   r   s               r9   r   zJanusVQVAEAttnBlock.forward  s5    		-0vvm,VVM*
vvm, /;.@.@+
Hfe#++J&5.QYYZ[]^`ab''
HfunM
yyz:#s8}'>?yy15 $++J&5.Q#++Aq!4iil;CCJPXZ`bghmmK0+%%r8   rl  r   s   @r9   rn  rn    s    f&r8   rn  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvDownsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r   r   ra  )rY   rZ   r   r_   convru  s     r9   rZ   z!JanusVQVAEConvDownsample.__init__  s'    IIk;AaYZ[	r8   c                 Z    t        j                  |ddd      }| j                  |      }|S )N)r   r   r   r   constantr   )padrs   r   )rT  r  r|  r   s     r9   r   z JanusVQVAEConvDownsample.forward  s+    mJVWX		-0r8   rl  r   s   @r9   rz  rz    s    \r8   rz  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                 t    t         |           t        j                  j	                  ||ddd      | _        y )Nr   r   ra  )rY   rZ   r@   r   r_   r|  ru  s     r9   rZ   zJanusVQVAEConvUpsample.__init__  s.    HHOOK!TU_`Oa	r8   c                 X    t        j                  |dd      }| j                  |      }|S )Ng       @nearest)scale_factorrs   )rT  r}   r|  r   s     r9   r   zJanusVQVAEConvUpsample.forward	  s(    m#IV		-0r8   rl  r   s   @r9   r  r    s    br8   r  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEMidBlockr%   rx  c                     t         |           t        |||      | _        t	        |      | _        t        |||      | _        y )Nr%   rQ   rR   )rY   rZ   rZ  block_1rn  attn_1block_2)rj   r%   rx  rk   s      r9   rZ   zJanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
r8   rE   ro   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r   s     r9   r   zJanusVQVAEMidBlock.forward  s2    ]3M2]3r8   )
r+   r,   r-   r"   r   rZ   r@   r   r   r   r   s   @r9   r  r    s2    
/ 
3 
U\\ ell r8   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )JanusVQVAEEncoderc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }|j                  }|j                  }t        j                  j                  ||ddd      | _        dt        |      z   }|| _        t        j                          | _        t%        | j                        D ]   }t        j                          }	t        j                          }
|||   z  }|||   z  }t%        | j
                        D ]N  }|	j'                  t)        |||             |}|| j                  dz
  k(  s5|
j'                  t+        |             P t        j,                         }|	|_        |
|_        || j                  dz
  k7  rt3        |      |_        | j"                  j'                  |        t7        |      | _        t        j                  j;                  d|dd	      | _        t        j                  j                  ||rd
|z  n|ddd      | _        y )Nr   r   ra  )r   r  r\  r]  Tr^  r   ) rY   rZ   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrQ   double_latentlatent_channelsr@   r   r_   conv_inrH   in_channel_multiplierr   downr   appendrZ  rn  Moduleblockattnrz  
downsampler  midrc  norm_outconv_out)rj   r%   r  rQ   r  r  r  r  i_levelr  r  block_in	block_outi_blockr  rk   s                  r9   rZ   zJanusVQVAEEncoder.__init__&  s   "6#<#<=$33,,((,, 00#66xx{MqYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$'<W'EEH%(:7(CCI !4!45)%$,%. %d22Q66KK 3H => 6 99;DDJDI$..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
r8   r   c                    | j                  |      g}t        | j                        D ]  }t        | j                        D ]  } | j                  |   j
                  |   |d         }t        | j                  |   j                        dkD  r" | j                  |   j                  |   |      }|j                  |        || j                  dz
  k7  s|j                  | j                  |   j                  |d                 |d   }| j                  |      }| j                  |      }|t        j                  |      z  }| j                  |      }|S )NrW   r   r   )r  r   r  r  r  r  r  r  r  r  r  r  r@   rk  r  )rj   r   rE   r  r  rE  rD   s          r9   r   zJanusVQVAEEncoder.forwardY  sH   l34T112G !4!45@tyy177@!"%  tyy)../!3#C499W#5#:#:7#CL#QL$$\2 6 $..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  r8   )r+   r,   r-   rZ   r@   rX  r   r   r   s   @r9   r  r  %  s    1
f!E$4$4 !r8   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )JanusVQVAEDecoderc           	      v   t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }||j                  | j                  dz
     z  }t        j                  j                  ||ddd      | _        t        ||      | _        t        j                         | _        t#        t%        | j                              D ]  }t        j                         }t        j                         }||j                  |   z  }	t%        | j
                  dz         D ]N  }
|j'                  t)        |||	             |	}|| j                  dz
  k(  s5|j'                  t+        |             P t        j,                         }||_        ||_        |dk7  rt3        |      |_        | j                   j'                  |        t        j                  j7                  d|dd	      | _        t        j                  j                  ||ddd      | _        y )
Nr   r   ra  r  r   r\  r]  Tr^  )rY   rZ   r  r  r  r  r  r  rR   r@   r   r_   r  r  r  r   upreversedr   r  rZ  rn  r  r  r  r  upsamplerc  r  r  )rj   r%   r  r  rR   r  r  r  r  r  r  r  rk   s               r9   rZ   zJanusVQVAEDecoder.__init__s  s   "6#<#<=$33,, 00** !6#<#<T=Q=QTU=U#VV xxaXYcde &fh7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %d22Q66KK 3H => : BBHBG!|4X>GGNN2) =. **bxUYbf*g,AVWabcr8   rE  ro   c                 b   | j                  |      }| j                  |      }t        | j                        D ]  }t        | j                  dz         D ]l  } | j
                  |   j                  |   |      }t        | j
                  |   j                        dkD  sK | j
                  |   j                  |   |      }n || j                  dz
  k7  s| j
                  |   j                  |      } | j                  |      }|t        j                  |      z  }| j                  |      }|S )Nr   r   )r  r  r   r  r  r  r  r  r  r  r  r@   rk  r  )rj   rE  r  r  s       r9   r   zJanusVQVAEDecoder.forward  s   ||L1 xx- T112G !4!4q!89>twww/55g>|Ltwww',,-1#A4777#3#8#8#A,#OL : $..22#www/88F 3 }}\2l33}}\2r8   )r+   r,   r-   rZ   r@   rA   r   r   r   s   @r9   r  r  r  s)    ,d\E$5$5 %:K:K r8   r  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                        e Zd ZU eed<   g dZdZdef fdZdej                  fdZ
dej                  dej                  fdZeedej                  deej                  ej                  f   fd	              Z xZS )

JanusVQVAEr%   )rn  rZ  r=  r   c                    t         |   |       t        |      | _        t	        |      | _        t        j                  j                  |j                  |j                  d      | _        t        j                  j                  |j                  |j                  d      | _        | j                          t        |      | _        d| _        | j#                          y )Nr   F)rY   rZ   r  r#  r=  quantizer@   r   r_   r  r\   
quant_convpost_quant_convevalr  decoderr   r%  ri   s     r9   rZ   zJanusVQVAE.__init__  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+# 	r8   c                 z    | j                  |      }| j                  |      }| j                  |      \  }}}|||fS r   )r#  r  r  )rj   r   rE   quantemb_lossindicess         r9   encodezJanusVQVAE.encode  s@    \26#'==#? xh''r8   rR  ro   c                    |j                   d   | j                  j                  d   | j                  j                  d   z  k7  rMt        d| j                  j                  d   | j                  j                  d   z   d|j                    d      | j                  j	                  |      }| j                  |      }| j                  |      }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)ru   r  rD  r   rW  r  r  )rj   rR  codebook_entryrE   r   s        r9   decodezJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r8   c                     |j                   d   }| j                  |      \  }}}| j                  |j                  |d            }t	        ||      S )Nr   rW   )ru   r  r  r~   r<   )rj   r   r   r  r>   r  r=   s          r9   r   zJanusVQVAE.forward  sQ     "''*
)-\)B&~w#{{7<<
B+GH 4nEEr8   )r+   r,   r-   r"   r.   r1   r-  rZ   r@   rX  r  rA   r  r   r   rH   r   r   r   s   @r9   r  r    s     
 %O/ (5#3#3 (5#3#3 8I8I & F''F 
u  %"3"33	4F  Fr8   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr%   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w r2  )rY   rZ   r   r   r\   r3  r   r   r   r   r5  r	   r   r   r   s      r9   rZ   zJanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rr6  c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r8  r9  s      r9   r   zJanusVQVAEAlignerMLP.forward  r;  r8   )r+   r,   r-   r"   rZ   r   r   r   s   @r9   r  r    s    7/ 7r8   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r%   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j
                  |j                        | _        y r   )rY   rZ   r   r   image_token_embed_dimr3  rt  r	   r   r   r@  vision_headri   s     r9   rZ   zJanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr8   rE   ro   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rt  r   r  r   s     r9   r   zJanusVQVAEHead.forward  s6    m4**=9((7r8   )r+   r,   r-   r?   r"   rZ   r@   r   r  r   r   r   s   @r9   r  r    s0    YS/ SU\\ ell r8   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       e Zd Zdef fdZd Zd Zd Zdej                  dej                  dej                  fd	Zee	 	 	 	 	 	 	 	 	 ddeej                     d
eej                     deej                     deej                     dee   deej                     deej                     dee   deeej                  f   fd              Z xZS )
JanusModelr%   c                    t         |   |       || _        t        j	                  |j
                        | _        t        | j                  j                        | _        t        j	                  |j                        | _        t        j                  | j                  j                  j                  | j                  j                  j                        | _        t#        | j                  j                        | _        t'        | j                  j                        | _        t+        j,                  |j.                        | _        d| _        | j5                          y )N)r%   F)rY   rZ   r%   r!  _from_configvision_configvision_modelr0  alignerr  	vq_configvqmodelr   rd   r@  r\   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr   r%  ri   s     r9   rZ   zJanusModel.__init__#  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r8   c                 6    | j                   j                         S r   )r  r,  r+  s    r9   r,  zJanusModel.get_input_embeddings8  s    ""7799r8   c                 :    | j                   j                  |       y r   )r  set_input_embeddingsrj   r   s     r9   r  zJanusModel.set_input_embeddings;  s    007r8   c                 ^    | j                  |      }| j                  |j                        }|S r   )r  r  rD   )rj   r   image_embedss      r9   get_image_featureszJanusModel.get_image_features>  s,    ((6||L$B$BCr8   	input_idsr   image_featuresc                 P   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }||   j                         |j                         k7  r0|j                  d   |j                  d   z  }t        d| d|       |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r   devicerW   r   r   z6Image features and image tokens do not match: tokens: z, features )r,  r@   r  r%   image_token_idlongr  allrI  ry   	expand_asr   numelru   r   )rj   r  r   r  special_image_maskn_image_tokensn_image_featuress          r9   get_placeholder_maskzJanusModel.get_placeholder_maskC  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno+,2248L8L8NN-33A69M9Ma9PPHHXXcdtcuv  "!r8   r   r   rV   r)   cache_position	use_cachelogits_to_keepc
                    |d u |d uz  rt        d      | | j                         |      }||| j                  |      }|j                  d|j                  d         }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||||	d|
}t        |j                  |j                  |j                  |j                  |      S d       S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onerW   )r   r  )r   r   rV   r)   r  r  r  )rD   r)   rE   rF   rG   r7   )r   r,  r  rz   ru   r   r  r   r  masked_scatterr  rC   rD   r)   rE   rF   )rj   r  r   r   rV   r)   r  r   r  r  r   r  r  image_attention_mask	lm_outputs                  r9   r   zJanusModel.forward[  sH    -t";<s   7D557	BM#22<@L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M~^M'D'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
r8   )	NNNNNNNNr   )r+   r,   r-   r    rZ   r,  r  r  r@   rX  rA   r  r   r   r   r   r
   r   r   r   r   r   r   s   @r9   r  r    s4   { *:8
"))":?:K:K"]b]n]n"0  15481537+/5959$(34.
E,,-.
 u001.
 !.	.

 u//0.
 "%.
 !!1!12.
   1 12.
 D>.
 c5<</0.
  .
r8   r  c                   h    e Zd ZddgZdZdef fdZd Zd Zde	j                  d	e	j                  fd
Zee	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                      dee	j                     dee	j                     dee   dee	j                     dee	j                      dee	j                     dee   deee	j                  f   dee   fd              Z	 	 	 	 	 	 d fd	Zde	j                  fdZe	j4                  	 	 	 ddee	j                     dee	j                     dee   f fd       Z xZS )JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr%   c                     t         |   |       || _        t        |      | _        t        j                  |j                  j                  |j                  j                  d      | _
        | j                          y )NFr   )rY   rZ   r%   r  r&   r   r   r  r[   
vocab_sizelm_headr%  ri   s     r9   rZ   z&JanusForConditionalGeneration.__init__  s\     '
yy!3!3!?!?ASASA^A^ejk 	r8   c                 J    | j                   j                  j                         S r   )r&   r  r,  r+  s    r9   r,  z2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??r8   c                 N    | j                   j                  j                  |       y r   )r&   r  r  r  s     r9   r  z2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=r8   inputsro   c                 r    | j                   j                  |      }| j                   j                  |      }|S r   )r&   r  r  )rj   r  rE  s      r9   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\Br8   r  r   r   rV   r)   r  r   labelsr  r  r   c                     | j                   d|||||||	|d|}|j                  }t        |
t              rt	        |
 d      n|
}| j                  |dd|ddf         }d}|4 | j                  d||| j                  j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   rV   r)   r   r  r  N)rL   r  r  )rK   rL   r)   rE   rF   rG   r7   )r&   rD   
isinstancer   slicer  loss_functionr%   r  r  rJ   r)   rE   rF   rG   )rj   r  r   r   rV   r)   r  r   r  r  r  r   outputsrE   slice_indicesrL   rK   s                    r9   r   z%JanusForConditionalGeneration.forward  s    , $** 

%)%+')

 

  118B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r8   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r)   r   r   r  r  r   r   )rY   prepare_inputs_for_generation)rj   r  r   r)   r   r   r  r  r   model_inputsrk   s             r9   r  z;JanusForConditionalGeneration.prepare_inputs_for_generation  sT     w<
+')))
 
 !!+7L(r8   rR  c                 x    | j                   j                  j                  |      }|j                  dddd      }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r   r   )r&   r  r  r{   )rj   rR  decoded_images      r9   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9r8   logits_processorc           	         |j                  d| j                        }t        j                  |      }|j                  dd      }|dk(  rt	        %|   d|||d d|S  |j                  di |}|j                         t        j                  t        j                  fvrt        d      |j                          | j                  |j                                ||n	t               }d|d<   |j                  t         j#                  d       d	|_        |j                  |d
<   | j%                  ||j&                  |      \  }}	}|j(                  |j*                  }}
t-        |j.                        dk7  rt        d|j.                   d      |d u}| j1                  |||j*                         |j                  r:|j                  dkD  r+|j3                  t5        |j                               d |_        | j7                  ||j.                  d   |d ||      } | j8                  d|||j:                  d|\  }}| j<                  j>                  j@                  jB                  }|j.                  \  }}|jE                  dd      }|j                  dd       }|jE                  dd      }||d<   ||d d d f   |j&                  k7  ||d d d f   |jF                  d   k7  z  }||d d d f   jI                  ||jJ                          | jM                         |      }| jO                  |||      }|jQ                  dd       @| jS                  |jT                  xs d|dz  tW        |jX                  ||z         |      |d<   t[        j\                  ||f|
|      }|j^                  }|j`                  }|jb                  }|jd                  }|jf                  }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }ti        |      D ]x  } | jj                  d||d|}|d   jm                  |j*                        |d<   |d   jm                  |j*                        |d<    | j<                  jn                  di |||d}| jq                  ||      }|jr                  d d dd d f   ju                         } | j<                  jw                  |       }! |||!      }"|jx                  r>t[        jz                  |"d      }#t[        j|                  |#d      j                  d      }$nt[        j                  |"d      }$|$|d d |f<   t[        j                  |$|$g      }$|$j                  d      }$| j                  |$      }{ |r@|r|!fz  }|r| j                         fz  }|r|j                  z  }|r|j                  z  }|rt        |!|||j                        S |S ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr)   static)cache_implementationr   max_cache_lenmodel_kwargsr  r7   )r   r  r  )output_attentionsoutput_hidden_statesrW   rH  )num_samples)	sequencesscoresrL   rF   rE   r)   )Ipopr  copydeepcopyrY   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  loggerwarning_prepare_model_inputsbos_token_idr   r  r  ru   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr&   r  r%   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr,  _get_initial_cache_positionget
_get_cacher  max
max_lengthr@   r	  r  r  output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationrD   cloner  	do_sampler   multinomialsqueezeargmaxr
  ry   r  floatrF   rE   r   r)   )&rj   r  r   r  r   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskr3  r   r   input_tokensmaskr   generated_tokensr  r  r=  r>  r?  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  rE  r   next_token_scoresprobs
next_tokenrk   s&                                        r9   r$  z&JanusForConditionalGeneration.generate  su    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   0(//9&9 002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N%22L5
1	#\ ")9)9vy1$MiooM^EF  %3$$>!$$%68QZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #E$"D"D #
))>>#
 	#
	<  ::2299JJ'oo
G ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW3113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@PSZ@Z[) /> /L*+ !;;
4D'EU[ab .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A=4== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*/djj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG )J #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r8   )
NNNNNNNNNr   )NNNNNN)NNN)r+   r,   r-   _tied_weights_keysr5   r    rZ   r,  r  r@   r   r  r   r   r   rX  rA   r
   r   r   r   r   r   r   r  r  no_gradr   r$  r   r   s   @r9   r  r    s   DFVW!{ @>ell u|| 
  15481537+/5959-1$(341
E,,-1
 u0011
 !.	1

 u//01
 "%1
 !!1!121
   1 121
 ))*1
 D>1
 c5<</01
 +,1
  1
l <
 
 ]] *.59:>	|$&|$ !!1!12|$ ##67	|$ |$r8   r  )r$   r  r  r  r!  )r   )Ur"  dataclassesr   typingr   r   r   r@   torch.nn.functionalr   r|   rT  activationsr	   cache_utilsr
   
generationr   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   autor   configuration_janusr    r!   r"   
get_loggerr+   r+  r$   r<   rC   rJ   r  rN   r   r   r   rF  r   r   r   r(   r   r  r  r  r!  r0  r=  rZ  rn  rz  r  r  r  r  r  r  r  r  r  __all__r7   r8   r9   <module>rg     sr  ,  ! , ,     !   u u 9 9 X X F & ] ] /  Q Q 
		H	% 
.? 
. 
. 
	7{ 	7 	7 
C; C C6 
C+ C C4HBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4I$299 I$XRYY ( 8  F@ @DI)RYY I)Xryy 2 D 1+ 1 1hBII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH :F% :F:Fz299 $RYY   
i
% i

i
Xt$$8/ t$n	 tr8   