
    <h                        S SK r S SKJr  S SKJrJrJr  S SKrS SKJr  SSK	J
r
  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJ r J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+  \#" 5       (       a	  S SK,Js  J-r.  \$R^                  " \05      r1\! " S S\5      5       r2\\!" SS9 " S S\5      5       5       r3\\!" SS9 " S S\5      5       5       r4\\!" SS9 " S S\5      5       5       r5 " S S \Rl                  5      r7S!\Rp                  S"\9S#\Rp                  4S$ jr: SWS%\Rl                  S&\Rp                  S'\Rp                  S(\Rp                  S)\\Rp                     S*\;S+\;S,\\    4S- jjr< " S. S/\Rl                  5      r= " S0 S1\Rl                  5      r> " S2 S3\5      r? " S4 S5\Rl                  5      r@\! " S6 S7\25      5       rA " S8 S9\Rl                  5      rB " S: S;\Rl                  5      rC " S< S=\Rl                  5      rD " S> S?\Rl                  5      rE " S@ SA\Rl                  5      rF " SB SC\Rl                  5      rG " SD SE\Rl                  5      rH " SF SG\Rl                  5      rI " SH SI\Rl                  5      rJ\!" SJS9 " SK SL\25      5       rK " SM SN\Rl                  5      rL " SO SP\Rl                  5      rM\!" SQS9 " SR SS\25      5       rN " ST SU\2\5      rO/ SVQrPg)X    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_availablelogging	torch_int   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   H    \ rS rSr% \\S'   SrSrSS/rSS/r	Sr
SrSrS	rS
rg)JanusPreTrainedModel8   configmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskF N)__name__
__module____qualname____firstlineno__r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignment__static_attributes__r,       `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/janus/modeling_janus.pyr$   r$   8   sB    &*#,.GH#4m"DN!(-%r;   r$   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   d    \ rS rSr% SrSr\\R                     \	S'   Sr
\R                  \	S'   Srg)JanusVQVAEOutputF   z
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    Reconstructed pixel values after encoding and decoding the input.
embedding_loss (`torch.FloatTensor`):
    Embedding loss.
Ndecoded_pixel_valuesembedding_lossr,   )r-   r.   r/   r0   __doc__rA   r   torchFloatTensorr1   rB   r:   r,   r;   r<   r?   r?   F   s/     9=(5#4#45<(,NE%%,r;   r?   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\\R                           \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
JanusBaseModelOutputWithPastX   a(  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.

    If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
    hidden_size)` is output.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
    `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
    encoder_sequence_length, embed_size_per_head)`.

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlast_hidden_stater*   hidden_states
attentionsimage_hidden_statesr,   )r-   r.   r/   r0   rC   rI   r   rD   rE   r1   r*   tuplerJ   rK   rL   r:   r,   r;   r<   rG   rG   X   s    , 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju00129>B%(9(9":;Br;   rG   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                   "   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   S
rg)JanusCausalLMOutputWithPast|   aV  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlosslogitsr*   rJ   rK   rL   r,   )r-   r.   r/   r0   rC   rQ   r   rD   rE   r1   rR   r*   listrJ   rM   rK   rL   r:   r,   r;   r<   rO   rO   |   s    $ )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju00129>B%(9(9":;Br;   rO   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\
S\R                  4S jjrSrU =r$ )JanusVisionEmbeddings   r&   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l
        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        U R                  S[         R"                  " U R                  5      R%                  S5      SS9  g )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   F)
persistent)super__init__r&   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferrD   arangeexpandselfr&   	__class__s     r<   rb   JanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr;   
embeddingsheightwidthreturnc                    UR                   S   nU R                  R                  R                   S   n[        R                  R                  5       (       d%  XE:X  a   X#:X  a  U R                  U R                  5      $ U R                  R                  R                  S5      nUR                   S   nX R                  -  nX0R                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   r_   g      ?r   r   bicubicF)sizemodealign_corners)shaperm   weightrD   jit
is_tracingr^   	unsqueezerf   r   reshapepermuter   
functionalinterpolateview)rr   ru   rv   rw   rj   rk   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r<   interpolate_pos_encoding.JanusVisionEmbeddings.interpolate_pos_encoding   s:    !&&q)//66<<Q? yy##%%+*F6?**4+<+<==1188BB1Er".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nr;   pixel_valuesr   c                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )N)dtyper   r   )
r~   ri   r   r   toflatten	transposer   rm   r^   )
rr   r   r   _rv   rw   target_dtypepatch_embedsru   
pos_embedss
             r<   forwardJanusVisionEmbeddings.forward   s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
r;   )r&   rd   re   rj   rk   ri   rf   rm   F)r-   r.   r/   r0   r!   rb   rD   Tensorintr   boolr   r:   __classcell__rs   s   @r<   rU   rU      sj    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i  r;   rU   rJ   n_reprx   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)r~   rp   r   )rJ   r   batchnum_key_value_headsslenhead_dims         r<   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr;   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr   r   r_   )r   r   )ptrainingr   )r   num_key_value_groupsrD   matmulr   r~   r   r   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsr+   attn_outputs                r<   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r;   c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\   4S jjrS	rU =r$ )JanusVisionAttentioni  z(Attention Class for Janus Vision Encoderr&   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      Fr   biasr   )ra   rb   r&   rc   rd   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)rr   r&   proj_dropoutqk_normrs   s       r<   rb   JanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r;   rJ   r   r   c                 <   UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	U4U R                  (       d  SOU R                   U R"                  U R$                  S.UD6u  pUR	                  XEU R&                  5      nU R)                  U5      nU R+                  U5      nX4$ )Nr_   r   r   eager        )r   r   r   )r{   r   r   r   r   r   r   r   r   r   r   r   r&   _attn_implementationr   r   r   r   r   rd   r   r   )rr   rJ   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputs                 r<   r   JanusVisionAttention.forward.  s    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0##r;   )r   r&   rd   r   r   r   r   r   r   r   r   r   r   r   r   N)r-   r.   r/   r0   rC   r!   rb   rD   r   r   r   r   r   r:   r   r   s   @r<   r   r     sT    2Q0 Q@ 26)$||)$ !.)$ +,	)$ )$r;   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPiZ  r&   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g r   )ra   rb   r&   r   rc   	mlp_ratiointermediate_sizer	   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2rq   s     r<   rb   JanusVisionMLP.__init__[  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r;   rJ   rx   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r   r   r   r   r   rr   rJ   s     r<   r   JanusVisionMLP.forwarde  sP    /**=9m4/m4r;   )r   r&   r   r   r   r   r   )r-   r.   r/   r0   r!   rb   rD   r   r   r:   r   r   s   @r<   r   r   Z  s0    ?0 ?U\\ ell  r;   r   c            
          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S\\	   S\
\R                     4S jjrS	rU =r$ )r)   in  r&   c                 H  > [         TU ]  5         UR                  U l        [        R
                  " U R                  UR                  S9U l        [        U5      U l	        [        R
                  " U R                  UR                  S9U l
        [        U5      U l        Xl        g N)eps)ra   rb   rc   rd   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr&   rq   s     r<   rb    JanusVisionEncoderLayer.__init__o  sr    ++<<F<Q<QR-f5<<F<Q<QR!&)r;   rJ   r   output_attentionsrx   c                     UnU R                  U5      nU R                  UUUS9u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU4nU(       a  Xe4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)rJ   r   r   )r   r   r   r   )rr   rJ   r   r   residualr   outputss          r<   r   JanusVisionEncoderLayer.forwardx  s      !((7&*nn')/ '5 '
#
 !0 ((7/ 0 "&Gr;   )r&   rd   r   r   r   r   r   )r-   r.   r/   r0   r!   rb   rD   r   r   r   rM   rE   r   r:   r   r   s   @r<   r)   r)   n  s\    0  -2	$||$ $ $D>	$
 
u  	!$ $r;   r)   c            
          ^  \ rS rSrSrS\4U 4S jjr\   SS\\	R                     S\\   S\\   S\4S	 jj5       rS
rU =r$ )JanusVisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`JanusVisionEncoderLayer`].

Args:
    config: JanusVisionConfig
r&   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
ra   rb   r&   r   
ModuleListrangenum_hidden_layersr)   layersgradient_checkpointingrr   r&   r   rs   s      r<   rb   JanusVisionEncoder.__init__  sT    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A%r   r   output_hidden_statesrx   c                 F   Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUnU R                   H-  nU(       a  XW4-   nU" UUUS9n	U	S   nU(       d  M%  XiS   4-   nM/     U(       a  XW4-   n[	        UUUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr,   )r   r   r   )rI   rJ   rK   )r&   r   r  r  r   )
rr   inputs_embedsr   r   r  encoder_statesall_attentionsrJ   encoder_layerlayer_outputss
             r<   r   JanusVisionEncoder.forward  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	  40d%![[M#!/2B!B)"3M *!,M  !/3C2E!E )  +.>>N+(%
 	
r;   )r&   r  r  NNN)r-   r.   r/   r0   rC   r!   rb   r   r   rD   r   r   r   r   r:   r   r   s   @r<   r  r    sm    ,0 ,  26,0/3<
 !.<
 $D>	<

 'tn<
 
<
 <
r;   r  c                      ^  \ rS rSr% Sr\\S'   S\4U 4S jjr\     SS\	\
R                     S\	\   S\	\   S\	\   S\S	\\\4   4S
 jj5       rS rSrU =r$ )JanusVisionModeli  r   r&   c                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )ra   rb   r&   rc   rU   ru   r  encoderr   r   r   post_layernorm	post_init)rr   r&   rd   rs   s      r<   rb   JanusVisionModel.__init__  sY     &&	/7)&1 ll9:O:OPr;   r   r  return_dictr   rx   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  UUUUS9nUS   nU R                  U5      nUS S 2SS S 24   n	U R                  U	5      n	U(       d	  X4USS  -   $ [        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r   )r  r   r  r  r   r   )rI   pooler_outputrJ   rK   )r&   r   r  use_return_dictr   ru   r  r  r   rJ   rK   )
rr   r   r   r  r  r   rJ   encoder_outputsrI   pooled_outputs
             r<   r   JanusVisionModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h,,'/!5#	 ' 
 ,A. //0AB)!Q'2++M:%58KKK)/')77&11	
 	
r;   c                     U R                   $ r   )ru   rr   s    r<   get_input_embeddings%JanusVisionModel.get_input_embeddings*  s    r;   )r&   ru   r  r  )NNNNF)r-   r.   r/   r0   main_input_namer!   r1   rb   r   r   rD   rE   r   r   rM   r   r   r(  r:   r   r   s   @r<   r  r    s    $O	0 	  59,0/3&*).(
u001(
 $D>(
 'tn	(

 d^(
 #'(
 
u00	1(
 (
T r;   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPi.  r&   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ sH.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nr   )ra   rb   r   r   rc   projection_dimr   r  r	  depthhidden_layersr	   r   r   r  s      r<   rb   JanusVisionAlignerMLP.__init__/  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (4Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   r   r1  r   rr   rJ   layers      r<   r   JanusVisionAlignerMLP.forward8  B    /''E ..}=M!-0M ( r;   r   r   r1  )	r-   r.   r/   r0   r!   rb   r   r:   r   r   s   @r<   r,  r,  .  s    70 7 r;   r,  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	S\R                  S\R                  4S	 jrS
rU =r$ )JanusVQVAEVectorQuantizeri@  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r&   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        USS5      U l        [        R                  " U R                  U R                  5      U l	        UR                  /S-  U l        g )Nbetag      ?r   )ra   rb   num_embeddingsrd   embedding_dimgetattrr>  r   rl   	embeddingrj   quant_state_dimsrq   s     r<   rb   "JanusVQVAEVectorQuantizer.__init__K  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8r;   hidden_statec           
      >   UR                  SSSS5      R                  5       nUR                  SU R                  5      n[        R
                  " US-  SSS9[        R
                  " U R                  R                  S-  SS9-   S[        R                  " S	X R                  R                  R                  SS5      5      -  -
  n[        R                  " USS9nU R                  U5      R                  UR                  5      n[        R                  " UR                  5       U-
  S-  5      U R                  [        R                  " XQR                  5       -
  S-  5      -  -   nXU-
  R                  5       -   nUR                  SSSS5      R                  5       nXVU4$ )
Nr   r   r   r   r_   T)r   keepdimr   z	bd,dn->bn)r   r   r   r@  rD   sumrB  r   einsumr   argminr~   meandetachr>  )rr   rE  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrQ   s          r<   r   !JanusVQVAEVectorQuantizer.forwardT  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e"5"5"77A=\
 P
 

 *,-N,V,V,XX 0771aCNNP!)===r;   image_tokensrx   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   r_   r   )r   r   r   r   )	r~   rB  r   F	normalizer   rC  r   r   )rr   rS  r   emb_dimrQ  s        r<   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entryo  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r;   )r>  rB  r@  r?  rC  )r-   r.   r/   r0   rC   r"   rb   rD   r   r   
LongTensorrE   rX  r:   r   r   s   @r<   r<  r<  @  sI    9/ 9>ELL >6"u/?/? "EDUDU " "r;   r<  c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )JanusVQVAEResnetBlocki  c                   > [         TU ]  5         X l        Uc  UOUU l        X@l        [
        R                  R                  SUSSS9U l        [
        R                  R                  X#SSSS9U l
        [
        R                  R                  SUSSS9U l        [
        R                  R                  UR                  5      U l        [
        R                  R                  X3SSSS9U l        U R                  U R                  :w  a]  U R                  (       a&  [
        R                  R                  X#SSSS9U l        g [
        R                  R                  X#SSSS9U l        g g )	N    ư>T
num_groupsrh   r   affiner   r   r[   r\   r]   r   )ra   rb   rY   rZ   use_conv_shortcutrD   r   	GroupNormnorm1rg   conv1norm2r   r   conv2conv_shortcutnin_shortcut)rr   r&   rY   rZ   rj  rs   s        r<   rb   JanusVQVAEResnetBlock.__init__  s%    	&+7+?K\!.XX''2KUYbf'g
XX__[AVWab_c
XX''2LVZcg'h
xx''7XX__\QWXbc_d
t000%%%*XX__[\]fgqr_%s"$)HHOOK[\efpqO$r!	 1r;   c                    UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U5      nU R                  U R                  :w  a7  U R                  (       a  U R                  U5      nX!-   $ U R                  U5      nX!-   $ r   )rf  rD   sigmoidrg  rh  r   ri  rY   rZ   rd  rj  rk  )rr   rJ   r  s      r<   r   JanusVQVAEResnetBlock.forward  s     

=1}55

=1

=1}55]3

=1t000%%--h7 ''  ,,X6''r;   )
rg  ri  rj  r   rY   rk  rf  rh  rZ   rd  r  r-   r.   r/   r0   rb   r   r:   r   r   s   @r<   r\  r\    s    
 s.( (r;   r\  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEAttnBlocki  c                   > [         TU ]  5         Xl        [        R                  R                  SUSSS9U l        [        R                  R                  XSSSS9U l        [        R                  R                  XSSSS9U l	        [        R                  R                  XSSSS9U l
        [        R                  R                  XSSSS9U l        g )Nr^  r_  Tr`  r   r   rc  )ra   rb   rY   rD   r   re  normrg   qkvproj_outrr   rY   rs   s     r<   rb   JanusVQVAEAttnBlock.__init__  s    &HH&&";TXae&f	qQR\]^qQR\]^qQR\]^aXYcder;   c                 Z   UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  u  pgpUR                  XgX-  5      R                  SSS5      nUR                  XgX-  5      n[        R                  " X45      n
U
[        U5      S-  -  n
[        R                  " U
SS9n
UR                  XgX-  5      nU
R                  SSS5      n
[        R                  " XZ5      R                  XgX5      nU R                  U5      nX+-   $ )Nr   r   r   r   rH  )rt  ru  rv  rw  r~   r   r   rD   bmmr   rU  r   rx  )rr   rJ   r  r   r   r   r   channelsrv   rw   r   r   s               r<   r   JanusVQVAEAttnBlock.forward  s    		-0vvm,VVM*
vvm, /;.@.@+
f#++J&.QYYZ[]^`ab''
fnM
yy:#s8}'>?yy15 $++J&.Q#++Aq!4ii;CCJZ`hmmK0%%r;   )rY   rv  rt  rx  ru  rw  rp  r   s   @r<   rr  rr    s    f& &r;   rr  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvDownsamplei  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r   r   rc  )ra   rb   r   rg   convry  s     r<   rb   !JanusVQVAEConvDownsample.__init__  s%    IIkAaYZ[	r;   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r   r   r   constantr   )padr|   r   )rU  r  r  r   s     r<   r    JanusVQVAEConvDownsample.forward  s+    mJVWX		-0r;   r  rp  r   s   @r<   r  r    s    \ r;   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsamplei  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr   r   rc  )ra   rb   rD   r   rg   r  ry  s     r<   rb   JanusVQVAEConvUpsample.__init__  s,    HHOOK!TU_`Oa	r;   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factorr|   )rU  r   r  r   s     r<   r   JanusVQVAEConvUpsample.forward  s(    m#IV		-0r;   r  rp  r   s   @r<   r  r    s    b r;   r  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlocki  r&   r}  c                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr&   rY   rZ   )ra   rb   r\  block_1rr  attn_1block_2)rr   r&   r}  rs   s      r<   rb   JanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
r;   rJ   rx   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r   s     r<   r   JanusVQVAEMidBlock.forward  s2    ]3M2]3r;   )r  r  r  )r-   r.   r/   r0   r"   r   rb   rD   r   r   r:   r   r   s   @r<   r  r    s7    
/ 
3 
U\\ ell  r;   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )JanusVQVAEEncoderi  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  X2SSSS9U l        S[        U5      -   nXpl        [        R                   " 5       U l        [%        U R                  5       GH   n[        R                   " 5       n	[        R                   " 5       n
X'U   -  nX&U   -  n[%        U R
                  5       HM  nU	R'                  [)        UUUS95        UnXR                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [        R,                  " 5       nXl        Xl        XR                  S-
  :w  a  [3        U5      Ul        U R"                  R'                  U5        GM     [7        UW5      U l        [        R                  R;                  SUSSS	9U l        [        R                  R                  UU(       a  S
U-  OUSSSS9U l        g )Nr   r   rc  )r   r  r^  r_  Tr`  r   ) ra   rb   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrY   double_latentlatent_channelsrD   r   rg   conv_inrM   in_channel_multiplierr  downr	  appendr\  rr  Moduleblockattnr  
downsampler  midre  norm_outconv_out)rr   r&   r  rY   r  r  r  r  i_levelr  r  block_in	block_outi_blockr  rs   s                  r<   rb   JanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
r;   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr_   r   r   )r  r	  r  r  r  r  r  r  r  r  r  r  rD   rn  r  )rr   r   rJ   r  r  rE  rI   s          r<   r   JanusVQVAEEncoder.forward/  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  r;   )r  r  r  r  r  r  r  r  )
r-   r.   r/   r0   rb   rD   rZ  r   r:   r   r   s   @r<   r  r    s     1
f!E$4$4 ! !r;   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderiH  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nr   r   rc  r  r   r^  r_  Tr`  )ra   rb   r  r  r  r  r  r  rZ   rD   r   rg   r  r  r  r  upreversedr	  r  r\  rr  r  r  r  r  upsamplere  r  r  )rr   r&   r  r  rZ   r  r  r  r  r  r  r  rs   s               r<   rb   JanusVQVAEDecoder.__init__I  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcr;   rE  rx   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr   r   )r  r  r	  r  r  r  r  r  r  r  r  rD   rn  r  )rr   rE  r  r  s       r<   r   JanusVQVAEDecoder.forwardw  s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2r;   )r  r  r  r  r  r  r  )
r-   r.   r/   r0   rb   rD   rE   r   r:   r   r   s   @r<   r  r  H  s.    ,d\E$5$5 %:K:K  r;   r  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                     ^  \ rS rSr% \\S'   / SQrSrS\4U 4S jjrS\	R                  4S jrS\	R                  S\	R                  4S	 jr\\S\	R                  S\\	R                  \	R                  4   4S
 j5       5       rSrU =r$ )
JanusVQVAEi  r&   )rr  r\  r<  r   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  R                  UR                  UR                  S5      U l        [        R                  R                  UR                  UR                  S5      U l        U R                  5         [        U5      U l        SU l        U R#                  5         g )Nr   F)ra   rb   r  r  r<  quantizerD   r   rg   r  rd   
quant_convpost_quant_convevalr  decoderr  r  rq   s     r<   rb   JanusVQVAE.__init__  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+# 	r;   c                 v    U R                  U5      nU R                  U5      nU R                  U5      u  p4nX4U4$ r   )r  r  r  )rr   r   rJ   quantemb_lossindicess         r<   encodeJanusVQVAE.encode  s<    \26#'==#? ''r;   rS  rx   c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r~   r  rC  r   rX  r  r  )rr   rS  codebook_entryrJ   r   s        r<   decodeJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r;   c                     UR                   S   nU R                  U5      u  p4nU R                  UR                  US5      5      n[	        Xd5      $ )Nr   r_   )r~   r  r  r   r?   )rr   r   r   r  rB   r  rA   s          r<   r   JanusVQVAE.forward  sM     "''*
)-\)B&w#{{7<<
B+GH 4EEr;   )r  r  r  r  r  r  )r-   r.   r/   r0   r"   r1   r4   r*  rb   rD   rZ  r  rE   r  r   r   rM   r   r:   r   r   s   @r<   r  r    s     
 %O/ (5#3#3 (5#3#3 8I8I & F''F 
u  %"3"33	4F  Fr;   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPi  r&   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ sH.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf r.  )ra   rb   r   r   rd   r/  r   r  r	  r
  r1  r	   r   r   r  s      r<   rb   JanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rr3  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   r5  r6  s      r<   r   JanusVQVAEAlignerMLP.forward  r9  r;   r:  )	r-   r.   r/   r0   r"   rb   r   r:   r   r   s   @r<   r  r    s    7/ 7 r;   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadi  zOHead used for sampling tokens in image generation, replacing the usual lm head.r&   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r   )ra   rb   r   r   image_token_embed_dimr/  rx  r	   r   r   r?  vision_headrq   s     r<   rb   JanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr;   rJ   rx   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rx  r   r  r   s     r<   r   JanusVQVAEHead.forward  s6    m4**=9((7r;   )r   rx  r  )r-   r.   r/   r0   rC   r"   rb   rD   r   tensorr   r:   r   r   s   @r<   r  r    s5    YS/ SU\\ ell  r;   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                     ^  \ rS rSrS\4U 4S jjrS rS rS rS\	R                  S\	R                  S	\	R                  4S
 jr\\         SS\	R                  S\	R                  S\\	R                      S\\	R                     S\\   S\\	R                     S\\	R                     S\\   S\\\	R                   4   4S jj5       5       rSrU =r$ )
JanusModeli  r&   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r&   F)ra   rb   r&   r  _from_configvision_configvision_modelr,  alignerr  	vq_configvqmodelr   rl   r?  rd   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr  r  rq   s     r<   rb   JanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r;   c                 6    U R                   R                  5       $ r   )r  r(  r'  s    r<   r(  JanusModel.get_input_embeddings  s    ""7799r;   c                 :    U R                   R                  U5        g r   )r  set_input_embeddingsrr   r   s     r<   r  JanusModel.set_input_embeddings  s    007r;   c                 ^    U R                  U5      nU R                  UR                  5      nU$ r   )r  r  rI   )rr   r   image_embedss      r<   get_image_featuresJanusModel.get_image_features  s,    ((6||L$B$BCr;   	input_idsr  image_featuresc           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nX$   R                  5       UR                  5       :w  a0  UR                  S   UR                  S   -  n[        SU SU 35      eU$ )z
Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   devicer_   r   r   z6Image features and image tokens do not match: tokens: z, features )r(  rD   r  r&   image_token_idlongr  allrI  r   	expand_asr   numelr~   r   )rr   r  r  r  special_image_maskn_image_tokensn_image_featuress          r<   get_placeholder_maskJanusModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno,2248L8L8NN-33A69M9Ma9PPHHXXcdtcuv  "!r;   r   r   r^   r*   cache_position	use_cachelogits_to_keepc
                    US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbw  U R                  U5      nUR                  SUR                  S   5      nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " SUUUUUUU	S.U
D6n[        UR                  UR                  UR                  UR                  Ub  WS9$ S S9$ )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner_   )r  r  )r  r   r^   r*   r  r  r  )rI   r*   rJ   rK   rL   r,   )r   r(  r  r   r~   r   r  r   r  masked_scatterr  rG   rI   r*   rJ   rK   )rr   r  r   r   r^   r*   r  r  r  r  r   r  r  image_attention_mask	lm_outputs                  r<   r   JanusModel.forward1  s@    -t";<s    557	BM#22<@L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M^M'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
r;   )	r  r&   r  r  r  r  r  r  r  )	NNNNNNNNr   )r-   r.   r/   r0   r    rb   r(  r  r  rD   rZ  rE   r  r   r   r   r   r
   r   r   r   r   r:   r   r   s   @r<   r  r    s*   { *:8
"))":?:K:K"]b]n]n"0  '+*.1537+/5959$(34.
##.
 ''.
 !.	.

 u//0.
 "%.
 !!1!12.
   1 12.
 D>.
 c5<</0.
  .
r;   r  c                     ^  \ rS rSrSS/rSrS\4U 4S jjrS rS r	S	\
R                  S
\
R                  4S jrS rS r\\          S S\
R"                  S\
R$                  S\\
R                     S\\
R"                     S\\   S\\
R"                     S\\
R$                     S\\
R"                     S\\   S\\\
R                  4   S\\   4S jj5       5       r      S!U 4S jjrS\
R                  4S jr\
R:                     S"S	\
R                  S\\
R"                     S\\   4U 4S jjj5       rSr U =r!$ )#JanusForConditionalGenerationid  z(model.language_model.embed_tokens.weightzlm_head.weightTr&   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )ra   rb   r&   r  r'   r   r   r  rc   
vocab_sizelm_headr  rq   s     r<   rb   &JanusForConditionalGeneration.__init__h  sZ     '
yy!3!3!?!?ASASA^A^ejk 	r;   c                 J    U R                   R                  R                  5       $ r   )r'   r  r(  r'  s    r<   r(  2JanusForConditionalGeneration.get_input_embeddingsq  s    zz((==??r;   c                 N    U R                   R                  R                  U5        g r   )r'   r  r  r  s     r<   r  2JanusForConditionalGeneration.set_input_embeddingst  s    

!!66u=r;   inputsrx   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r   )r'   r  r  )rr   r  rE  s      r<   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generationw  s0    zz77?zz44\Br;   c                     Xl         g r   r'   )rr   r  s     r<   set_decoder)JanusForConditionalGeneration.set_decoder|  s    
r;   c                     U R                   $ r   r$  r'  s    r<   get_decoder)JanusForConditionalGeneration.get_decoder  s    zzr;   r  r   r   r^   r*   r  r  labelsr  r  r   c                    U R                   " SUUUUUUU	US.UD6nUR                  n[        U
[        5      (       a  [	        U
* S5      OU
nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
)r  r   r   r^   r*   r  r  r  N)rR   r*  r  )rQ   rR   r*   rJ   rK   rL   r,   )r'   rI   
isinstancer   slicer  loss_functionr&   r  r  rO   r*   rJ   rK   rL   )rr   r  r   r   r^   r*   r  r  r*  r  r  r   r  rJ   slice_indicesrR   rQ   s                    r<   r   %JanusForConditionalGeneration.forward  s    , ** 

%)%+')

 

  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
r;   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  X)S'   U	$ )N)r*   r  r   r  r  r   r   )ra   prepare_inputs_for_generation)rr   r  r   r*   r   r  r  r  r   model_inputsrs   s             r<   r2  ;JanusForConditionalGeneration.prepare_inputs_for_generation  sR     w<
+')))
 
 !!+7(r;   rS  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r   r   r   )r'   r  r  r   )rr   rS  decoded_images      r<   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9r;   logits_processorc           	      N  > UR                  SU R                  5      n[        R                  " U5      nUR                  SS5      nUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR                  " S0 UD6nUR                  5       [        R                  [        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [         R#                  S5        S	Ul        UR                  US
'   U R%                  XR&                  U5      u  pnUR(                  UR*                  p[-        UR.                  5      S:w  a  [        SUR.                   S35      eUS LnU R1                  X\UR*                  S9  UR                  (       a;  UR                  S:  a+  UR3                  [5        UR                  5      5        S Ul        U R7                  UUR.                  S   US UUS9nU R8                  " SUUUR:                  S.UD6u  pU R<                  R>                  R@                  RB                  nUR.                  u  pURE                  SS5      nUR                  SS 5      nURE                  SS5      nX'S'   UUS 2S S 24   UR&                  :g  UUS 2S S 24   URF                  S   :g  -  nUUS 2S S 24   RI                  UURJ                  5        U RM                  5       " U5      nU RO                  XU5      nURQ                  SS 5      cB  U RS                  URT                  =(       d    SUS-  [W        URX                  X-   5      UUS9US'   [Z        R\                  " X4XS9nUR^                  nUR`                  nURb                  nURd                  nURf                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[i        U5       GHy  nU Rj                  " SUUS.UD6nUS   Rm                  UR*                  5      US'   US   Rm                  UR*                  5      US'   U R<                  Rn                  " S0 UDUUS.D6nU Rq                  UU5      nURr                  S S 2SS S 24   Ru                  5       n U R<                  Rw                  U 5      n!U" UU!5      n"URx                  (       a:  [Z        Rz                  " U"SS9n#[Z        R|                  " U#SS9R                  S5      n$O[Z        R                  " U"SS9n$U$US S 2U4'   [Z        R                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGM|     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ ) Ngeneration_configgeneration_modetext)r  r   r;  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r>  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r   )r;  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr9  r  )r  r   expand_sizer   boi_token_idr*   static)cache_implementationr   max_cache_lenr  model_kwargsr  r,   )r  r  r  )r   r  r_   rH  )num_samples)	sequencesscoresrR   rK   rJ   r*   )Ipopr;  copydeepcopyra   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r>  loggerwarning_prepare_model_inputsbos_token_idr   r  r  r~   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr'   r  r&   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr(  _get_initial_cache_positionget
_get_cacherF  max
max_lengthrD   zerosr   r  output_scoresoutput_logitsreturn_dict_in_generater	  r2  r   r  #_update_model_kwargs_for_generationrI   cloner  	do_sampler   multinomialsqueezeargmaxcatr   r!  floatrK   rJ   r   r*   )&rr   r  r   r9  r   r;  r<  rH  r  model_input_namer   r  kwargs_has_attention_maskr^  r   r   input_tokensmaskr  generated_tokensr   r  ri  rj  rk  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir3  r  rE  rK  next_token_scoresprobs
next_tokenrs   s&                                        r<   rO  &JanusForConditionalGeneration.generate  sP    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   )//9&9 002>;P;PR`RnRn:ool  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@P@Z[) /> /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG )J #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r;   )r&   r  r'   )
NNNNNNNNNr   )NNNNNNr  )"r-   r.   r/   r0   _tied_weights_keysr8   r    rb   r(  r  rD   r   r!  r%  r(  r   r   rZ  rE   r   r
   r   r   r   r   r   r   r2  r7  no_gradr   rO  r:   r   r   s   @r<   r  r  d  s   DFVW!{ @>ell u|| 
  '+*.1537+/5959-1$(341
##1
 ''1
 !.	1

 u//01
 "%1
 !!1!121
   1 121
 ))*1
 D>1
 c5<</01
 +,1
  1
l <
 
 ]]  $59:>	}$}$ !!1!12}$ ##67	}$ }$r;   r  )r$   r  r  r  r  )r   )QrM  dataclassesr   typingr   r   r   rD   r   activationsr	   cache_utilsr
   
generationr   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   autor   configuration_janusr    r!   r"   torch.nn.functionalr   rU  
get_loggerr-   rV  r$   r?   rG   rO   r  rU   r   r   r   rs  r   r   r   r)   r  r  r,  r<  r\  rr  r  r  r  r  r  r  r  r  r  r  __all__r,   r;   r<   <module>r     sM  ,  ! , ,   !   u u 9 9 X X F &   Q Q ## 
		H	% 
.? 
. 
. 
	-{ 	- 	- 
C; C C< 
C+ C C6HBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4I$299 I$XRYY (.8 .bM
 M
` ;+ ; ;|BII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH :F% :F:Fz299 $RYY   
i
% i

i
X{$$8/ {$|	 tr;   