
    <hj                       S SK r S SKJr  S SKJr  S SKJrJrJr  S SK	r
S SKrS SKJr  S SKJr  SSKJr  SS	KJr  SS
KJrJrJrJr  SSKJr  SSKJrJr  SSKJrJr  SSK J!r!J"r"J#r#J$r$J%r%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,  SSK-J.r.  SSK/J0r0J1r1J2r2J3r3J4r4J5r5  SSK6J7r7  SSK8J9r9  SSK:J;r;  SSK<J=r=J>r>J?r?J@r@JArA  SSKBJCrCJDrD  SSKEJFrF  SSKGJHrH  SSKIJJrJJKrKJLrL  \3" 5       (       a  S SKrS SKMJr  S SKNJs  JOrP  S SKQr\4" 5       (       a  S SKRrRSSKSJTrT  SSK6JUrUJVrV  \5R                  " \X5      rY " S S\H5      rZ " S  S!\;5      r[ " S" S#\T5      r\\1 " S$ S%\,5      5       r]\\1" S&S'9 " S( S)\)5      5       5       r^ " S* S+\C5      r_ " S, S-\D5      r` " S. S/\L5      ra " S0 S1\R                  5      rc " S2 S3\R                  5      rd " S4 S5\K5      re " S6 S7\J5      rf " S8 S9\95      rg " S: S;\R                  5      rh " S< S=\A5      ri " S> S?\@5      rj " S@ SA\>5      rk " SB SC\?5      rl " SD SE\R                  5      rm " SF SG\R                  5      rn " SH SI\R                  5      ro " SJ SK\R                  5      rp " SL SM\=5      rq " SN SO\R                  5      rr " SP SQ\R                  5      rs\1" SRS'9 " SS ST\]5      5       rt " SU SV\]\5      ru " SW SX\5      rv/ SYQrwg)Z    N)Iterable)	dataclass)CallableOptionalUnion)nn)BlipImageProcessor   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BatchFeatureget_size_dict)resizeto_channel_dimension_format)ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatmake_list_of_imagesto_numpy_array)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_availableis_vision_availablelogging   )	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddings)PretrainedConfig)CONFIG_MAPPING
AutoConfigc                   \   ^  \ rS rSrSrSrSr                  SU 4S jjrSrU =r	$ )JanusVisionConfigW   a^
  
This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
`JanusVisionModel` according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    image_size (`int`, *optional*, defaults to 384):
        The size (resolution) of each image.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for attention weights.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, and `"gelu_new"` are supported.
    mlp_ratio (`float`, *optional*, defaults to 4.0):
        Ratio of MLP hidden dimensionality to embedding dimensionality.
    attention_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to the queries, keys, and values in the attention layers.
    hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
        The dropout probability for fully connected layers in the encoder.
    projection_dim (`int`, *optional*, defaults to 2048):
        Dimensionality of the MLP projection head.
    projection_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for the projection layer.
    use_qk_norm (`bool`, *optional*, defaults to `False`):
        Whether to normalize the query and key matrices.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.
    depth (`int`, *optional*, defaults to 2):
        Number of hidden layers in the aligner module.
    num_image_tokens (`int`, *optional*, defaults to 576):
        Number of image tokens.
janus_vision_modelvision_configc                    > [         TU ]  " SUUUUUUUUU	S.	UD6  U ?Xl        Xl        Xl        Xl        Xl        Xl        UU l	        UU l
        UU l        g )N)	hidden_sizenum_hidden_layersnum_attention_headsnum_channels
patch_size
image_sizeattention_dropoutlayer_norm_eps
hidden_act )super__init__intermediate_size	mlp_ratioattention_biashidden_dropout_rateprojection_dimprojection_dropoutuse_qk_norminitializer_rangedepthnum_image_tokens)selfr@   rA   rB   rC   rD   rE   rF   rG   rH   rM   rN   rO   rP   rQ   rR   rS   rT   rU   kwargs	__class__s                       _/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/janus/modular_janus.pyrK   JanusVisionConfig.__init__   s~    , 	 	
#/ 3%!!/)!	
 	
 "",#6 ,"4&!2
 0    )	rN   rT   rO   rS   rM   rU   rP   rQ   rR   )i         r
   r]             ư>gelug      @Tr_      r_   F{Gz?r'   i@  )
__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyrK   __static_attributes____classcell__rX   s   @rY   r;   r;   W   sW    ,\ &J%O ',1 ,1r[   r;   c                      ^  \ rS rSrSrSSSSSSSS	/ S
QSSSSSSS4S\S\S\S\S\S\S\S\S\\   S\S\4U 4S jjjr	Sr
U =r$ )JanusVQVAEConfig   a	  
This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
`JanusVQVAEModel` according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a
configuration with the defaults will yield a similar configuration to the VQModel of the
[deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).

Args:
    embed_dim (`int`, *optional*, defaults to 8):
        Dimensionality of each embedding vector.
    num_embeddings (`int`, *optional*, defaults to 16384):
        Number of codebook embeddings.
    double_latent (`bool`, *optional*, defaults to `False`):
        Whether to use double z channels.
    latent_channels (`int`, *optional*, defaults to 256):
        Number of channels for the latent space.
    num_patches (`int`, *optional*, defaults to 32):
        Num of patches the input images can be divided into.
    in_channels (`int`, *optional*, defaults to 3):
        Number of input channels.
    out_channels (`int`, *optional*, defaults to 3):
        Number of out channels.
    base_channels (`int`, *optional*, defaults to 128):
        Base channel count.
    channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
        Channel multipliers for each resolution.
    num_res_blocks (`int`, *optional*, defaults to 2):
        Number of residual blocks.
    dropout (`float`, *optional*, defaults to 0.0):
        Dropout rate.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    projection_dim (`int`, *optional*, defaults to 2048):
        Dimensionality of the MLP projection head.
    num_hidden_layers (`int`, *optional*, defaults to 2):
        Number of hidden layers in VAVAE MLP Connecter module.
    hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    image_token_embed_dim (`int`, *optional*, defaults to 2048):
        Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
   i @  F       r
      )   ru   r'   r'      r'   r_   rc   rb   ra   	embed_dimnum_embeddingsdouble_latentlatent_channelsnum_patchesin_channelsout_channelsbase_channelschannel_multipliernum_res_blocksdropoutc                    > [         TU ]  " SUUUUUUU	U
UUS.
UD6  XPl        Xpl        Xl        Xl        Xl        UU l        U ?U ?	U ?
g )N)
rw   rx   ry   rz   r|   r~   r   r   r   rS   rI   )rJ   rK   r{   r}   rP   rA   rH   image_token_embed_dim
resolutionattn_resolutions	attn_type)rV   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   rS   rP   rA   rH   r   rW   rX   s                     rY   rK   JanusVQVAEConfig.__init__   sv    ( 	 	
)'+#'1)/	
 	
 '(,!2$%:"O!Nr[   )rH   r   rA   r{   r}   rP   )rd   re   rf   rg   rh   intboollistfloatrK   rk   rl   rm   s   @rY   ro   ro      s    *\ ##" (7"#** * 	*
 * * * * * !I* * * *r[   ro   c                   H   ^  \ rS rSrSrSr\\\S.r	    SU 4S jjr
SrU =r$ )JanusConfigi  a  
This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.

e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
[deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
        The config object or dictionary of the text backbone.
    vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
        The config object or dictionary of the vision backbone.
    vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
        The config object or dictionary of the VQVAE backbone.
    image_token_id (`int`, *optional*, defaults to 100581):
        Token index of a placeholder image token.

Example:

```python
>>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

>>> # Initializing a Janus vision config
>>> vision_config = JanusVisionConfig()

>>> # Initializing a Llama config
>>> text_config = LlamaConfig()

>>> # Initializing a VQ config
>>> vq_config = JanusVQVAEConfig()

>>> # Initializing a Janus Pro 1B style configuration
>>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

>>> # Initializing a model from the Janus Pro 1B style configuration
>>> model = JanusForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```janus)text_configr>   	vq_configc                 &  > [        U[        5      (       a-  UR                  SS5      US'   [        US      " S	0 UD6U l        O_Uc)  [
        R                  S5        [        S   " 5       U l        O3[        U[        5      (       a  Xl        O[        S[        U5       35      eUc%  [
        R                  S5        [        5       U l        OY[        U[        5      (       a  [        S	0 UD6U l        O3[        U[        5      (       a  X l        O[        S[        U5       35      eUc%  [
        R                  S5        [        5       U l        OY[        U[        5      (       a  [        S	0 UD6U l        O3[        U[        5      (       a  X0l        O[        S[        U5       35      eU R                  R                  U l        U R                  R                  U R                  R                   -  U R                  l        X@l        [&        TU ]P  " S	0 UD6  g )
Nri   llamaz7`text_config` is None. Initializing with default valueszTInvalid type for `text_config`. Must be either `dict` or `LlamaConfig`. Type found: zK`vision_config` is None. Initializing with default JanusVisionConfig valuesz\Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`. Type found: zF`vq_config` is None. Initializing with default JanusVQVAEConfig valueszWInvalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`. Type found: rI   )
isinstancedictgetr8   r   loggerinfor7   
ValueErrortyper;   r>   ro   r   rS   rE   rD   r{   image_token_idrJ   rK   )rV   r   r>   r   r   rW   rX   s         rY   rK   JanusConfig.__init__G  s    k4(((3g(NK%-k,.GHW;WD KKQR-g68D%566*  $[ 124 
  KKef!2!4Dt,,!2!C]!CD'899!.  $] 346 
 KK`a-/DN	4((-:	:DN	#344&N  $Y02 
 "&!3!3!E!E%)%7%7%B%BdFXFXFcFc%c","6"r[   )r   rS   r   r>   r   )NNNi )rd   re   rf   rg   rh   ri   r9   r;   ro   sub_configsrK   rk   rl   rm   s   @rY   r   r     s8    +Z J!*%K 6# 6#r[   r   c                   H    \ rS rSr% \\S'   SrSrSS/rSS/r	Sr
SrSrS	rS
rg)JanusPreTrainedModelr^   configmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFrI   N)rd   re   rf   rg   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignmentrk   rI   r[   rY   r   r     sB    &*#,.GH#4m"DN!(-%r[   r   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   d    \ rS rSr% SrSr\\R                     \	S'   Sr
\R                  \	S'   Srg)JanusVQVAEOutputi  z
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    Reconstructed pixel values after encoding and decoding the input.
embedding_loss (`torch.FloatTensor`):
    Embedding loss.
Ndecoded_pixel_valuesembedding_lossrI   )rd   re   rf   rg   rh   r   r   torchFloatTensorr   r   rk   rI   r[   rY   r   r     s/     9=(5#4#45<(,NE%%,r[   r   c                       \ rS rSrSrg)JanusBaseModelOutputWithPasti  rI   Nrd   re   rf   rg   rk   rI   r[   rY   r   r         r[   r   c                       \ rS rSrSrg)JanusCausalLMOutputWithPasti  rI   Nr   rI   r[   rY   r   r     r   r[   r   c                   V    \ rS rSrSS\R
                  S\S\R
                  4S jjrSrg)	JanusVisionEmbeddingsi  pixel_valuesinterpolate_pos_encodingreturnc                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )Ndtyper'   ru   )
shapepatch_embeddingweightr   toflatten	transposer   position_embeddingposition_ids)
rV   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedss
             rY   forwardJanusVisionEmbeddings.forward  s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
r[   rI   N)F)	rd   re   rf   rg   r   Tensorr   r   rk   rI   r[   rY   r   r     s,    ELL D ]b]i]i  r[   r   c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\   4S jjrS	rU =r$ )JanusVisionAttentioni  z(Attention Class for Janus Vision Encoderr   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fru   biasr   )rJ   rK   r   r@   rw   rB   	num_headshead_dimr   scalerF   rQ   rR   	is_causalnum_key_value_groupsr   LinearrN   q_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)rV   r   proj_dropoutqk_normrX   s       rY   rK   JanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r[   hidden_statesattention_maskrW   c                 <   UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	U4U R                  (       d  SOU R                   U R"                  U R$                  S.UD6u  pUR	                  XEU R&                  5      nU R)                  U5      nU R+                  U5      nX4$ )Nru   r'   eagerr_   )r   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr2   r   _attn_implementationr   trainingrF   r   r   rw   r   rQ   )rV   r   r   rW   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputs                 rY   r   JanusVisionAttention.forward  s    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0##r[   )rF   r   rw   r   r   r   r   r   r   rQ   r   r   r   r   r   N)rd   re   rf   rg   rh   r;   rK   r   r   r   r    r!   r   rk   rl   rm   s   @rY   r   r     sT    2Q0 Q@ 26)$||)$ !.)$ +,	)$ )$r[   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPi  r   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g r  )rJ   rK   r   r   r@   rM   rL   r   rH   activation_fnr   r   fc1fc2r   rO   dropout1dropout2rV   r   rX   s     rY   rK   JanusVisionMLP.__init__  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r[   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r  )r  r  r  r  r	  rV   r   s     rY   r   JanusVisionMLP.forward  sP    /**=9m4/m4r[   )r  r   r  r	  r  r  rL   )rd   re   rf   rg   r;   rK   r   r   r   rk   rl   rm   s   @rY   r  r    s0    ?0 ?U\\ ell  r[   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )r   i  r   c                 H  > [         TU ]  5         Xl        UR                  U l        [        U5      U l        [        R                  " U R                  UR                  S9U l
        [        R                  " U R                  UR                  S9U l        [        U5      U l        g )N)eps)rJ   rK   r   r@   rw   r   	self_attnr   r   rG   layer_norm1layer_norm2r  mlpr
  s     rY   rK    JanusVisionEncoderLayer.__init__  sr    ++-f5<<F<Q<QR<<F<Q<QR!&)r[   )r   rw   r  r  r  r  rd   re   rf   rg   r;   rK   rk   rl   rm   s   @rY   r   r     s    *0 * *r[   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionEncoderi$  r   c                    > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ sH  n[        U5      PM     sn5      U l        g s  snf r  )rJ   rK   r   
ModuleListrangerA   r   layersrV   r   r   rX   s      rY   rK   JanusVisionEncoder.__init__%  sF     mmeTZTlTlNm$nNm%<V%DNm$no$ns   A)r  r  rm   s   @rY   r  r  $  s    p0 p pr[   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionModeli*  r   c                 D   > [         TU ]  U5        [        U5      U l        g r  )rJ   rK   r  encoderr
  s     rY   rK   JanusVisionModel.__init__+  s     )&1r[   )r#  r  rm   s   @rY   r!  r!  *  s    20 2 2r[   r!  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPi0  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ sH.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nru   )rJ   rK   r   r   r@   rP   r  r  r  rT   hidden_layersr   rH   r  r  s      rY   rK   JanusVisionAlignerMLP.__init__1  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (4Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  r  r)  r  rV   r   layers      rY   r   JanusVisionAlignerMLP.forward:  B    /''E ..}=M!-0M ( r[   r  r  r)  )	rd   re   rf   rg   r;   rK   r   rk   rl   rm   s   @rY   r&  r&  0  s    70 7 r[   r&  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )JanusVQVAEVectorQuantizeriB  r   c                 N   > [         TU ]  U5        UR                  /S-  U l        g )Nr'   )rJ   rK   r{   quant_state_dimsr
  s     rY   rK   "JanusVQVAEVectorQuantizer.__init__C  s&     !'!3!3 4q 8r[   image_tokensr   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   r   r'   )pdimr
   ru   )	r   	embeddingr   F	normalizer   r6  permute
contiguous)rV   r8  r   emb_dimhidden_state_quants        rY   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entryG  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r[   )r6  )rd   re   rf   rg   ro   rK   r   
LongTensorr   rC  rk   rl   rm   s   @rY   r4  r4  B  s4    9/ 9"u/?/? "EDUDU " "r[   r4  c                       \ rS rSrSrg)JanusVQVAEResnetBlockiW  rI   Nr   rI   r[   rY   rG  rG  W  r   r[   rG  c                       \ rS rSrSrg)JanusVQVAEAttnBlocki[  rI   Nr   rI   r[   rY   rI  rI  [  r   r[   rI  c                       \ rS rSrSrg)JanusVQVAEConvDownsamplei_  rI   Nr   rI   r[   rY   rK  rK  _  r   r[   rK  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsampleic  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr
   ru   kernel_sizestridepadding)rJ   rK   r   r   Conv2dconv)rV   r|   rX   s     rY   rK   JanusVQVAEConvUpsample.__init__d  s,    HHOOK!TU_`Oa	r[   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factormode)r=  interpolaterT  r  s     rY   r   JanusVQVAEConvUpsample.forwardh  s(    m#IV		-0r[   )rT  )rd   re   rf   rg   rK   r   rk   rl   rm   s   @rY   rM  rM  c  s    b r[   rM  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlockin  r   channelsc                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr   r|   r}   )rJ   rK   rG  block_1rI  attn_1block_2)rV   r   r^  rX   s      rY   rK   JanusVQVAEMidBlock.__init__o  sF    , !

 *(3, !
r[   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )ra  rb  rc  r  s     rY   r   JanusVQVAEMidBlock.forward}  s2    ]3M2]3r[   )rb  ra  rc  )rd   re   rf   rg   ro   r   rK   r   r   r   rk   rl   rm   s   @rY   r]  r]  n  s7    
/ 
3 
U\\ ell  r[   r]  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )JanusVQVAEEncoderi  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  X2SSSS9U l        S[        U5      -   nXpl        [        R                   " 5       U l        [%        U R                  5       GH   n[        R                   " 5       n	[        R                   " 5       n
X'U   -  nX&U   -  n[%        U R
                  5       HM  nU	R'                  [)        UUUS95        UnXR                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [        R,                  " 5       nXl        Xl        XR                  S-
  :w  a  [3        U5      Ul        U R"                  R'                  U5        GM     [7        UW5      U l        [        R                  R;                  SUSSS	9U l        [        R                  R                  UU(       a  S
U-  OUSSSS9U l        g )Nr
   ru   rO  )ru   r`  rs   r`   T
num_groupsrC   r  affiner'   ) rJ   rK   lenr   num_resolutionsr   r~   r|   ry   rz   r   r   rS  conv_intuplein_channel_multiplierr  downr  appendrG  rI  ModuleblockattnrK  
downsampler]  mid	GroupNormnorm_outconv_out)rV   r   r~   r|   ry   rz   r   rq  i_levelru  rv  block_in	block_outi_blockrr  rX   s                  rY   rK   JanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
r[   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr   r   ru   )ro  r  rn  r   rr  ru  rm  rv  rs  rw  rx  rz  r   sigmoidr{  )rV   r   r   r|  r  hidden_statelast_hidden_states          rY   r   JanusVQVAEEncoder.forward  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  r[   )ro  r{  rr  rq  rx  rz  r   rn  )
rd   re   rf   rg   rK   r   rE  r   rk   rl   rm   s   @rY   rh  rh    s     1
f!E$4$4 ! !r[   rh  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderi  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nru   r
   rO  r`  r   rs   r`   Trj  )rJ   rK   rm  r   rn  r   r~   rz   r}   r   r   rS  ro  r]  rx  r  upreversedr  rs  rG  rI  rt  ru  rv  rM  upsamplery  rz  r{  )rV   r   r~   rz   r}   r}  r|  ru  rv  r~  r  r  rX   s               rY   rK   JanusVQVAEDecoder.__init__  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcr[   r  r   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nru   r   )ro  rx  r  rn  r   r  ru  rm  rv  r  rz  r   r  r{  )rV   r  r|  r  s       rY   r   JanusVQVAEDecoder.forward   s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2r[   )ro  r{  rx  rz  r   rn  r  )
rd   re   rf   rg   rK   r   r   r   rk   rl   rm   s   @rY   r  r    s.    ,d\E$5$5 %:K:K  r[   r  c                      ^  \ rS rSr/ SQrSrS\4U 4S jjrS\R                  S\R                  4S jr\\S\R                  S\\R                  \R                  4   4S	 j5       5       rS
rU =r$ )
JanusVQVAEi  )rI  rG  r4  r   r   c                 r   > [         TU ]  U5        [        U5      U l        SU l        U R                  5         g )NF)rJ   rK   r  decodergradient_checkpointing	post_initr
  s     rY   rK   JanusVQVAE.__init__  s0     (0&+# 	r[   r8  r   c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
ru   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizer6  r   rC  post_quant_convr  )rV   r8  codebook_entryr   r   s        rY   decodeJanusVQVAE.decode%  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r[   c                     UR                   S   nU R                  U5      u  p4nU R                  UR                  US5      5      n[	        Xd5      $ )Nr   r   )r   encoder  r   r   )rV   r   r   quantr   indicesr   s          rY   r   JanusVQVAE.forward8  sM     "''*
)-\)B&w#{{7<<
B+GH 4EEr[   )r  r  )rd   re   rf   rg   r   main_input_namero   rK   r   rE  r   r  r#   r"   rp  r   rk   rl   rm   s   @rY   r  r    s    
 %O/ 5#3#3 8I8I & F''F 
u  %"3"33	4F  Fr[   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPiE  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ sH.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf r(  )rJ   rK   r   r   rw   rP   r  r  r  rA   r)  r   rH   r  r  s      rY   rK   JanusVQVAEAlignerMLP.__init__F  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rr+  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  r-  r.  s      rY   r   JanusVQVAEAlignerMLP.forwardO  r1  r[   r2  )	rd   re   rf   rg   ro   rK   r   rk   rl   rm   s   @rY   r  r  E  s    7/ 7 r[   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadiW  zOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r  )rJ   rK   r   r   r   rP   proj_outr   rH   r  rx   vision_headr
  s     rY   rK   JanusVQVAEHead.__init__Z  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr[   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )r  r  r  r  s     rY   r   JanusVQVAEHead.forward`  s6    m4**=9((7r[   )r  r  r  )rd   re   rf   rg   rh   ro   rK   r   r   tensorr   rk   rl   rm   s   @rY   r  r  W  s5    YS/ SU\\ ell  r[   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                     ^  \ rS rSrS\4U 4S jjrS rS rS rS\	R                  S\	R                  S	\	R                  4S
 jr\\         SS\	R                  S\	R                  S\\	R                      S\\	R                     S\\   S\\	R                     S\\	R                     S\\   S\\\	R                   4   4S jj5       5       rSrU =r$ )
JanusModelig  r   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r   F)rJ   rK   r   r!  _from_configr>   vision_modelr&  alignerr  r   vqmodelr   	Embeddingrx   rw   generation_embeddingsr  generation_alignerr  generation_headr(   from_configr   language_modelr  r  r
  s     rY   rK   JanusModel.__init__m  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r[   c                 6    U R                   R                  5       $ r  )r  get_input_embeddingsrV   s    rY   r  JanusModel.get_input_embeddings  s    ""7799r[   c                 :    U R                   R                  U5        g r  )r  set_input_embeddingsrV   values     rY   r  JanusModel.set_input_embeddings  s    007r[   c                 ^    U R                  U5      nU R                  UR                  5      nU$ r  )r  r  r  )rV   r   image_embedss      rY   get_image_featuresJanusModel.get_image_features  s,    ((6||L$B$BCr[   	input_idsinputs_embedsimage_featuresc           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nX$   R                  5       UR                  5       :w  a0  UR                  S   UR                  S   -  n[        SU SU 35      eU$ )z
Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   devicer   r   ru   z6Image features and image tokens do not match: tokens: z, features )r  r   r  r   r   longr  allsum	unsqueeze	expand_asr   numelr   r   )rV   r  r  r  special_image_maskn_image_tokensn_image_featuress          rY   get_placeholder_maskJanusModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno,2248L8L8NN-33A69M9Ma9PPHHXXcdtcuv  "!r[   r   r   r   r   cache_position	use_cachelogits_to_keepc
                    US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbw  U R                  U5      nUR                  SUR                  S   5      nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " SUUUUUUU	S.U
D6n[        UR                  UR                  UR                  UR                  Ub  WS9$ S S9$ )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   )r  r  )r  r   r   r   r  r  r  )r  r   r   
attentionsimage_hidden_statesrI   )r   r  r  r   r   r   r  r   r  masked_scatterr  r   r  r   r   r  )rV   r  r   r   r   r   r  r  r  r  rW   r  r  image_attention_mask	lm_outputs                  rY   r   JanusModel.forward  s@    -t";<s    557	BM#22<@L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M^M'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
r[   )	r  r   r  r  r  r  r  r  r  )	NNNNNNNNr   )rd   re   rf   rg   r   rK   r  r  r  r   rE  r   r  r#   r"   r   r   r   r   r   r   r   rk   rl   rm   s   @rY   r  r  g  s*   { *:8
"))":?:K:K"]b]n]n"0  '+*.1537+/5959$(34.
##.
 ''.
 !.	.

 u//0.
 "%.
 !!1!12.
   1 12.
 D>.
 c5<</0.
  .
r[   r  c                     ^  \ rS rSrSS/rSrS\4U 4S jjrS rS r	S	\
R                  S
\
R                  4S jrS rS r\\          S S\
R"                  S\
R$                  S\\
R                     S\\
R"                     S\\   S\\
R"                     S\\
R$                     S\\
R"                     S\\   S\\\
R                  4   S\\   4S jj5       5       r      S!U 4S jjrS\
R                  4S jr\
R:                     S"S	\
R                  S\\
R"                     S\\   4U 4S jjj5       rSr U =r!$ )#JanusForConditionalGenerationi  z(model.language_model.embed_tokens.weightzlm_head.weightTr   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )rJ   rK   r   r  r   r   r   r   r@   
vocab_sizelm_headr  r
  s     rY   rK   &JanusForConditionalGeneration.__init__  sZ     '
yy!3!3!?!?ASASA^A^ejk 	r[   c                 J    U R                   R                  R                  5       $ r  )r   r  r  r  s    rY   r  2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??r[   c                 N    U R                   R                  R                  U5        g r  )r   r  r  r  s     rY   r  2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=r[   inputsr   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r  )r   r  r  )rV   r  r  s      rY   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\Br[   c                     Xl         g r  r   )rV   r  s     rY   set_decoder)JanusForConditionalGeneration.set_decoder  s    
r[   c                     U R                   $ r  r  r  s    rY   get_decoder)JanusForConditionalGeneration.get_decoder  s    zzr[   r  r   r   r   r   r  r  labelsr  r  rW   c                    U R                   " SUUUUUUU	US.UD6nUR                  n[        U
[        5      (       a  [	        U
* S5      OU
nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
)r  r   r   r   r   r  r  r  N)logitsr  r  )lossr  r   r   r  r  rI   )r   r  r   r   slicer  loss_functionr   r   r  r   r   r   r  r  )rV   r  r   r   r   r   r  r  r  r  r  rW   outputsr   slice_indicesr  r  s                    rY   r   %JanusForConditionalGeneration.forward  s    , ** 

%)%+')

 

  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
r[   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  X)S'   U	$ )N)r   r  r   r  r  r   r   )rJ   prepare_inputs_for_generation)rV   r  r   r   r   r  r  r  rW   model_inputsrX   s             rY   r  ;JanusForConditionalGeneration.prepare_inputs_for_generation+  sR     w<
+')))
 
 !!+7(r[   r8  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r'   r
   ru   )r   r  r  r?  )rV   r8  decoded_images      rY   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokensI  s:     

**11,?%--aAq9r[   logits_processorc           	      N  > UR                  SU R                  5      n[        R                  " U5      nUR                  SS5      nUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR                  " S0 UD6nUR                  5       [        R                  [        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [         R#                  S5        S	Ul        UR                  US
'   U R%                  XR&                  U5      u  pnUR(                  UR*                  p[-        UR.                  5      S:w  a  [        SUR.                   S35      eUS LnU R1                  X\UR*                  S9  UR                  (       a;  UR                  S:  a+  UR3                  [5        UR                  5      5        S Ul        U R7                  UUR.                  S   US UUS9nU R8                  " SUUUR:                  S.UD6u  pU R<                  R>                  R@                  RB                  nUR.                  u  pURE                  SS5      nUR                  SS 5      nURE                  SS5      nX'S'   UUS 2S S 24   UR&                  :g  UUS 2S S 24   URF                  S   :g  -  nUUS 2S S 24   RI                  UURJ                  5        U RM                  5       " U5      nU RO                  XU5      nURQ                  SS 5      cB  U RS                  URT                  =(       d    SUS-  [W        URX                  X-   5      UUS9US'   [Z        R\                  " X4XS9nUR^                  nUR`                  nURb                  nURd                  nURf                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[i        U5       GHy  nU Rj                  " SUUS.UD6nUS   Rm                  UR*                  5      US'   US   Rm                  UR*                  5      US'   U R<                  Rn                  " S0 UDUUS.D6nU Rq                  UU5      nURr                  S S 2SS S 24   Ru                  5       n U R<                  Rw                  U 5      n!U" UU!5      n"URx                  (       a:  [Z        Rz                  " U"SS9n#[Z        R|                  " U#SS9R                  S5      n$O[Z        R                  " U"SS9n$U$US S 2U4'   [Z        R                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGM|     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r'   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  ru   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr   max_cache_lenr  model_kwargsr  rI   )r  r  r  )output_attentionsoutput_hidden_statesr   )r;  )num_samples)	sequencesscoresr  r  r   r   )Ipopr  copydeepcopyrJ   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  r   warning_prepare_model_inputsbos_token_idr   r  rm  r   _prepare_special_tokensrs  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   r  r   rU   repeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionr   
_get_cacher  max
max_lengthr   zerosr  r  output_scoresoutput_logitsreturn_dict_in_generater  r  r   r  #_update_model_kwargs_for_generationr  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r  r   r  r   r   r   )&rV   r  r   r  rW   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskrU   r   r   input_tokensmaskr  generated_tokensr  r  r<  r=  r>  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r  r!  next_token_scoresprobs
next_tokenrX   s&                                        rY   r%  &JanusForConditionalGeneration.generateU  sP    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   )//9&9 002>;P;PR`RnRn:ool  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@P@Z[) /> /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG )J #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r[   )r   r  r   )
NNNNNNNNNr   )NNNNNN)NNN)"rd   re   rf   rg   _tied_weights_keysr   r   rK   r  r  r   r   r  r  r  r#   r"   rE  r   r   r   r   r   r   r    r!   r   r  r  no_gradr   r%  rk   rl   rm   s   @rY   r  r    s   DFVW!{ @>ell u|| 
  '+*.1537+/5959-1$(341
##1
 ''1
 !.	1

 u//01
 "%1
 !!1!121
   1 121
 ))*1
 D>1
 c5<</01
 +,1
  1
l <
 
 ]]  $59:>	}$}$ !!1!12}$ ##67	}$ }$r[   r  c                     ^  \ rS rSrSrSSS\R                  SSSSSS4
S\S\\	\
\4      S	\S
\S\S\\\4   S\S\\\\\   4      S\\\\\   4      S\\   4U 4S jjjr   SS\R"                  S\\\\\\4   4   S\\\
\4      S\\\
\4      S\R(                  4
S jjrS\R                  SS4S\R"                  S\\	\
\4   \4   S\\\\\4      S
\S\\\
\4      S\\\
\4      S\R"                  4S jjr       SS\S\\   S\\   S\\   S\\\      S\\\      S\\
   S\\
   4S jjr S S\R(                  S\\\\   4   S\\\\   4   S\\\
\4      S\R(                  4
S jjrSrU =r$ )!JanusImageProcessori  a
  
Constructs a JANUS image processor.

Args:
    do_resize (`bool`, *optional*, defaults to `True`):
        Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
        `do_resize` parameter in the `preprocess` method.
    size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
        Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
        method.
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
        overridden by the `resample` parameter in the `preprocess` method.
    do_rescale (`bool`, *optional*, defaults to `True`):
        Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
        `do_rescale` parameter in the `preprocess` method.
    rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
        Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
        overridden by the `rescale_factor` parameter in the `preprocess` method.
    do_normalize (`bool`, *optional*, defaults to `True`):
        Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
        method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
    image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
        Mean to use if normalizing the image. This is a float or list of floats the length of the number of
        channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
        overridden by the `image_mean` parameter in the `preprocess` method.
    image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
        Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
        number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        Can be overridden by the `image_std` parameter in the `preprocess` method.
    do_convert_rgb (`bool`, *optional*, defaults to `True`):
        Whether to convert the image to RGB.
TN   gp?	do_resizer   min_sizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbc           	         > [         TU ]  " S0 UD6  X0l        Uc  SU l        g [	        U Vs/ sH  n[        US-  5      PM     sn5      U l        g s  snf )N)   rd  rd     rI   )rJ   rK   r[  background_colorrp  r   )rV   rZ  r   r[  r\  r]  r^  r_  r`  ra  rb  rW   xrX   s                rY   rK   JanusImageProcessor.__init__<  sO     	"6" $3D!$)*LA3q3w<*L$MD!*Ls   Aimagerf  data_formatinput_data_formatr   c                 6   [        X5      u  pVU[        R                  :X  a  UR                  S   OUR                  S   nXV:X  a  Ub  [	        XU5      nU$ UnU$ [        XV5      n[        U[        5      (       a  U/nO[        U5      U:w  a  [        SU S35      eU[        R                  :X  av  [        R                  " XxU4UR                  S9n	[        U5       H  u  pXU
SS2SS24'   M     Xe:  a  X-
  S-  nXSS2XU-   2SS24'   U	$ X-
  S-  nXSS2SS2XU-   24'    U	$ [        R                  " XU4UR                  S9n	[        U5       H  u  pXSS2SS2U
4'   M     Xe:  a  X-
  S-  nXXU-   2SS2SS24'   U	$ X-
  S-  nXSS2XU-   2SS24'   U	$ )a  
Pads an image to a square based on the longest edge.

Args:
    image (`np.ndarray`):
        The image to pad.
    background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
        The color to use for the padding. Can be an integer for single channel or a
        tuple of integers representing for multi-channel images. If passed as integer
        in mutli-channel mode, it will default to `0` in subsequent channels.
    data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        If unset, will use same as the input image.
    input_data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format for the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

Returns:
    `np.ndarray`: The padded image.
r   r   Nz(background_color must have no more than z) elements to match the number of channelsr   r'   )r   r   FIRSTr   r   r9  r   r   rm  r   npr;  r   	enumerate)rV   ri  rf  rj  rk  r   r   rC   max_dimresultrP  colorstarts                rY   pad_to_square!JanusImageProcessor.pad_to_squareR  s   < 'u@):>N>T>T)Tu{{1~Z_ZeZefhZi? * ,E@QR 
 L  
 Lf$ &,, 01!"l2:<.Hqr   0 6 66XX|g>ekkRF%&67"'q!Qw 8~ )a/7<q%&.0!34  !Q.6;q!UU]223  XXw>ekkRF%&67"'q!Qw 8~ )a/7<uv~-q!34
  !Q.6;q%%-/23r[   c                    Ub  UOU R                   nUc  [        U5      n[        X5      u  p[        X5      n
[	        USS9nUS   US   :w  a  [        SUS    SUS    35      eUS   nX*-  n[        [        X-  5      U R                  5      [        [        X-  5      U R                  5      /n[        U4UUUUS.UD6nU R                  UUUS9nU$ )	a  
Resize an image to dynamically calculated size.

Args:
    image (`np.ndarray`):
        Image to resize.
    size (`dict[str, int]` or `int`):
        The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
    background_color (`tuple[int, int, int]`):
        The background color to use for the padding.
    resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
        `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
    data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the output image. If unset, the channel dimension format of the input
        image is used. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `None`: will be inferred from input
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

Returns:
    `np.ndarray`: The resized image.
T)default_to_squarer   r   z5Output height and width must be the same. Got height=z and width=)r   r\  rj  rk  )ri  rf  rk  )
rf  r   r   r9  r   r   r   r[  r   rt  )rV   ri  r   rf  r\  rj  rk  rW   r   r   max_sizedeltaoutput_size_nonpaddeds                rY   r   JanusImageProcessor.resize  s    L 0@/K+QUQfQf$ >u E&u@v%TT:>T']*GXGWWbcghocpbqr  H~ FN#T]]3EM"DMM2!

 
&#/
 
 ""-/ # 

 r[   imagesreturn_tensorsc	                 <   Ub  UOU R                   nUc  SU R                  -  OUnUb  UOU R                  nUb  UOU R                  nUb  UOU R                  n[        U5      n[        US   [        R                  R                  5      (       a  [        U5      S:  a  U$ US   $ Uc  [        US   5      n/ n	U H  n
[        U
5      n
U(       a  U R                  XXgS9n
U(       a?  U R                  XUS9n
U
R                  SS5      R                  [         R"                  5      n
U(       aE  U(       a>  US:X  a8  [%        U
[&        R(                  US	9n
[        R                  R+                  U
5      n
U	R-                  U
5        M     S
U	0nUS:w  a  UOSn[/        XS9$ )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Ng      ?r   ru   )ri  r`  ra  rk  )r   rk  re  zPIL.Image.Image)input_channel_dimr   )datatensor_type)r]  r^  r_  r`  ra  r   r   PILImagerm  r   r   unnormalizerescaleclipastypern  uint8r   r   LAST	fromarrayrs  r   )rV   r|  r]  r^  r_  r`  ra  rk  r}  r   ri  r  s               rY   postprocessJanusImageProcessor.postprocess  s    $.#9Zt
6D6Lt222R`'3'?|TEVEV#-#9Zt
!*!6IDNN	$V,fQi11 [1_6;&);$ >vay IE"5)E(() )  UTef

1c*11"((;
~AR/R3E;K;P;Pduv		++E2&! $ -+9=N+NTXBBr[   c                    Sn[        U[        5      (       a*  [        U5      U:w  a  [        SU S[        U5       35      eOU/U-  n[        U[        5      (       a*  [        U5      U:w  a  [        SU S[        U5       35      eOU/U-  n[	        S [        X#5       5       5      n[	        S U 5       5      nU R                  XXtS9nU$ )a  
Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
image = (image * image_std) + image_mean
Args:
    image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
        Batch of pixel values to postprocess.
    image_mean (`float` or `Iterable[float]`):
        The mean to use for unnormalization.
    image_std (`float` or `Iterable[float]`):
        The standard deviation to use for unnormalization.
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
r
   zmean must have z$ elements if it is an iterable, got zstd must have c              3   0   #    U H  u  pU* U-  v   M     g 7fr  rI   ).0meanstds      rY   	<genexpr>2JanusImageProcessor.unnormalize.<locals>.<genexpr>@  s     W<Vytus{<Vs   c              3   *   #    U H
  nS U-  v   M     g7f)ru   NrI   )r  r  s     rY   r  r  A  s     ;#a#gs   )ri  r  r  rk  )r   r   rm  r   rp  zipr>  )rV   ri  r`  ra  rk  rC   rev_image_meanrev_image_stds           rY   r  JanusImageProcessor.unnormalize  s    0 j(++:,. ?<.@dehisetdu!vww / %4Ji**9~- >,?cdghqdrcs!tuu . #l2IWC
<VWW;;;-  
 r[   )rf  r[  )r   NN)NNNNNNNr  )rd   re   rf   rg   rh   r   BICUBICr   r   r   strr   r   r   r   rK   rn  ndarrayrp  r   arrayrt  r   r   r  r   r  rk   rl   rm   s   @rY   rX  rX    s+   #N )-'9'A'A,3!:>9=)-NN tCH~&N 	N
 %N N c5j)N N U5$u+#567N E%e"456N !N N2 >?>BDHHzzH  U3S=%9 9:H eC)9$9:;	H
 $E#/?*?$@AH 
H\ <@'9'A'A>BDHIzzI DcNC'(I #5c3#78	I
 %I eC)9$9:;I $E#/?*?$@AI 
I\ &**.'+,0+/+/(,1C1C TN1C !	1C
 tn1C T%[)1C DK(1C $C=1C !1Cp EI+xx+ %%01+ /0	+
 $E#/?*?$@A+ 
+ +r[   rX  )	rX  r   r  r  r  r!  ro   r;   r   )xr#  collections.abcr   dataclassesr   typingr   r   r   numpyrn  r   r   .transformers.models.blip.image_processing_blipr	   activationsr   cache_utilsr   
generationr   r   r   r   generation.utilsr   image_processing_utilsr   r   image_transformsr   r   image_utilsr   r   r   r   r   r   r   modeling_outputsr   modeling_utilsr   r   processing_utilsr    utilsr!   r"   r#   r$   r%   r&   autor(   blip_2.modeling_blip_2r)   !chameleon.configuration_chameleonr*   chameleon.modeling_chameleonr+   r,   r-   r.   r/   idefics.modeling_ideficsr0   r1   llama.modeling_llamar2   siglip.configuration_siglipr3   siglip.modeling_siglipr4   r5   r6   torch.nntorch.nn.functional
functionalr=  torch.utils.checkpointr  configuration_utilsr7   r8   r9   
get_loggerrd   r   r;   ro   r   r   r   r   r   r   rt  r   r  r   r  r!  r&  r4  rG  rI  rK  rM  r]  rh  r  r  r  r  r  r  rX  __all__rI   r[   rY   <module>r     s     $ ! , ,    M !   u u 9 A C   , F &   5 D  e : < ^ ^ ##! 3 - 
		H	%
^1* ^1BW+ Wtk#" k#\ 
.? 
. 
. 
	-{ 	- 	-	#A 		"? 	2 "I$299 I$XRYY (*0 *p p2' 2BII $" = "*	< 		8 		B 	RYY  ,J!		 J!ZA		 AH-F -F`299 $RYY   
i
% i

i
X{$$8/ {$|	o, od	
r[   