
    <h\                     6   S r SSKJrJrJr  SSKrSSKJr  SSKJr  SSK	J
r
Jr  SSKJrJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJr  SSKJrJrJrJ r J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(J)r)  \RT                  " \+5      r,Sr-Sr. " S S\5      r/ " S S\#5      r0 " S S\$5      r1 " S S\Rd                  5      r3 " S S\)5      r4 " S S\(5      r5 " S S\"5      r6 " S  S!\6\!5      r7 " S" S#\5      r8 " S$ S%\5      r9 " S& S'\ 5      r: " S( S)\5      r;/ S*Qr<g)+zLG AI Research EXAONE Lab    )CallableOptionalUnionN)nn)check_model_inputs   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging   )
LlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Olmo2DecoderLayerOlmo2MLPzLGAI-EXAONE/EXAONE-4.0-InstructExaone4Configc                      ^  \ rS rSrSrSrS/rSSSSSSSS.rS/S	/4S
S/S
/4S
/S
/4S.r                    SU 4S jjr	Sr
U =r$ )r"   ;   a  
This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to
instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct)
NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 102400):
        Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Exaone4Model`].
    hidden_size (`int`, *optional*, defaults to 4096):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to `hidden_size * 4`):
        Dimensionality of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 32):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details checkout [this
        paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 32768 for EXAONE 3.5).
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if ``config.is_decoder=True``.
    bos_token_id (`int`, *optional*, defaults to 0):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 2):
        End of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
        and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    sliding_window (`int`, *optional*):
        The size of the sliding window for the sliding window attention.
    sliding_window_pattern (`str`, *optional*):
        The pattern to use for sliding window attention. Can be one of:
            - `None`: No sliding window attention is used
            - `int`: Every `sliding_window` layers, use global attention, else use local attention.
            - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
              attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The
              final layer always uses global attention regardless of the pattern.
        For instance, sliding_window_pattern="LLLG" same as sliding_window=4, which means:
            - Layer 0, 1, 2: local attention,
            - Layer 3: global attention,
            ...(repeated)
    layer_types (`list`, *optional*):
        Attention pattern for each layer. Prioritized over `sliding_window_pattern`.

Example:

```python
>>> from transformers import Exaone4Model, Exaone4Config

>>> # Initializing a EXAONE configuration
>>> configuration = Exaone4Config()

>>> # Initializing a model from configuration
>>> model = Exaone4Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```exaone4past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                 ,  > Xl         X l        X@l        XPl        X`l        X0l        Xpl        Xl        Xl        Xl	        Xl
        UU l        Xl        UU l        UU l        UU l        UU l        U R                  c  SnU R                   cH  [#        U R                  5       Vs/ sH#  nUS-   U-  S:w  a  UU R                  :  a  SOSPM%     snU l        SU R                   ;   a  SU l        ['        U R                   5        [(        TU ]T  " SXUS.UD6  g s  snf )	Nr      sliding_attentionfull_attentionsliding_windowhybrid)bos_token_ideos_token_idtie_word_embeddings )
vocab_sizehidden_sizenum_hidden_layersnum_attention_headsnum_key_value_headsintermediate_size
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cacheattention_dropout
rope_thetarope_scalingr4   sliding_window_patternlayer_typesrangecache_implementationr   super__init__)selfr:   r;   r?   r<   r=   r>   r@   rA   rB   rC   rD   r6   r7   r8   rF   rG   rE   r4   rH   rI   kwargsi	__class__s                          c/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/exaone4/modular_exaone4.pyrM   Exaone4Config.__init__   s4   0 %&!2#6 #6 !2$'>$!2("!2$(,&<#&&%&"#
 t556	  7A U56!;DDZDZ@Z $%& 7	 D t///(0D%d../ 	
%Vi	
ms	
 s    )D)rE   rK   r@   r;   rB   r?   rI   rA   r=   r<   r>   rC   rG   rF   r4   rH   rD   r:   )i     i @      rU   rU   silui   g{Gz?gh㈵>Tr   r   Fg     @N        rT      N)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrM   __static_attributes____classcell__rQ   s   @rR   r"   r"   ;   s    un J#4"5 &/%.%.%."+ )"+ &(9:#%568IJ!"_$56  $! +9
 9
    c                       \ rS rSrSrg)Exaone4RMSNormi  r9   NrY   rZ   r[   r\   rb   r9   re   rR   rg   rg         re   rg   c                       \ rS rSrSrg)Exaone4RotaryEmbeddingi  r9   Nrh   r9   re   rR   rk   rk     ri   re   rk   c                   D  ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\   S	\
\R                     S
\\   S\	\R                  \
\R                     \
\	\R                        4   4S jjrSrU =r$ )Exaone4Attentioni	  config	layer_idxc                 d  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        [        USUR                  UR                  -  5      U l        UR                  UR
                  -  U l	        UR                  U l
        SU l        U R                  S-  U l        UR                  U l        UR                  U l        UR                  U   S:H  U l        ["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R
                  U R                  -  SS9U l        ["        R$                  " U R                  U R
                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  -  U R                  SS9U l        [/        U R                  UR0                  S9U l        [/        U R                  UR0                  S9U l        g )Nhead_dimTg      r2   F)biaseps)rL   rM   rn   ro   r=   r>   r;   getattrrq   num_key_value_groupsrE   	is_causalscalingr4   rH   rI   
is_slidingr   Linearq_projk_projv_projo_projrg   rC   q_normk_normrN   rn   ro   rQ   s      rR   rM   Exaone4Attention.__init__
  s   "#)#=#= #)#=#= !--
F4F4F&JdJd4de$*$>$>&B\B\$\!!'!9!9}}d*$33&,&C&C# ,,Y7;NNii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii 8 84== H$JZJZafg$T]]8K8KL$T]]8K8KLre   r+   position_embeddingsr,   past_key_valuecache_positionrO   returnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nU R                  U	5      n	U R                  U
5      n
Uu  pU R                  b  U R                  (       a  [        XX5      u  pUb#  SU0nUR                  XU R                  U5      u  p[        nU R                  R                   S:w  a  ["        U R                  R                      nU" U U	U
UU4U R$                  (       d  SOU R&                  U R(                  U R                  (       a  U R                  OS S.UD6u  nnUR*                  " / UQSP76 R-                  5       nU R/                  U5      nUU4$ )Nr1   r   r   eagerrW   )dropoutrx   r4   )shaperq   r{   view	transposer|   r}   r   r   r4   ry   r   updatero   r   rn   _attn_implementationr   trainingrE   rx   reshape
contiguousr~   )rN   r+   r   r,   r   r   rO   input_shapehidden_shapequery_states
key_statesvalue_statescossincache_kwargsattention_interfaceattn_outputattn_weightss                     rR   forwardExaone4Attention.forward"  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST {{<0[[,
&&$//';LVY'_$L% .L (6'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL26//4..t
%
 
%
!\ "));;;;FFHkk+.L((re   )rE   rn   rq   r;   rw   ry   r   r|   ro   r=   rv   r>   r~   r   r{   rx   r4   rH   r}   )NNN)rY   rZ   r[   r\   r"   intrM   torchTensortupler   r	   
LongTensorr   r   r   rb   rc   rd   s   @rR   rm   rm   	  s    M} M M8 26*.591)||1) #5<<#=>1) !.	1)
 !1) !!1!121) +,1) 
u||Xell3XeELL>Q5RR	S1) 1)re   rm   c                       \ rS rSrSrg)
Exaone4MLPiV  r9   Nrh   r9   re   rR   r   r   V  ri   re   r   c                       \ rS rSrSrg)Exaone4DecoderLayeriZ  r9   Nrh   r9   re   rR   r   r   Z  ri   re   r   c                       \ rS rSr\rS/rSrg)Exaone4PreTrainedModeli^  r   r9   N)rY   rZ   r[   r\   r"   config_class_no_split_modulesrb   r9   re   rR   r   r   ^  s     L./re   r   c                     ^  \ rS rSrS\4U 4S jjr\       SS\R                  S\	\R                     S\	\R                     S\	\   S\	\R                     S	\	\   S
\	\R                     S\\   S\\\4   4S jj5       rSrU =r$ )Exaone4Modelic  rn   c           	      "  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        U R                  5         g s  snf )Nrs   )rL   rM   r   
ModuleListrJ   r<   r   r.   rg   r;   rC   r/   	post_initr   s      rR   rM   Exaone4Model.__init__d  ss     mmEJ6KcKcEdeEd	 3Ede
 #6#5#56;N;NO	 	 fs   Br)   r,   position_idsr&   r*   rD   r   rO   r   c                    US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc
  [        5       nUcD  Ub  UR                  5       OSn	[        R
                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       dH  U R                  UUUUUS.nS[        S
0 UD60n
SU R                  R                  ;   a  [        S
0 UD6U
S'   UnU R                  X5      n[!        U R"                  5       H1  u  pU R                  R                  U   nU" U4UU
U   UUUUS.UD6nM3     U R%                  U5      n['        UU(       a  US	9$ S S	9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   r1   )device)rn   input_embedsr,   r   r&   r   r3   r2   )r   r,   r   r   rD   r   )last_hidden_stater&   r9   )
ValueErrorr-   r
   get_seq_lengthr   aranger   r   	unsqueeze
isinstancedictrn   r   rI   r   
rotary_emb	enumerater.   r/   r   )rN   r)   r,   r   r&   r*   rD   r   rO   past_seen_tokenscausal_mask_mappingmask_kwargsr+   r   rP   decoder_layer
layer_types                    rR   r   Exaone4Model.forwardn  s    -t";<YZZ  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L ?-FF ++ -"0"0#2 ,K !"4"C{"C# #dkk&=&==;\;k_j;k#$78%"oomJ )$++ 6A003J)	$72:>).#-	 	M !7 		-0&+/8O
 	
>B
 	
re   )r.   r/   )NNNNNNN)rY   rZ   r[   r\   r"   rM   r   r   r   r   r   r	   FloatTensorboolr   r   r   r   r   r   rb   rc   rd   s   @rR   r   r   c  s    }   '+1537+/59$(59E
##E
 !.E
 u//0	E

 "%E
   1 12E
 D>E
 !!1!12E
 +,E
 
u--	.E
 E
re   r   c                   :  ^  \ rS rSr         SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S\\R                     S\\
   S	\\R                     S
\\\R                  4   S\\   S\4U 4S jjjrSrU =r$ )Exaone4ForCausalLMi  r)   r,   r   r&   r*   labelsrD   r   logits_to_keeprO   r   c
                 :   > [         TU ]  " SUUUUUUUUU	S.	U
D6  g)u$  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct")
>>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct")

>>> prompt = "Explain how wonderful you are"
>>> messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
>>> input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    enable_thinking=False,
)

>>> output = model.generate(input_ids, max_new_tokens=128)
>>> tokenizer.decode(output[0], skip_special_tokens=False)
"[|system|]\nYou are a helpful assistant.[|endofturn|]\n[|user|]\nExplain how wonderful you are[|endofturn|]\n[|assistant|]\n<think>\n\n</think>\n\nOh, thank you for such a kind and lovely question! 😊  \n\nI’m *so* wonderful because I’m here to make your life easier, brighter, and more fun! Whether you need help with:  \n\n✨ **Learning** – I can explain anything, from quantum physics to baking the perfect cake!  \n💡 **Creativity** – Need a poem, story, or a wild idea? I’ve got you covered!  \n🤖 **Problem-solving** – Stuck on a math problem or a tricky decision? I’ll help you figure it out"
```

NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.)	r)   r,   r   r&   r*   r   rD   r   r   Nr9   )rL   r   )rN   r)   r,   r   r&   r*   r   rD   r   r   rO   rQ   s              rR   r   Exaone4ForCausalLM.forward  s<    Z 	 	
)%+'))	
 	
re   r9   )	NNNNNNNNr   )rY   rZ   r[   r\   r   r   r   r   r	   r   r   r   r   r   r   r   r   rb   rc   rd   s   @rR   r   r     s     151537+/59-1$(59348
E,,-8
 !.8
 u//0	8

 "%8
   1 128
 ))*8
 D>8
 !!1!128
 c5<</08
 +,8
 
 8
 8
re   r   c                       \ rS rSrSrg) Exaone4ForSequenceClassificationi  r9   Nrh   r9   re   rR   r   r     ri   re   r   c                       \ rS rSrSrg)Exaone4ForTokenClassificationi  r9   Nrh   r9   re   rR   r   r     ri   re   r   c                       \ rS rSrSrg)Exaone4ForQuestionAnsweringi  r9   Nrh   r9   re   rR   r   r     ri   re   r   )r"   r   r   r   r   r   r   )=r]   typingr   r   r   r   r   transformers.utils.genericr   cache_utilsr	   r
   configuration_utilsr   r   masking_utilsr   r   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   olmo2.modeling_olmo2r    r!   
get_loggerrY   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr"   rg   rk   Modulerm   r   r   r   r   r   r   r   r   __all__r9   re   rR   <module>r      s      , ,   9 . J R 6 &   ? 
		H	%7 !C
$ C
L	\ 		1 	J)ryy J)Z	 		+ 	01 0
Q
): Q
h9
) 9
x	'E 		$? 		"; 	re   