import warnings
from typing import Callable, Optional

import torch
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import PretrainedConfig, layer_type_validation
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ..cohere.modeling_cohere import (
    CohereAttention,
    CohereDecoderLayer,
    CohereForCausalLM,
    CohereLayerNorm,
    CoherePreTrainedModel,
    CohereRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..gemma2.modeling_gemma2 import Gemma2Model


logger = logging.get_logger(__name__)


class Cohere2Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`Cohere2Model`]. It is used to instantiate a Cohere2
model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


Args:
    vocab_size (`int`, *optional*, defaults to 256000):
        Vocabulary size of the Cohere2 model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Cohere2Model`]
    hidden_size (`int`, *optional*, defaults to 8192):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 22528):
        Dimension of the MLP representations.
    logit_scale (`float`, *optional*, defaults to 0.0625):
        The scaling factor for the output logits.
    num_hidden_layers (`int`, *optional*, defaults to 40):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 64):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 5):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 255001):
        End of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and you expect the model to work on a longer `max_position_embeddings`, we recommend you update this value
        accordingly (see the example after the argument list).
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    sliding_window (`int`, *optional*, defaults to 4096):
        Size of the sliding window attention context.
    layer_types (`list`, *optional*):
        Attention pattern for each layer.
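
As a concrete illustration, `rope_scaling` is passed as a plain dict whose keys are described above. The
values below are hypothetical and only show the expected shape:

```python
>>> # Illustrative values only: linear RoPE scaling with a 2x factor
>>> from transformers import Cohere2Config
>>> config = Cohere2Config(rope_scaling={"rope_type": "linear", "factor": 2.0})  # doctest: +SKIP
```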

```python
>>> from transformers import Cohere2Model, Cohere2Config

>>> # Initializing a Cohere2 model configuration
>>> configuration = Cohere2Config()

>>> # Initializing a model from the Cohere2 configuration
>>> model = Cohere2Model(configuration) # doctest: +SKIP

>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
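
When `layer_types` is not passed, the default pattern follows `sliding_window_pattern` (4): every 4th
layer uses full attention and the rest use sliding-window attention. A minimal sketch:

```python
>>> config = Cohere2Config(num_hidden_layers=8)
>>> config.layer_types  # doctest: +SKIP
['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention', 'sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
```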
cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                 ~  > Xl         Xl        X l        X@l        X0l        XPl        X`l        Uc  UnXpl        Xl        Xl	        Xl
        Xl        UU l        UU l        UU l        UU l        UU l        UU l        X&-  U l        ['        U 5        [(        TU ]T  " SUUUUS.UD6  UR-                  SS5      U l        U R"                  c^  [1        U SS5      U l        [3        U R
                  5       Vs/ sH'  n[5        US-   U R.                  -  5      (       a  SOSPM)     snU l        [7        U R"                  5        g s  snf )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_window_pattern      sliding_attentionfull_attention )
vocab_sizemax_position_embeddingshidden_sizelogit_scaleintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actinitializer_rangelayer_norm_eps	use_cache
rope_thetarope_scalingattention_biasattention_dropoutsliding_windowlayer_typeshead_dimr   super__init__get_sliding_window_patterngetattrrangeboolr	   )selfr6   r8   r:   r9   r;   r<   r=   r>   r7   r?   r@   rA   r,   r-   r.   r/   rB   rC   rD   rE   rF   rG   kwargsi	__class__s                            c/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/cohere2/modular_cohere2.pyrJ   Cohere2Config.__init__   sZ   4 %'>$&&!2!2#6  &"5#6 $!2,"$(,!2,&#: 	t$ 	
%%% 3		

 	
 (.zz2JA'N$#+249QST+UD( t556 6A (,QUd6R6R,R'S'S#Yii6 D 	d../	 s   /-D:c                 P    [         R                  " S[        5        U R                  $ )NzTThe `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.)warningswarnFutureWarningrL   )rP   s    rT   r0   $Cohere2Config.sliding_window_pattern   s"    b	
 +++    c                     Xl         g N)rL   )rP   values     rT   r0   rZ     s    ',$r[   )rL   rD   rE   rH   r>   r8   r?   r:   r@   rG   r9   r7   r<   r;   r=   rC   rB   rF   rA   r6   )i      i X  g      ?(   @   Nsilur_   g{Gz?gh㈵>Tr      i Tg     @NF        i   N)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrJ   propertyr0   setter__static_attributes____classcell__rS   s   @rT   r   r   /   s    n` J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56   $ /I0V , , ""- #-r[   r   c                       \ rS rSrSrg)Cohere2RotaryEmbeddingi	  r5   Nre   rf   rg   rh   rp   r5   r[   rT   rt   rt   	      r[   rt   c                       \ rS rSrSrg)Cohere2LayerNormi  r5   Nru   r5   r[   rT   rx   rx     rv   r[   rx   c                   B   \ rS rSrSrSS\S\\   4S jjr  SS\	R                  S\\	R                  \	R                  4   S	\\	R                     S
\\   S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrg)Cohere2Attentioni  z=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    [         R                  R                  5         Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        SU l        UR                  U   S:X  a  UR                  OS U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  U R                  -  UR                  UR"                  S9U l        g )NrH   g      Tr3   )bias)nnModulerJ   r{   r|   rM   r8   r<   rH   r=   num_key_value_groupsscalingrE   	is_causalrG   rF   LinearrD   q_projk_projv_projo_proj)rP   r{   r|   s      rT   rJ   Cohere2Attention.__init__  sq   
		"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!97=7I7I)7TXk7kf33quii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r[   r&   position_embeddingsr'   past_key_valuecache_positionrQ   returnc                 d   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  pU R                  b  [        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       d  SOU R                   U R"                  U R                  S.UD6u  nnUR$                  " / UQSP76 R'                  5       nU R)                  U5      nUU4$ )Nr2   r   )sincosr   eagerrd   )dropoutr   rF   )shaperH   r   view	transposer   r   rF   r   updater|   r   r{   _attn_implementationr   trainingrE   r   reshape
contiguousr   )rP   r&   r   r'   r   r   rQ   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     rT   forwardCohere2Attention.forward,  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&*';LVY'_$L%#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ "));;;;FFHkk+.L((r[   )rE   r{   rH   r   r   r|   r   r   r   r   rF   r   r]   )NN)re   rf   rg   rh   ri   r   r   intrJ   torchTensortupler   
LongTensorr   r   r   rp   r5   r[   rT   rz   rz     s    G
} 
# 
: +/59*)||*) #5<<#=>*) !.	*)
 !*) !!1!12*) -.*) 
u||Xell3XeELL>Q5RR	S*) *)r[   rz   c                   L  ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\   S	\
\   S
\
\R                     S\\   S\	\R                   \
\	\R                   \R                   4      4   4S jjrSrU =r$ )Cohere2DecoderLayeriY  r{   r|   c                 L   > [         TU ]  X5        UR                  U   U l        g r]   )rI   rJ   rG   attention_type)rP   r{   r|   rS   s      rT   rJ   Cohere2DecoderLayer.__init__Z  s#    +$00;r[   r&   r   r'   r   rA   r   rQ   r   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   U-   nU$ )N)r&   r   r'   r   rA   r   r5   )input_layernorm	self_attnmlp)rP   r&   r   r'   r   rA   r   rQ   residualhidden_states_attention_hidden_states_mlps               rT   r   Cohere2DecoderLayer.forward^  sq     !,,];%)^^ &
' 3)))&
 &
" !HH]3 :=NNr[   )r   )NNFN)re   rf   rg   rh   r   r   rJ   r   r   r   r   r   rO   r   r   r   FloatTensorr   rp   rq   rr   s   @rT   r   r   Y  s    <} < < 26*.$)59|| #5<<#=> !.	
 ! D> !!1!12 -. 
u  (51B1BEDUDU1U+V"WW	X r[   r   c                        \ rS rSr% \\S'   Srg)Cohere2PreTrainedModeliy  r{   r5   N)re   rf   rg   rh   r   __annotations__rp   r5   r[   rT   r   r   y  s    r[   r   c                     ^  \ rS rSrS\4U 4S jjr       SS\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\   S
\\R                     S\\   S\4S jjrSrU =r$ )Cohere2Modeli}  r{   c                    > [         TU ]  U5        [        UR                  UR                  S9U l        [        US9U l        g )N)r8   eps)r{   )rI   rJ   rx   r8   r@   r*   rt   
rotary_emb)rP   r{   rS   s     rT   rJ   Cohere2Model.__init__~  s6     $&2D2D6K`K`a	0?r[   r$   r'   position_idsr!   r%   rA   r   rQ   r   c           
         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  U R                  (       d
  [        5       nUcD  Ub  UR	                  5       OSn	[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       d*  U R                  UUUUUS.n[        S	0 UD6[        S	0 UD6S.n
UnU R                  X5      nU R                    H  nU" U4UXR"                     UUUS.UD6nM      U R%                  U5      n['        UUS9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsr   r2   )device)r{   input_embedsr'   r   r!   r   )r4   r3   )r   r'   r   rA   r   )last_hidden_stater!   r5   )
ValueErrorr(   r   r   get_seq_lengthr   aranger   r   	unsqueeze
isinstancedictr{   r
   r   r   r)   r   r*   r   )rP   r$   r'   r   r!   r%   rA   r   rQ   past_seen_tokenscausal_mask_mappingmask_kwargsr&   r   decoder_layers                  rT   r   Cohere2Model.forward  sw    -t";<YZZ  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L?-FF++ -"0"0#2 ,K #5"C{"C%F%U%U#
 &"oomJ![[M)$723O3OP.#- M ) 		-0&++
 	
r[   )r*   r   )NNNNNNN)re   rf   rg   rh   r   rJ   r   r   r   r   r   r   rO   r   r   r   r   rp   rq   rr   s   @rT   r   r   }  s    @} @ 151537+/59$(59<
E,,-<
 !.<
 u//0	<

 "%<
   1 12<
 D><
 !!1!12<
 +,<
 
!<
 <
r[   r   c                       \ rS rSrSrg)Cohere2ForCausalLMi  r5   Nru   r5   r[   rT   r   r     rv   r[   r   )r   r   r   r   )5rW   typingr   r   r   torch.nnr   cache_utilsr   r   configuration_utilsr   r	   masking_utilsr
   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   cohere.modeling_coherer   r   r   r   r   r   r   r   gemma2.modeling_gemma2r   
get_loggerre   loggerr   rt   rx   r   rz   r   r   r   r   __all__r5   r[   rT   <module>r      s      %   . J R B 7 9 5 & 0	 	 	 1 
		H	%W-$ W-t	2 		 	E)		 E)P, @2 B
; B
J	* 	 \r[   