import warnings
from typing import Callable, Optional

import torch
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import PretrainedConfig, layer_type_validation
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ..cohere.modeling_cohere import (
    CohereAttention,
    CohereDecoderLayer,
    CohereForCausalLM,
    CohereLayerNorm,
    CoherePreTrainedModel,
    CohereRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..gemma2.modeling_gemma2 import Gemma2Model


logger = logging.get_logger(__name__)


class Cohere2Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`Cohere2Model`]. It is used to instantiate a Cohere2
model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


Args:
    vocab_size (`int`, *optional*, defaults to 256000):
        Vocabulary size of the Cohere2 model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Cohere2Model`]
    hidden_size (`int`, *optional*, defaults to 8192):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 22528):
        Dimension of the MLP representations.
    logit_scale (`float`, *optional*, defaults to 0.0625):
        The scaling factor for the output logits.
    num_hidden_layers (`int`, *optional*, defaults to 40):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 64):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 5):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 255001):
        End of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and you expect the model to work on a longer `max_position_embeddings`, we recommend you update this value
        accordingly (see the example after the argument list).
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    sliding_window (`int`, *optional*, defaults to 4096):
        Size of the sliding window attention context.
    layer_types (`list`, *optional*):
        Attention pattern for each layer.
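
As a concrete illustration, `rope_scaling` is passed as a plain dict whose keys are described above. The
values below are hypothetical and only show the expected shape:

```python
>>> # Illustrative values only: linear RoPE scaling with a 2x factor
>>> from transformers import Cohere2Config
>>> config = Cohere2Config(rope_scaling={"rope_type": "linear", "factor": 2.0})  # doctest: +SKIP
```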

```python
>>> from transformers import Cohere2Model, Cohere2Config

>>> # Initializing a Cohere2 model configuration
>>> configuration = Cohere2Config()

>>> # Initializing a model from the Cohere2 configuration
>>> model = Cohere2Model(configuration) # doctest: +SKIP

>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
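
When `layer_types` is not passed, the default pattern follows `sliding_window_pattern` (4): every 4th
layer uses full attention and the rest use sliding-window attention. A minimal sketch:

```python
>>> config = Cohere2Config(num_hidden_layers=8)
>>> config.layer_types  # doctest: +SKIP
['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention', 'sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
```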
cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                 ~  > Xl         Xl        X l        X@l        X0l        XPl        X`l        Uc  UnXpl        Xl        Xl	        Xl
        Xl        UU l        UU l        UU l        UU l        UU l        UU l        X&-  U l        ['        U 5        [(        TU ]T  " SUUUUS.UD6  UR-                  SS5      U l        U R"                  c^  [1        U SS5      U l        [3        U R
                  5       Vs/ sH'  n[5        US-   U R.                  -  5      (       a  SOSPM)     snU l        [7        U R"                  5        g s  snf )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_window_pattern      sliding_attentionfull_attention )
vocab_sizemax_position_embeddingshidden_sizelogit_scaleintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actinitializer_rangelayer_norm_eps	use_cache
rope_thetarope_scalingattention_biasattention_dropoutsliding_windowlayer_typeshead_dimr   super__init__get_sliding_window_patterngetattrrangeboolr	   )selfr6   r8   r:   r9   r;   r<   r=   r>   r7   r?   r@   rA   r,   r-   r.   r/   rB   rC   rD   rE   rF   rG   kwargsi	__class__s                            c/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/cohere2/modular_cohere2.pyrJ   Cohere2Config.__init__   sZ   4 %'>$&&!2!2#6  &"5#6 $!2,"$(,!2,&#: 	t$ 	
%%% 3		

 	
 (.zz2JA'N$#+249QST+UD( t556 6A (,QUd6R6R,R'S'S#Yii6 D 	d../	 s   /-D:c                 P    [         R                  " S[        5        U R                  $ )NzTThe `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.)warningswarnFutureWarningrL   )rP   s    rT   r0   $Cohere2Config.sliding_window_pattern   s"    b	
 +++    c                     Xl         g N)rL   )rP   values     rT   r0   rZ     s    ',$r[   )rL   rD   rE   rH   r>   r8   r?   r:   r@   rG   r9   r7   r<   r;   r=   rC   rB   rF   rA   r6   )i      i X  g      ?(   @   Nsilur_   g{Gz?gh㈵>Tr      i Tg     @NF        i   N)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrJ   propertyr0   setter__static_attributes____classcell__rS   s   @rT   r   r   /   s    n` J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56   $ /I0V , , ""- #-r[   r   c                       \ rS rSrSrg)Cohere2RotaryEmbeddingi	  r5   Nre   rf   rg   rh   rp   r5   r[   rT   rt   rt   	      r[   rt   c                       \ rS rSrSrg)Cohere2LayerNormi  r5   Nru   r5   r[   rT   rx   rx     rv   r[   rx   c                   B   \ rS rSrSrSS\S\\   4S jjr  SS\	R                  S\\	R                  \	R                  4   S	\\	R                     S
\\   S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrg)Cohere2Attentioni  z=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    [         R                  R                  5         Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        SU l        UR                  U   S:X  a  UR                  OS U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  U R                  -  UR                  UR"                  S9U l        g )NrH   g      Tr3   )bias)nnModulerJ   r{   r|   rM   r8   r<   rH   r=   num_key_value_groupsscalingrE   	is_causalrG   rF   LinearrD   q_projk_projv_projo_proj)rP   r{   r|   s      rT   rJ   Cohere2Attention.__init__  sq   
		"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!97=7I7I)7TXk7kf33quii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r[   r&   position_embeddingsr'   past_key_valuecache_positionrQ   returnc                 d   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  pU R                  b  [        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       d  SOU R                   U R"                  U R                  S.UD6u  nnUR$                  " / UQSP76 R'                  5       nU R)                  U5      nUU4$ )Nr2   r   )sincosr   eagerrd   )dropoutr   rF   )shaperH   r   view	transposer   r   rF   r   updater|   r   r{   _attn_implementationr   trainingrE   r   reshape
contiguousr   )rP   r&   r   r'   r   r   rQ   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     rT   forwardCohere2Attention.forward,  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&*';LVY'_$L%#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ "));;;;FFHkk+.L((r[   )rE   r{   rH   r   r   r|   r   r   r   r   rF   r   r]   )NN)re   rf   rg   rh   ri   r   r   intrJ   torchTensortupler   
LongTensorr   r   r   rp   r5   r[   rT   rz   rz     s    G
} 
# 
: +/59*)||*) #5<<#=>*) !.	*)
 !*) !!1!12*) -.*) 
u||Xell3XeELL>Q5RR	S*) *)r[   rz   c                   L  ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\   S	\
\   S
\
\R                     S\\   S\	\R                   \
\	\R                   \R                   4      4   4S jjrSrU =r$ )Cohere2DecoderLayeriY  r{   r|   c                 L   > [         TU ]  X5        UR                  U   U l        g r]   )rI   rJ   rG   attention_type)rP   r{   r|   rS   s      rT   rJ   Cohere2DecoderLayer.__init__Z  s#    +$00;r[   r&   r   r'   r   rA   r   rQ   r   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   U-   nU$ )N)r&   r   r'   r   rA   r   r5   )input_layernorm	self_attnmlp)rP   r&   r   r'   r   rA   r   rQ   residualhidden_states_attention_hidden_states_mlps               rT   r   Cohere2DecoderLayer.forward^  sq     !,,];%)^^ &
' 3)))&
 &
" !HH]3 :=NNr[   )r   )NNFN)re   rf   rg   rh   r   r   rJ   r   r   r   r   r   rO   r   r   r   FloatTensorr   rp   rq   rr   s   @rT   r   r   Y  s    <} < < 26*.$)59|| #5<<#=> !.	
 ! D> !!1!12 -. 
u  (51B1BEDUDU1U+V"WW	X r[   r   c                        \ rS rSr% \\S'   Srg)Cohere2PreTrainedModeliy  r{   r5   N)re   rf   rg   rh   r   __annotations__rp   r5   r[   rT   r   r   y  s    r[   r   c                     ^  \ rS rSrS\4U 4S jjr       SS\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\   S
\\R                     S\\   S\4S jjrSrU =r$ )Cohere2Modeli}  r{   c                    > [         TU ]  U5        [        UR                  UR                  S9U l        [        US9U l        g )N)r8   eps)r{   )rI   rJ   rx   r8   r@   r*   rt   
rotary_emb)rP   r{   rS   s     rT   rJ   Cohere2Model.__init__~  s6     $&2D2D6K`K`a	0?r[   r$   r'   position_idsr!   r%   rA   r   rQ   r   c           
         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  U R                  (       d
  [        5       nUcD  Ub  UR	                  5       OSn	[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       d*  U R                  UUUUUS.n[        S	0 UD6[        S	0 UD6S.n
UnU R                  X5      nU R                    H  nU" U4UXR"                     UUUS.UD6nM      U R%                  U5      n['        UUS9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsr   r2   )device)r{   input_embedsr'   r   r!   r   )r4   r3   )r   r'   r   rA   r   )last_hidden_stater!   r5   )
ValueErrorr(   r   r   get_seq_lengthr   aranger   r   	unsqueeze
isinstancedictr{   r
   r   r   r)   r   r*   r   )rP   r$   r'   r   r!   r%   rA   r   rQ   past_seen_tokenscausal_mask_mappingmask_kwargsr&   r   decoder_layers                  rT   r   Cohere2Model.forward  sw    -t";<YZZ  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L?-FF++ -"0"0#2 ,K #5"C{"C%F%U%U#
 &"oomJ![[M)$723O3OP.#- M ) 		-0&++
 	
r[   )r*   r   )NNNNNNN)re   rf   rg   rh   r   rJ   r   r   r   r   r   r   rO   r   r   r   r   rp   rq   rr   s   @rT   r   r   }  s    @} @ 151537+/59$(59<
E,,-<
 !.<
 u//0	<

 "%<
   1 12<
 D><
 !!1!12<
 +,<
 
!<
 <
r[   r   c                       \ rS rSrSrg)Cohere2ForCausalLMi  r5   Nru   r5   r[   rT   r   r     rv   r[   r   )r   r   r   r   )5rW   typingr   r   r   torch.nnr   cache_utilsr   r   configuration_utilsr   r	   masking_utilsr
   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   cohere.modeling_coherer   r   r   r   r   r   r   r   gemma2.modeling_gemma2r   
get_loggerre   loggerr   rt   rx   r   rz   r   r   r   r   __all__r5   r[   rT   <module>r      s      %   . J R B 7 9 5 & 0	 	 	 1 
		H	%W-$ W-t	2 		 	E)		 E)P, @2 B
; B
J	* 	 \r[   