o
    
sh|                     @   sx  d dl Z d dlmZ d dlmZ ddlmZ ddlmZm	Z	 e	
eZe r*d dlZdd Z			d1d	ee d
ed dee dedef fddZ			d1d	ee d
ed dee dedef fddZ			d1d	ee d
ed dee dedef fddZ	d2d	ed
ddee dedef fddZ	d2d	ed
ddee dedef fddZ	d2d	ed
ddee dedef fddZeeeeeedZ		d3dedededee d ee f
d!d"Zd2d	ed ee fd#d$Zd2d	ed ee fd%d&Zd2d	ed ee fd'd(Zd2d	ed ee fd)d*Zd2d	ed ee fd+d,Z d2d	ed ee fd-d.Z!eeeee e!dZ"d2d	ed ee fd/d0Z#dS )4    Nwraps)Optional   )PretrainedConfig)is_torch_availableloggingc                    s,   dd dd  t  fdd}|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                 S   s   t |d }t| jdr| jj}n| jj}||kr8t| ds-| j| j||d d\| _}| jd| jdd dS | j	
|| _	| jd| j	dd dS )	zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r    original_max_position_embeddingslong_inv_freqseq_leninv_freqF
persistentN)torchmaxhasattrconfigr	   max_position_embeddingsrope_init_fnr
   register_bufferoriginal_inv_freqto)selfposition_idsdevicer   r	   _ r   ^/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update+   s   

z6dynamic_rope_update.<locals>.longrope_frequency_updatec                 S   s   t |d }|| jkr#| j| j||d\}| _| jd|dd || _|| jk rD| j| jkrF| j	|| _| jd| jdd | j| _dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   r   r   Fr   N)
r   r   max_seq_len_cachedr   r   attention_scalingr   original_max_seq_lenr   r   )r   r   r   r   r   r   r   r   dynamic_frequency_update>   s   
z5dynamic_rope_update.<locals>.dynamic_frequency_updatec                    sB   d| j v r | ||jd n| j dkr| ||jd | ||S )Ndynamic)r   longrope)	rope_typer   )r   xr   r#   r   rope_forwardr   r   wrapperQ   s
   

z$dynamic_rope_update.<locals>.wrapperr   )r)   r*   r   r(   r   dynamic_rope_update   s
   r+   r   r   ztorch.devicer   returnztorch.Tensorc           	      C   sv   | j }t| dr| jnd}t| ddp| j| j }t|| }d}d|tjd|dtj	dj
|tjd|   }||fS )	ax  
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    partial_rotary_factor      ?head_dimNr      dtyper   r2   )
rope_thetar   r-   getattrhidden_sizenum_attention_headsintr   arangeint64r   float)	r   r   r   baser-   r/   dimattention_factorr   r   r   r    _compute_default_rope_parameters\   s   ,r?   c                 C   s*   | j d }t| ||\}}|| }||fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    factor)rope_scalingr?   )r   r   r   r@   r   r>   r   r   r   '_compute_linear_scaling_rope_parametersz   s   
rB   c                 C   s   | j }t| dr| jnd}t| d| j| j }t|| }| j}| jd }d}	|du r.|}nt	|t
jrCt
|t
j||j|jd}nt||}||| | |d  ||d    }d|t
jd	|dt
jd
j|t
jd|   }
|
|	fS )a  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    r-   r.   r/   r@   Nr2   r   r   r0   r   r1   r3   )r4   r   r-   r5   r6   r7   r8   r   rA   
isinstancer   Tensormaximumtensorr2   r   r   r9   r:   r   r;   )r   r   r   r<   r-   r/   r=   r   r@   r>   r   r   r   r   _compute_dynamic_ntk_parameters   s$   

$,rH   c                    s  | j }t| dr| jnd}t| d| j| j }t|| }| jd }| jd}| jd}	| jd}
| jdp<| j	}dd
d}|du r[|	rW|
rWt
|||	|||
 }n||}| jdpbd}| jdpjd	}dd   fdd}dd }|td|dj|tj
d|  }d| }d||  }| jdd}|||||||\}}d	||||d j|tj
d }|d	|  ||  }||fS )a  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r-   r.   r/   r@   r>   mscalemscale_all_dimr	   r   c                 S   s"   | dkrdS d| t |  d S )Nr   r.   g?)mathlog)scalerI   r   r   r   
get_mscale   s   z,_compute_yarn_parameters.<locals>.get_mscaleN	beta_fast    	beta_slowc                 S   s*   |t || d t j   dt |  S )zPInverse dimension formula to find the dimension based on the number of rotationsr0   )rK   rL   pi)num_rotationsr=   r<   r   r   r   r   find_correction_dim   s   *z5_compute_yarn_parameters.<locals>.find_correction_dimc                    sL    | |||} ||||}|rt |}t |}t|dt||d fS )z.Find dimension range bounds based on rotationsr   r   )rK   floorceilr   min)low_rothigh_rotr=   r<   r   truncatelowhighrT   r   r   find_correction_range   s   

z7_compute_yarn_parameters.<locals>.find_correction_rangec                 S   s>   | |kr|d7 }t j|t jd|  ||   }t |dd}|S )NgMbP?r1   r   r   )r   r9   float32clamp)rW   r   r=   linear_func	ramp_funcr   r   r   linear_ramp_factor  s
   z4_compute_yarn_parameters.<locals>.linear_ramp_factorr   r0   r3   rZ   T)r   )r4   r   r-   r5   r6   r7   r8   rA   getr   r;   r   r9   r   )r   r   r   r<   r-   r/   r=   r@   r>   rI   rJ   r	   rN   rO   rQ   r^   rc   	pos_freqsinv_freq_extrapolationinv_freq_interpolationrZ   r[   r\   inv_freq_extrapolation_factorr   r   r]   r   _compute_yarn_parameters   s>   

	"
 
ri   c                 C   s&  | j }t| dr| jnd}t| d| j| j }t|| }| jd }| jd }| jd}	| jd}
t| drB| j	}| j
| j	 }	n| j
}|
d	u r_|	dkrPd}
ntd
t|	t|  }
|ro||krotj|tj|d}n	tj|tj|d}tjd|dtj|d | }d|||   }||
fS )a~  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r-   r.   r/   long_factorshort_factorr@   r>   r	   Nr   rC   r   r0   )r4   r   r-   r5   r6   r7   r8   rA   rd   r	   r   rK   sqrtrL   r   rG   r_   r9   r:   r;   )r   r   r   r<   r-   r/   r=   rj   rk   r@   r>   r	   ext_factorsinv_freq_shaper   r   r   r   _compute_longrope_parameters  s,   


ro   c                 C   s   t | ||\}}| jd }| jd }| jd }| jd }|| }	|| }
dtj | }t||	k|| |}|| | ||  }d| | | ||  }||
k  ||	k  }t|||}||fS )a<  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r@   low_freq_factorhigh_freq_factorr	   r0   r   )r?   rA   rK   rR   r   where)r   r   r   r   r>   r@   rp   rq   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqr   r   r   _compute_llama3_parametersR  s   



r{   )defaultlinearr$   yarnr%   llama3r&   received_keysrequired_keysoptional_keysignore_keysc                 C   s   d|v r|dh8 }| d |dur||8 }|| }|r&td|  d| |dur1|| | }n|| }|rDtd|  d|  dS dS )zYCompare the received keys in `config.rope_scaling` against the expected and optional keystyper&   Nz9Missing required keys in `rope_scaling` for 'rope_type'='z': z5Unrecognized keys in `rope_scaling` for 'rope_type'=')addKeyErrorloggerwarning)r&   r   r   r   r   missing_keysunused_keysr   r   r   _check_received_keys  s   	

r   c                 C   s@   | j }|d|dd }dh}t| }t||||d d S )Nr&   r   r   )rA   rd   setkeysr   )r   r   rA   r&   r   r   r   r   r   !_validate_default_rope_parameters  s
   r   c                 C   sx   | j }|d|dd }ddh}t| }t||||d |d }|d u s0t|tr0|dk r:td|  d S d S )Nr&   r   r@   r   r.   8`rope_scaling`'s factor field must be a float >= 1, got 	rA   rd   r   r   r   rD   r;   r   r   )r   r   rA   r&   r   r   r@   r   r   r   (_validate_linear_scaling_rope_parameters  s   r   c                 C   s   | j }|d|dd }ddh}dh}t| }t|||||d |d }|d u s4t|tr4|dk r>td|  d S d S )Nr&   r   r@   r	   r   r.   r   r   )r   r   rA   r&   r   r   r   r@   r   r   r   )_validate_dynamic_scaling_rope_parameters  s   r   c              	   C   s  | j }|d|dd }ddh}h d}t| }t|||||d |d }|d u s5t|tr5|dk r=td|  |d}|d urWt|trO|d	k rWtd
|  |d}	|	d urmt|	tsmtd|	  |d}
|
d urt|
tstd|
  |	pd|
pdk rtd|	 d|
 d | j d}|d ur| j	| }||krt
d| d| d| d d S d S t
d d S )Nr&   r   r@   >   rI   rZ   rO   rQ   rJ   r>   r	   r   r.   r   r>   r   L`rope_scaling`'s attention_factor field must be a float greater than 0, got rO   z6`rope_scaling`'s beta_fast field must be a float, got rQ   z6`rope_scaling`'s beta_slow field must be a float, got rP   r   zO`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r	   zHThe explicitly set RoPE scaling factor (config.rope_scaling['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config.a~  config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will **assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * factor) -- we recommend updating both fields for optimal downstream model usage.)rA   rd   r   r   r   rD   r;   r   r   r   warning_once)r   r   rA   r&   r   r   r   r@   r>   rO   rQ   r	   implicit_factorr   r   r   _validate_yarn_parameters  sR   	



r   c                 C   s  | j }|d|dd }h d}h d}t| }t|||||d t| dr,| jnd}t| d| j| j	 }t
|| }	|d	}
t|
tsYtd
d |
D rYtd|
  t|
|	d krptd|	d  dt|
  |d}t|tstdd |D rtd|  t||	d krtd|	d  dt|  t| drtd d S |d}|d u rtd nt|tr|dk rtd|  |d}|d urt|tr|dk rtd|  d S d S d S )Nr&   r   >   r&   rj   rk   >   r@   r>   r	   r   r-   r.   r/   rk   c                 s       | ]
}t |ttfV  qd S NrD   r8   r;   .0r'   r   r   r   	<genexpr>      z0_validate_longrope_parameters.<locals>.<genexpr>zC`rope_scaling`'s short_factor field must be a list of numbers, got r0   z5`rope_scaling`'s short_factor field must have length z, got rj   c                 s   r   r   r   r   r   r   r   r     r   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length r	   aY  This model has set a `original_max_position_embeddings` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.r@   z1Missing required keys in `rope_scaling`: 'factor'r   r>   g        r   )rA   rd   r   r   r   r   r-   r5   r6   r7   r8   rD   listallr   r   lenr   r;   )r   r   rA   r&   r   r   r   r-   r/   r=   rk   rj   r@   r>   r   r   r   _validate_longrope_parameters	  sH   




r   c           
      C   s6  | j }|d|dd }h d}t| }t||||d |d }|d u s0t|tr0|dk r8td|  |d }|d	 }|d u sIt|tsQtd
|  |d u sZt|tsbtd|  ||krqtd| d|  |d }	|	d u s~t|	t	std|	  |	| j
krtd|	 d| j
  d S d S )Nr&   r   >   r@   r&   rp   rq   r	   r   r@   r.   r   rp   rq   z<`rope_scaling`'s low_freq_factor field must be a float, got z=`rope_scaling`'s high_freq_factor field must be a float, got zc`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r	   zP`rope_scaling`'s original_max_position_embeddings field must be an integer, got zg`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)rA   rd   r   r   r   rD   r;   r   r   r8   r   )
r   r   rA   r&   r   r   r@   rp   rq   r	   r   r   r   _validate_llama3_parameters;  sL   
r   c                 C   sd   t | dd}|du rdS |d|dd}t|}|dur'|| |d dS td| d dS )	zO
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    rA   Nr&   r   r|   r   zTMissing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='')r5   rd   ROPE_VALIDATION_FUNCTIONSr   r   )r   r   rA   r&   validation_fnr   r   r   rope_config_validationj  s   

r   )NNNr   )NN)$rK   	functoolsr   typingr   configuration_utilsr   utilsr   r   
get_logger__name__r   r   r+   r8   tupler;   r?   rB   rH   ri   ro   r{   ROPE_INIT_FUNCTIONSstrr   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s   
?





.

X

7

-
B2&
