
    h                        d dl Z d dlmZ d dlmZ ddlmZ ddlmZm	Z	  e	j                  e      Z e       rd dlZd Z	 	 	 d"dee   d	ed
   dee   dedef   fdZ	 	 	 d"dee   d	ed
   dee   dedef   fdZ	 	 	 d"dee   d	ed
   dee   dedef   fdZ	 d#ded	d
dee   dedef   fdZ	 d#ded	d
dee   dedef   fdZ	 d#ded	d
dee   dedef   fdZeeeeeedZ	 	 d$dedededee   dee   f
dZd#dedee   fdZd#dedee   fdZd#dedee   fdZd#dedee   fdZd#dedee   fdZ d#dedee   fd Z!eeeee e!dZ"d#dedee   fd!Z#y)%    Nwraps)Optional   )PretrainedConfig)is_torch_availableloggingc                 B     d d t                fd       }|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                    t        j                  |      dz   }t        | j                  d      r| j                  j                  }n| j                  j
                  }||kD  rTt        | d      s)| j                  | j                  ||dz         \  | _        }| j                  d| j                  d       y| j                  j                  |      | _	        | j                  d| j                  d       y)	zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r    original_max_position_embeddingslong_inv_freqseq_leninv_freqF
persistentN)torchmaxhasattrconfigr   max_position_embeddingsrope_init_fnr   register_bufferoriginal_inv_freqto)selfposition_idsdevicer   r   _s         _/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_updatez6dynamic_rope_update.<locals>.longrope_frequency_update+   s    ))L)A-4;; BC/3{{/[/[,/3{{/R/R,5541(,(9(9KK1QTU1U ): )%"A   T-?-?E R &*%;%;%>%>v%FD"  T-C-CPU V    c                    t        j                  |      dz   }|| j                  kD  rA| j                  | j                  ||      \  }| _        | j                  d|d       || _        || j                  k  rj| j                  | j                  kD  rP| j                  j                  |      | _        | j                  d| j                  d       | j                  | _        yyy)a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   r   r   Fr   N)
r   r   max_seq_len_cachedr   r   attention_scalingr   original_max_seq_lenr   r   )r   r   r   r   r   s        r    dynamic_frequency_updatez5dynamic_rope_update.<locals>.dynamic_frequency_update>   s     ))L)A-T,,,/3/@/@f^e/@/f,Hd,  X% H&-D#T...43J3JTMfMf3f &*%;%;%>%>v%FD"  T-C-CPU V&*&?&?D# 4g.r"   c                     d| j                   v r | ||j                         n$| j                   dk(  r | ||j                          | ||      S )Ndynamic)r   longrope)	rope_typer   )r   xr   r'   r!   rope_forwards      r    wrapperz$dynamic_rope_update.<locals>.wrapperQ   sJ    &$T<I^^z)%dLJD!\22r"   r   )r-   r.   r'   r!   s   ` @@r    dynamic_rope_updater/      s/    W&@& <3 3 Nr"   r   r   ztorch.devicer   returnztorch.Tensorc                 J   | j                   }t        | dd      }t        | dd      xs | j                  | j                  z  }t	        ||z        }d}d|t        j                  d|dt
        j                        j                  |t
        j                        |z  z  z  }||fS )	a  
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    partial_rotary_factor      ?head_dimNr      dtyper   r7   )

rope_thetagetattrhidden_sizenum_attention_headsintr   arangeint64r   float)	r   r   r   baser2   r4   dimattention_factorr   s	            r     _compute_default_rope_parametersrD   \   s    > D#F,CSIvz40dF4F4F&JdJd4dH
h..
/C du||AsAU[[ILLTZbgbmbmLnqttuvH%%%r"   c                 R    | j                   d   }t        | ||      \  }}||z  }||fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    factor)rope_scalingrD   )r   r   r   rF   r   rC   s         r    '_compute_linear_scaling_rope_parametersrH      sD    >   *F "B&&RY!ZH
 H%%%r"   c                    | j                   }t        | dd      }t        | d| j                  | j                  z        }t	        ||z        }| j
                  }| j                  d   }d}	||}ngt        |t        j                        rAt        j                  |t        j                  ||j                  |j                              }nt        ||      }|||z  |z  |dz
  z
  ||dz
  z  z  z  }d|t        j                  d|dt        j                   	      j#                  |t        j$                  
      |z  z  z  }
|
|	fS )a	  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    r2   r3   r4   rF   r7   r   r   r5   r   r6   r8   )r9   r:   r;   r<   r=   r   rG   
isinstancer   Tensormaximumtensorr7   r   r   r>   r?   r   r@   )r   r   r   rA   r2   r4   rB   r   rF   rC   r   s              r    _compute_dynamic_ntk_parametersrO      sB   T D#F,CSIvz6+=+=A[A[+[\H
h..
/C$<<  *F )	GU\\	*--LL0gnn]

 g67 FW$'>>6A:NTW[^ab[bTcddDdu||AsAU[[ILLTZbgbmbmLnqttuvH%%%r"   c                    | j                   }t        | dd      }t        | d| j                  | j                  z        }t	        ||z        }| j
                  d   }| j
                  j                  d      }| j
                  j                  d      }	| j
                  j                  d      }
| j
                  j                  d      xs | j                  }dd
}|)|	r|
rt         |||	       |||
      z        }n ||      }| j
                  j                  d      xs d}| j
                  j                  d      xs d	}d fd}d }|t        j                  d|d      j                  |t        j                        |z  z  }d|z  }d||z  z  }| j
                  j                  dd      } |||||||      \  }}d	 ||||dz        j                  |t        j                        z
  }|d	|z
  z  ||z  z   }||fS )ak  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as avaialble.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`, *optional*): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r2   r3   r4   rF   rC   mscalemscale_all_dimr   r   c                 J    | dk  ryd|z  t        j                  |       z  dz   S )Nr   r3   g?)mathlog)scalerQ   s     r    
get_mscalez,_compute_yarn_parameters.<locals>.get_mscale:  s(    A:V|dhhuo-33r"   	beta_fast    	beta_slowc                     |t        j                  || dz  t         j                  z  z        z  dt        j                  |      z  z  S )zPInverse dimension formula to find the dimension based on the number of rotationsr5   )rT   rU   pi)num_rotationsrB   rA   r   s       r    find_correction_dimz5_compute_yarn_parameters.<locals>.find_correction_dimL  sB    dhh6-!:Kdgg:UVWW\]`d`h`him`n\noor"   c                      | |||      } ||||      }|r*t        j                  |      }t        j                  |      }t        |d      t	        ||dz
        fS )z.Find dimension range bounds based on rotationsr   r   )rT   floorceilr   min)	low_rothigh_rotrB   rA   r   truncatelowhighr^   s	           r    find_correction_rangez7_compute_yarn_parameters.<locals>.find_correction_rangeP  s^    !'36MN"8S$8OP**S/C99T?D3{CcAg...r"   c                     | |k(  r|dz  }t        j                  |t         j                        | z
  || z
  z  }t        j                  |dd      }|S )NgMbP?r6   r   r   )r   r>   float32clamp)rb   r   rB   linear_func	ramp_funcs        r    linear_ramp_factorz4_compute_yarn_parameters.<locals>.linear_ramp_factorY  sL    #:5LC||Cu}}=Cc	RKKQ2	r"   r   r5   r8   re   T)r   )r9   r:   r;   r<   r=   rG   getr   r@   r   r>   r   )r   r   r   rA   r2   r4   rB   rF   rC   rQ   rR   r   rW   rX   rZ   rh   rn   	pos_freqsinv_freq_extrapolationinv_freq_interpolationre   rf   rg   inv_freq_extrapolation_factorr   r^   s                            @r    _compute_yarn_parametersrt      s0   p D#F,CSIvz6+=+=A[A[+[\H
h..
/C  *F**../AB  $$X.F((,,-=>N BCevGeGe %4 n$Z%?*VUcBd%de)&1 ##''4:I##''49Ip/ aa03363UX[[\I 9_ FY$67""&&z48H%iCGgiqrIC %&(:3cQh(O(R(RZ`hmhshs(R(t$t!!&C"CD
 #@
@	A  %%%r"   c                 d   | j                   }t        | dd      }t        | d| j                  | j                  z        }t	        ||z        }| j
                  d   }| j
                  d   }| j
                  j                  d      }	| j
                  j                  d      }
t        | dd	      x}r| j                  |z  }	n| j                  }|
I|	dk  rd}
nAt        j                  d
t        j                  |	      t        j                  |      z  z         }
|r,||kD  r't        j                  |t        j                  |      }n&t        j                  |t        j                  |      }t        j                  d|dt        j                  |      j!                         |z  }d|||z  z  z  }||
fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r2   r3   r4   long_factorshort_factorrF   rC   r   Nr   rJ   r   r5   )r9   r:   r;   r<   r=   rG   ro   r   rT   sqrtrU   r   rN   rj   r>   r?   r@   )r   r   r   rA   r2   r4   rB   rv   rw   rF   rC   r   ext_factorsinv_freq_shaper   s                  r    _compute_longrope_parametersr{   s  s   ^ D#F,CSIvz6+=+=A[A[+[\H
h..
/C%%m4K&&~6L  $$X.F**../AB
 ,36;]_c+dd'd//2RR+1+I+I( S="#yyTXXf-=Ii@j-j)jk 7==ll;emmFSll<u}}VT\\!S!5;;vNTTVY\\NkD.$889H%%%r"   c                    t        | ||      \  }}| j                  d   }| j                  d   }| j                  d   }| j                  d   }||z  }	||z  }
dt        j                  z  |z  }t	        j
                  ||	kD  ||z  |      }||z  |z
  ||z
  z  }d|z
  |z  |z  ||z  z   }||
k   ||	kD   z  }t	        j
                  |||      }||fS )ap
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    rF   low_freq_factorhigh_freq_factorr   r5   r   )rD   rG   rT   r\   r   where)r   r   r   r   rC   rF   r}   r~   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                   r    _compute_llama3_parametersr     s   T "B&&RY!ZH  *F))*;<O**+=>))*LMO&8'*::$''kH$G [[+;!;X=NPXYN$w.@EUXgEghM]*n<vEXfHff!223BR8R6SSN[[1BNSN+++r"   )defaultlinearr)   yarnr*   llama3r+   received_keysrequired_keysoptional_keysignore_keysc                     d|v r|dhz  }|j                  d       |||z  }||z
  }|rt        d|  d|       |	||z
  |z
  }n||z
  }|rt        j                  d|  d|        yy)zYCompare the received keys in `config.rope_scaling` against the expected and optional keystyper+   Nz9Missing required keys in `rope_scaling` for 'rope_type'='z': z5Unrecognized keys in `rope_scaling` for 'rope_type'=')addKeyErrorloggerwarning)r+   r   r   r   r   missing_keysunused_keyss          r    _check_received_keysr     s     &!+& $ =0LRS\R]]`am`nopp #m3mC#m3NykY\]h\ijk r"   c                     | j                   }|j                  d|j                  dd             }dh}t        |j                               }t	        ||||       y )Nr+   r   r   )rG   ro   setkeysr   )r   r   rG   r+   r   r   s         r    !_validate_default_rope_parametersr   0  sT    &&L  l.>.>vt.LMI MM))+,MM=kZr"   c                 "   | j                   }|j                  d|j                  dd             }ddh}t        |j                               }t	        ||||       |d   }|t        |t              r|dk  rt        j                  d|        y y )Nr+   r   rF   r   r3   8`rope_scaling`'s factor field must be a float >= 1, got 	rG   ro   r   r   r   rK   r@   r   r   )r   r   rG   r+   r   r   rF   s          r    (_validate_linear_scaling_rope_parametersr   8  s    &&L  l.>.>vt.LMI (+M))+,MM=kZ(#F~Z6&3,QRXQYZ[ ;Gr"   c                 *   | j                   }|j                  d|j                  dd             }ddh}dh}t        |j                               }t	        |||||       |d   }|t        |t              r|dk  rt        j                  d|        y y )Nr+   r   rF   r   r   r3   r   r   )r   r   rG   r+   r   r   r   rF   s           r    )_validate_dynamic_scaling_rope_parametersr   D  s    &&L  l.>.>vt.LMI (+M78M))+,MM=-]hi(#F~Z6&3,QRXQYZ[ ;Gr"   c           	         | j                   }|j                  d|j                  dd             }ddh}h d}t        |j                               }t	        |||||       |d   }|t        |t              r|dk  rt        j                  d|        |j                  d      }|-t        |t              r|d	k  rt        j                  d
|        |j                  d      }	|	(t        |	t              st        j                  d|	        |j                  d      }
|
(t        |
t              st        j                  d|
        |	xs d|
xs dk  rt        j                  d|	 d|
 d       | j                   j                  d      }|5| j                  |z  }||k7  r t        j                  d| d| d| d       y y t        j                  d       y )Nr+   r   rF   >   rQ   re   rX   rZ   rR   rC   r   r   r3   r   rC   r   L`rope_scaling`'s attention_factor field must be a float greater than 0, got rX   z6`rope_scaling`'s beta_fast field must be a float, got rZ   z6`rope_scaling`'s beta_slow field must be a float, got rY   r   zO`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zHThe explicitly set RoPE scaling factor (config.rope_scaling['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config.a~  config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will **assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * factor) -- we recommend updating both fields for optimal downstream model usage.)rG   ro   r   r   r   rK   r@   r   r   r   warning_once)r   r   rG   r+   r   r   r   rF   rC   rX   rZ   r   implicit_factors                r    _validate_yarn_parametersr   R  s   &&L  l.>.>vt.LMI (+MM ))+,MM=-]hi(#F~Z6&3,QRXQYZ[#''(:;#Z8H%-PTdghThZ[kZlm	
   -IZ	5%AOPY{[\  -IZ	5%AOPY{[\RIN+]^g]h i66?[@XZ	
 (.':':'>'>?a'b$'3 88;[[f$Z[aZb cn ###A& Ju	u % 	_	
r"   c                    | j                   }|j                  d|j                  dd             }h d}h d}t        |j                               }t	        |||||       t        | dd      }t        | d| j                  | j                  z        }t        ||z        }	|j                  d	      }
t        |
t              s*t        d
 |
D              rt        j                  d|
        t        |
      |	dz  k7  r't        j                  d|	dz   dt        |
              |j                  d      }t        |t              s*t        d |D              rt        j                  d|        t        |      |	dz  k7  r't        j                  d|	dz   dt        |              t        | d      rt        j!                  d       y |j                  d      }|t        j                  d       n-t        |t"              r|dk  rt        j                  d|        |j                  d      }|/t        |t"              r|dk  rt        j                  d|        y y y )Nr+   r   >   r+   rv   rw   >   rF   rC   r   r   r2   r3   r4   rw   c              3   H   K   | ]  }t        |t        t        f        y wNrK   r=   r@   .0r,   s     r    	<genexpr>z0_validate_longrope_parameters.<locals>.<genexpr>  s     1dWcRS*Qe2MWc    "zC`rope_scaling`'s short_factor field must be a list of numbers, got r5   z5`rope_scaling`'s short_factor field must have length z, got rv   c              3   H   K   | ]  }t        |t        t        f        y wr   r   r   s     r    r   z0_validate_longrope_parameters.<locals>.<genexpr>  s     0bVaQRAU|1LVar   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length r   aY  This model has set a `original_max_position_embeddings` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.rF   z1Missing required keys in `rope_scaling`: 'factor'r   rC   g        r   )rG   ro   r   r   r   r:   r;   r<   r=   rK   listallr   r   lenr   r   r@   )r   r   rG   r+   r   r   r   r2   r4   rB   rw   rv   rF   rC   s                 r    _validate_longrope_parametersr     s@   &&L  l.>.>vt.LMI@MVM))+,MM=-]hi#F,CSIvz6+=+=A[A[+[\H
h..
/C##N3LlD)c1dWc1d.d\]i\jkl
<C1H$NsVWxjX^_bco_p^qrs""=1Kk4(S0bVa0b-b[\g[hij
;3!8#McUVhZW]^abm^n]opq
 v9:A	
 !!(+>NNNOFE*fslNNUV\U]^_'++,>?'.6:JS:Pbcsbtu ;Q (r"   c                    | j                   }|j                  d|j                  dd             }h d}t        |j                               }t	        ||||       |d   }|t        |t              r|dk  rt        j                  d|        |d   }|d	   }|t        |t              st        j                  d
|        |t        |t              st        j                  d|        ||k  rt        j                  d| d|        |d   }	|	t        |	t              st        j                  d|	        |	| j                  k\  r&t        j                  d|	 d| j                          y y )Nr+   r   >   rF   r+   r}   r~   r   r   rF   r3   r   r}   r~   z<`rope_scaling`'s low_freq_factor field must be a float, got z=`rope_scaling`'s high_freq_factor field must be a float, got zc`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zP`rope_scaling`'s original_max_position_embeddings field must be an integer, got zg`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)rG   ro   r   r   r   rK   r@   r   r   r=   r   )
r   r   rG   r+   r   r   rF   r}   r~   r   s
             r    _validate_llama3_parametersr     s   &&L  l.>.>vt.LMIvM))+,MM=kZ(#F~Z6&3,QRXQYZ["#45O#$67j%&HUVeUfghz2BE'JVWgVhij?*q  5o5FH	

 (44V'W$'/zBbdg7h^/02	
 (6+I+IIu/00MfNlNlMmo	
 Jr"   c                     t        | dd      }|y|j                  d|j                  dd            }t        j                  |      }| || |       yt        j	                  d| d       y)	zO
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    rG   Nr+   r   r   r   zTMissing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='')r:   ro   ROPE_VALIDATION_FUNCTIONSr   r   )r   r   rG   r+   validation_fns        r    rope_config_validationr     sw     6>48L   l.>.>vy.QRI-11)<M f+6bclbmmno	
r"   )NNNr   )NN)$rT   	functoolsr   typingr   configuration_utilsr   utilsr   r	   
get_logger__name__r   r   r/   r=   tupler@   rD   rH   rO   rt   r{   r   ROPE_INIT_FUNCTIONSstrr   r   r   r   r   r   r   r   r   r    r"   r    <module>r      s#      1 . 
		H	% ;~ *.'+!(&%&(&^$(& c](& >5 !	(&X *.'+!(&%&(&^$(& c](& >5 !	(&X *.'+!A&%&A&^$A& c]A& >5 !	A&J PTz&z&&4z&?G}z&
>5 !z&| PTO&O&&4O&?G}O&
>5 !O&f PT>,>,&4>,?G}>,
>5 !>,J 05.$,(  $(!%lll l C=	l
 #l:[.> [XVY] [	\5E 	\T\]`Ta 	\\6F \U]^aUb \?
&6 ?
Xc] ?
D/*: /RU /d!
(8 !
xPS} !
L 168%-) 
#3 
(3- 
r"   