import math
from collections import OrderedDict

import torch
from torch import Tensor, nn

from .utils import logging
from .utils.import_utils import is_torchdynamo_compiling


logger = logging.get_logger(__name__)


class PytorchGELUTanh(nn.Module):
    """
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    """

    def forward(self, input: Tensor) -> Tensor:
        return nn.functional.gelu(input, approximate="tanh")


class NewGELUActivation(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))


class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). This is now written in C in nn.functional.
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)


class FastGELUActivation(nn.Module):
    """
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))


class QuickGELUActivation(nn.Module):
    """
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return input * torch.sigmoid(1.702 * input)


class ClippedGELUActivation(nn.Module):
    """
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes,
    as it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    """

    def __init__(self, min: float, max: float):
        if min > max:
            raise ValueError(f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min = min
        self.max = max

    def forward(self, x: Tensor) -> Tensor:
        # `gelu` is the module-level GELUActivation instance defined at the bottom of this file.
        return torch.clip(gelu(x), self.min, self.max)


class AccurateGELUActivation(nn.Module):
    """
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    """

    def __init__(self):
        super().__init__()
        self.precomputed_constant = math.sqrt(2 / math.pi)

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))


class MishActivation(nn.Module):
    """
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra, https://huggingface.co/papers/1908.08681).
    Also visit the official repository for the paper: https://github.com/digantamisra98/Mish
    """

    def __init__(self):
        super().__init__()
        self.act = nn.functional.mish

    def _mish_python(self, input: Tensor) -> Tensor:
        return input * torch.tanh(nn.functional.softplus(input))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)


class LinearActivation(nn.Module):
    """
    Applies the linear activation function, i.e. forwarding input directly to output.
    """

    def forward(self, input: Tensor) -> Tensor:
        return input


class LaplaceActivation(nn.Module):
    """
    Applies elementwise activation based on the Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability.
    """

    def forward(self, input, mu=0.707107, sigma=0.282095):
        input = (input - mu).div(sigma * math.sqrt(2.0))
        return 0.5 * (1.0 + torch.erf(input))


class ReLUSquaredActivation(nn.Module):
    """
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
    """

    def forward(self, input):
        relu_applied = nn.functional.relu(input)
        squared = torch.square(relu_applied)
        return squared


class ClassInstantier(OrderedDict):
    # Instantiates the stored class on lookup; a (class, kwargs) tuple is expanded into the constructor call.
    def __getitem__(self, key):
        content = super().__getitem__(key)
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)


class XIELUActivation(nn.Module):
    """
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import the xIELU CUDA kernel.
    Otherwise, we emit a single warning and use the xIELU Python implementation.
    """

    def __init__(
        self,
        alpha_p_init=0.8,
        alpha_n_init=0.8,
        beta=0.5,
        eps=-1e-6,
        dtype=torch.bfloat16,
        with_vector_loads=False,
    ):
        super().__init__()
        self.alpha_p = nn.Parameter(torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(0))
        self.alpha_n = nn.Parameter(
            torch.log(torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1).unsqueeze(0)
        )
        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
        self.with_vector_loads = with_vector_loads
        # Scalar copies for the CUDA kernel, computed once so forward() stays free of .item() calls.
        self._beta_scalar = float(self.beta.detach().cpu().item())
        self._eps_scalar = float(self.eps.detach().cpu().item())

        self._xielu_cuda_obj = None
        try:
            import xielu.ops  # noqa: F401

            self._xielu_cuda_obj = torch.classes.xielu.XIELU()
            msg = "Using experimental xIELU CUDA."
            try:
                from torch._dynamo import allow_in_graph

                self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
                msg += " Enabled torch._dynamo for xIELU CUDA."
            except Exception as err:
                msg += f" Could not enable torch._dynamo for xIELU ({err}) - this may result in slower performance."
                self._xielu_cuda_fn = self._xielu_cuda
            logger.warning_once(msg)
        except Exception as err:
            logger.warning_once(
                "CUDA-fused xIELU not available (%s) – falling back to a Python version.\n"
                "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
                str(err),
            )

    def _xielu_python(self, x: Tensor) -> Tensor:
        alpha_p = nn.functional.softplus(self.alpha_p)
        alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
        return torch.where(
            x > 0,
            alpha_p * x * x + self.beta * x,
            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
        )

    def _xielu_cuda(self, x: Tensor) -> Tensor:
        """Firewall function to prevent torch.compile from seeing .item() calls"""
        original_shape = x.shape
        # The CUDA kernel expects a 3D tensor; reshape and warn if the input has a different rank.
        while x.dim() < 3:
            x = x.unsqueeze(0)
        if x.dim() > 3:
            x = x.view(-1, 1, x.size(-1))
        if original_shape != x.shape:
            logger.warning_once(
                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
                original_shape,
                x.shape,
            )
        result = self._xielu_cuda_obj.forward(
            x,
            self.alpha_p,
            self.alpha_n,
            self._beta_scalar,
            self._eps_scalar,
            self.with_vector_loads,
        )
        return result.view(original_shape)

    def forward(self, input: Tensor) -> Tensor:
        if self._xielu_cuda_obj is not None and input.is_cuda:
            if not is_torchdynamo_compiling():
                return self._xielu_cuda_fn(input)
            logger.warning_once("torch._dynamo is compiling, using Python version of xIELU.")
        return self._xielu_python(input)


ACT2CLS = {
    "gelu": GELUActivation,
    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": FastGELUActivation,
    "gelu_new": NewGELUActivation,
    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
    "gelu_pytorch_tanh": PytorchGELUTanh,
    "gelu_accurate": AccurateGELUActivation,
    "laplace": LaplaceActivation,
    "leaky_relu": nn.LeakyReLU,
    "linear": LinearActivation,
    "mish": MishActivation,
    "quick_gelu": QuickGELUActivation,
    "relu": nn.ReLU,
    "relu2": ReLUSquaredActivation,
    "relu6": nn.ReLU6,
    "sigmoid": nn.Sigmoid,
    "silu": nn.SiLU,
    "swish": nn.SiLU,
    "tanh": nn.Tanh,
    "prelu": nn.PReLU,
    "xielu": XIELUActivation,
}
ACT2FN = ClassInstantier(ACT2CLS)


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")


# Module-level convenience instances, kept for backwards compatibility.
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
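

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the library API): model code
    # typically resolves a config string such as `hidden_act = "gelu_new"` into an
    # nn.Module by indexing ACT2FN or by calling get_activation(). Because of the
    # relative imports above, run this as `python -m transformers.activations`.
    hidden_states = torch.randn(2, 4, 8)

    act = ACT2FN["gelu_new"]  # ClassInstantier returns a fresh NewGELUActivation()
    print(type(act).__name__, act(hidden_states).shape)

    # Entries stored as (class, kwargs) tuples are instantiated with those kwargs:
    clipped = get_activation("gelu_10")  # ClippedGELUActivation(min=-10, max=10)
    print(clipped(100 * hidden_states).max())  # large positive inputs are clipped to 10

    # Unknown names raise a KeyError listing the supported keys:
    try:
        get_activation("not_an_activation")
    except KeyError as exc:
        print(exc)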

			^	
