
    h53                        d dl Z d dlZd dlmZ d dlZd dlmZmZ ddlmZ ddl	m
Z
 ddlmZ  e
j                  e      Z ed       G d	 d
ej                                Z ed       G d dej                                Z ed       G d dej                                Z ed       G d dej                                Z ed       G d dej                                Z ed       G d dej                                Z G d dej                         Z G d dej                         Z G d dej                         Z G d  d!ej                         Z G d" d#ej                         Z G d$ d%ej                         Z G d& d'e      Z G d( d)ej                         Zi d*ed+ed,d-d.fd/ed0ed1ed2d3ifd4ed5ed6d3ifd7ed8ed9ej>                  d:ed;ed<ed=ej@                  d>ed?ejB                  d@ejD                  eejF                  ejH                  ejJ                  edAZ& ee&      Z'dB Z( e(d1      Z) e(d0      Z* e(d*      Z+ e(d/      Z, e(d<      Z- e(dC      Z. e(d;      Z/ e(d:      Z0y)D    N)OrderedDict)Tensornn   )use_kernel_forward_from_hub)logging)is_torchdynamo_compilingGeluTanhc                   J     e Zd ZdZddef fdZdedefdZdedefdZ xZ	S )	GELUTanha&  
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    use_gelu_tanh_pythonc                     t         |           |r| j                  | _        y t	        j
                  t        j                  j                  d      | _        y )Ntanh)approximate)	super__init___gelu_tanh_pythonact	functoolspartialr   
functionalgelu)selfr   	__class__s     W/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/activations.pyr   zGELUTanh.__init__(   s<    --DH ((););PDH    inputreturnc                     |dz  dt        j                  t        j                  dt        j                  z        |dt        j
                  |d      z  z   z        z   z  S N      ?      ?       @Hm?g      @torchr   mathsqrtpipowr   r   s     r   r   zGELUTanh._gelu_tanh_python/   sP    s{cEJJtyytww/G5S[^c^g^ghmor^sSsKs/t$uuvvr   c                 $    | j                  |      S Nr   r+   s     r   forwardzGELUTanh.forward2       xxr   F)
__name__
__module____qualname____doc__boolr   r   r   r/   __classcell__r   s   @r   r   r      s?    QT Qwv w& wV  r   r   NewGELUc                        e Zd ZdZdedefdZy)NewGELUActivationz
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    r   r   c                     d|z  dt        j                  t        j                  dt        j                  z        |dt        j
                  |d      z  z   z        z   z  S r    r%   r+   s     r   r/   zNewGELUActivation.forward=   sP    U{cEJJtyytww/G5S[^c^g^ghmor^sSsKs/t$uuvvr   Nr2   r3   r4   r5   r   r/    r   r   r;   r;   6   s    
wV w wr   r;   GeLUc                   J     e Zd ZdZddef fdZdedefdZdedefdZ xZ	S )	GELUActivationa  
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    use_gelu_pythonc                     t         |           |r| j                  | _        y t        j
                  j                  | _        y r-   )r   r   _gelu_pythonr   r   r   r   )r   rB   r   s     r   r   zGELUActivation.__init__J   s/    ((DH}}))DHr   r   r   c                 j    |dz  dt        j                  |t        j                  d      z        z   z  S )Nr!   r"   r#   )r&   erfr'   r(   r+   s     r   rD   zGELUActivation._gelu_pythonQ   s,    s{cEIIediin.D$EEFFr   c                 $    | j                  |      S r-   r.   r+   s     r   r/   zGELUActivation.forwardT   r0   r   r1   )
r2   r3   r4   r5   r6   r   r   rD   r/   r7   r8   s   @r   rA   rA   A   s=    * *G& GV GV  r   rA   SiLUc                        e Zd ZdZdedefdZy)SiLUActivationa  
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    r   r   c                 @    t         j                  j                  |      S r-   )r   r   silur+   s     r   r/   zSiLUActivation.forwardb   s    }}!!%((r   Nr=   r>   r   r   rJ   rJ   X   s    )V ) )r   rJ   FastGELUc                        e Zd ZdZdedefdZy)FastGELUActivationz}
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 \    d|z  dt        j                  |dz  dd|z  |z  z   z        z   z  S )Nr!   r"   g3E?r$   )r&   r   r+   s     r   r/   zFastGELUActivation.forwardl   s:    U{cEJJu|/CsXX]M]`eMeGe/f$gghhr   Nr=   r>   r   r   rO   rO   f   s    iV i ir   rO   	QuickGELUc                        e Zd ZdZdedefdZy)QuickGELUActivationzr
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 8    |t        j                  d|z        z  S )NgZd;?)r&   sigmoidr+   s     r   r/   zQuickGELUActivation.forwardv   s    u}}UU]333r   Nr=   r>   r   r   rS   rS   p   s    4V 4 4r   rS   c                   <     e Zd ZdZdedef fdZdedefdZ xZS )ClippedGELUActivationa  
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    minmaxc                 l    ||kD  rt        d| d| d      t        | 	          || _        || _        y )Nzmin should be < max (got min: z, max: ))
ValueErrorr   r   rX   rY   )r   rX   rY   r   s      r   r   zClippedGELUActivation.__init__   s>    9=cU'#aPQQr   xr   c                 j    t        j                  t        |      | j                  | j                        S r-   )r&   clipr   rX   rY   )r   r]   s     r   r/   zClippedGELUActivation.forward   s!    zz$q'488TXX66r   )	r2   r3   r4   r5   floatr   r   r/   r7   r8   s   @r   rW   rW   z   s.    
E  7 7F 7r   rW   c                   2     e Zd ZdZ fdZdedefdZ xZS )AccurateGELUActivationz
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    c                 x    t         |           t        j                  dt        j                  z        | _        y )N   )r   r   r'   r(   r)   precomputed_constantr   r   s    r   r   zAccurateGELUActivation.__init__   s'    $(IIa$''k$:!r   r   r   c                     d|z  dt        j                  | j                  |dt        j                  |d      z  z   z        z   z  S )Nr!   r   r$      )r&   r   re   r*   r+   s     r   r/   zAccurateGELUActivation.forward   sE    U{a%**T-F-F%RZ]b]f]fglno]pRpJp-q"rrssr   )r2   r3   r4   r5   r   r   r/   r7   r8   s   @r   rb   rb      s#    ;tV t tr   rb   c                   B     e Zd ZdZ fdZdedefdZdedefdZ xZS )MishActivationz
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    c                 `    t         |           t        j                  j                  | _        y r-   )r   r   r   r   mishr   rf   s    r   r   zMishActivation.__init__   s    ==%%r   r   r   c                 l    |t        j                  t        j                  j	                  |            z  S r-   )r&   r   r   r   softplusr+   s     r   _mish_pythonzMishActivation._mish_python   s%    uzz"--"8"8"?@@@r   c                 $    | j                  |      S r-   r.   r+   s     r   r/   zMishActivation.forward   r0   r   )	r2   r3   r4   r5   r   r   ro   r/   r7   r8   s   @r   rj   rj      s6    
&A& AV AV  r   rj   c                        e Zd ZdZdedefdZy)LinearActivationz[
    Applies the linear activation function, i.e. forwarding input directly to output.
    r   r   c                     |S r-   r>   r+   s     r   r/   zLinearActivation.forward   s    r   Nr=   r>   r   r   rr   rr      s    V  r   rr   c                       e Zd ZdZddZy)LaplaceActivationz
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    c                     ||z
  j                  |t        j                  d      z        }ddt        j                  |      z   z  S )Nr#   r!   r"   )divr'   r(   r&   rF   )r   r   musigmas       r   r/   zLaplaceActivation.forward   s<      3!78cEIIe,,--r   N)g۞?g ^/?r2   r3   r4   r5   r/   r>   r   r   ru   ru      s    .r   ru   c                       e Zd ZdZd Zy)ReLUSquaredActivationz`
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
    c                 n    t         j                  j                  |      }t        j                  |      }|S r-   )r   r   relur&   square)r   r   relu_appliedsquareds       r   r/   zReLUSquaredActivation.forward   s)    }}))%0,,|,r   Nrz   r>   r   r   r|   r|      s    r   r|   c                        e Zd Z fdZ xZS )ClassInstantierc                 d    t         |   |      }t        |t              r|n|i f\  }} |di |S )Nr>   )r   __getitem__
isinstancetuple)r   keycontentclskwargsr   s        r   r   zClassInstantier.__getitem__   s6    '%c*!+GU!;g'2V}V}r   )r2   r3   r4   r   r7   r8   s   @r   r   r      s     r   r   c                   t     e Zd ZdZddddej
                  df fd	Zdedefd	Zdedefd
Z	dedefdZ
 xZS )XIELUActivationz
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    g?r!   gưFc           
      Z   t         |           t        j                  t	        j
                  t	        j                  t	        j                  ||                  j                  d            | _	        t        j                  t	        j
                  t	        j                  t	        j                  ||z
  |                  j                  d            | _
        | j                  dt	        j                  ||             | j                  dt	        j                  ||             || _        t        | j                  j                         j!                         j                         j#                               | _        t        | j&                  j                         j!                         j                         j#                               | _        d | _        	 dd l}t        j.                  j0                  j3                         | _        d}	 ddlm}	  |	| j8                        | _        |dz  }t>        jA                  |       y # t<        $ r$}
|d|
 d	z  }| j8                  | _        Y d }
~
>d }
~
ww xY w# t<        $ r)}
t>        jA                  d
tC        |
             Y d }
~
y d }
~
ww xY w)N)dtyper   betaepszUsing experimental xIELU CUDA.)allow_in_graphz& Enabled torch._dynamo for xIELU CUDA.z+ Could not enable torch._dynamo for xIELU (z*) - this may result in slower performance.u   CUDA-fused xIELU not available (%s) – falling back to a Python version.
For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`)"r   r   r   	Parameterr&   logexpm1tensor	unsqueezealpha_palpha_nregister_bufferwith_vector_loadsr`   r   detachcpuitem_beta_scalarr   _eps_scalar_xielu_cuda_obj	xielu.opsclassesxieluXIELUtorch._dynamor   _xielu_cuda_xielu_cuda_fn	Exceptionloggerwarning_oncestr)r   alpha_p_initalpha_n_initr   r   r   r   r   msgr   errr   s              r   r   zXIELUActivation.__init__   s    	||EIIekk%,,|[`:a.b$c$m$mno$pq||IIekk%,,|d/B%"PQR\\]^_
 	VU\\$e%DEUELLE$BC!2!$))"2"2"4"8"8":"@"@"B"G"G"IJ !2!6!6!8!>!>!@!E!E!GH#	#(==#6#6#<#<#>D 2C78&4T5E5E&F#?? $  7DSEIstt&*&6&6##7  	jC 	sB   3I8 "I 2I8 	I5I0+I8 0I55I8 8	J*J%%J*r]   r   c           
         t         j                  j                  | j                        }| j                  t         j                  j                  | j
                        z   }t        j                  |dkD  ||z  |z  | j                  |z  z   t        j                  t        j                  || j                              |z
  |z  | j                  |z  z         S )Nr   )r   r   rn   r   r   r   r&   wherer   rX   r   )r   r]   r   r   s       r   _xielu_pythonzXIELUActivation._xielu_python  s    --((6))bmm44T\\BB{{EaK!Odii!m+[[1dhh/014?$))a-O
 	
r   c                 ~   |j                   }|j                         dk  r%|j                  d      }|j                         dk  r%|j                         dkD  r"|j                  dd|j	                  d            }||j                   k7  r!t
        j                  d||j                          | j                  j                  || j                  j                  |j                        | j                  j                  |j                        | j                  | j                  | j                        }|j                  |      S )zDFirewall function to prevent torch.compile from seeing .item() callsrh   r   r   z_Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).)shapedimr   viewsizer   r   r   r/   r   tor   r   r   r   r   )r   r]   original_shaperesults       r   r   zXIELUActivation._xielu_cuda  s    eegkAA eegk557Q;r1affRj)AQWW$q
 %%--LLOOAGG$LLOOAGG$""
 {{>**r   r   c                     | j                   <|j                  r0t               s| j                  |      S t        j                  d       | j                  |      S )Nz:torch._dynamo is compiling, using Python version of xIELU.)r   is_cudar	   r   r   r   r   r+   s     r   r/   zXIELUActivation.forward1  sK    ++-**511##$`a!!%((r   )r2   r3   r4   r5   r&   bfloat16r   r   r   r   r/   r7   r8   s   @r   r   r      s_     nn)V
v 
& 
+V + +2)V ) )r   r   r   gelu_10i
   )rX   rY   	gelu_fastgelu_newgelu_pythonrB   Tgelu_pytorch_tanhgelu_python_tanhr   gelu_accuratelaplace
leaky_relulinearrl   
quick_gelur~   relu2relu6rU   )rL   swishr   prelur   c           	      |    | t         v r	t         |    S t        d|  dt        t         j                                      )Nz	function z not found in ACT2FN mapping )ACT2FNKeyErrorlistkeys)activation_strings    r   get_activationr   U  sB    F"'((#4"55RSWX^XcXcXeSfRghiir   rL   )1r   r'   collectionsr   r&   r   r   integrations.hub_kernelsr   utilsr   utils.import_utilsr	   
get_loggerr2   r   Moduler   r;   rA   rJ   rO   rS   rW   rb   rj   rr   ru   r|   r   r   	LeakyReLUReLUReLU6SigmoidrH   TanhPReLUACT2CLSr   r   r   r   r   r   r   rL   rl   
linear_actr>   r   r   <module>r      s     #   A  8 
		H	% Z(ryy  ). Y'w		 w (w V$RYY  %, V$
)RYY 
) %
) Z(i i )i [)4")) 4 *47BII 72tRYY t RYY "ryy 
.		 
.BII k [)bii [)|
N%s2'>? # !	
 N%6$=>  $:D#AB +   ",,  N % BGG "  RXX!" rzz#$ WWGGXX-0 
	!j ]+*%f;'	L)
ffH%
r   