from typing import Optional

from ..utils import is_accelerate_available, is_torch_accelerator_available, is_torch_available, logging


if is_torch_available():
    import torch
    import torch.nn as nn
    import triton
    import triton.language as tl
    from torch.nn import functional as F

if is_accelerate_available():
    from accelerate import init_empty_weights

logger = logging.get_logger(__name__)


@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """Quantize one group of `BLOCK_SIZE` contiguous values to fp8, writing the
    quantized values to `y_ptr` and the per-group scale to `s_ptr`."""
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    # 448.0 is the largest finite value of float8_e4m3fn, so dividing by `s`
    # maps the group's maximum magnitude onto the edge of the fp8 range.
    s = tl.max(tl.abs(x)) / 448.0
    y = x / s
    y = y.to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)


def act_quant(x: torch.Tensor, block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize `x` to fp8 in groups of `block_size` along the last dimension,
    returning the fp8 tensor and one float32 scale per group."""
    assert x.is_contiguous()
    assert x.shape[-1] % block_size == 0
    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    s = x.new_empty(*x.size()[:-1], x.shape[-1] // block_size, dtype=torch.float32)

    def grid(meta):
        return (triton.cdiv(x.numel(), meta["BLOCK_SIZE"]),)

    act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
    return y, s

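
# Illustrative usage sketch (not part of the original module): quantize a small
# bf16 activation; the shapes and CUDA device below are assumptions for the demo.
#
#   x = torch.randn(4, 256, dtype=torch.bfloat16, device="cuda")
#   y, s = act_quant(x)  # y: (4, 256) float8_e4m3fn, s: (4, 2) float32
#   x_approx = y.to(torch.float32) * s.repeat_interleave(128, dim=-1)
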
 |#dddf |   }$||#dddf | |"dddf |   }%||!|  }&|"| }'||'|  }(t j||ft jd})tdt ||D ]h}*t j|$|#dddf ||*|  k dd}+t j|%|#dddf ||*|  k dd},|*| }-|-|	 }.t |&|.|  }/t |(|.|  }0|)t 	|+|,|/dddf  |0dddf  7 })|$|| 7 }$|%|| 7 }%q|j
jt jkr|)t j}1n|j
jt jkr%|)t j}1n|)t j}1|| t d| }2| | t d| }3|||2dddf   ||3dddf   }4|2dddf |k |3dddf |k @ }5t j|4|1|5d dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
    r   r   Nr'   g        )maskother)r:   )r   r   r)   minr   zerosr   ranger   dotr   r   bfloat16r   float16r   )6ABCAsBsMNKgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nr6   r7   r8   r9   r   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_maskr    r    r!   _w8a8_block_fp8_matmul>   sL   %,,((0,(rs   rB   rC   rE   rF   output_dtypec                    s  t |dksJ |d |d }}| jd |jd ksJ | jdd |jdd kr/|  s1J t| jd ||jd ksAJ |  | jd   |jdkrX| rX|jdksZJ |j\}t||jd kslJ t|||jd ksyJ | jdd f }	| j|	|d}
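
# Worked example of the grouped program order above (illustrative, made-up
# sizes): with M=256, N=512, BLOCK_SIZE_M=BLOCK_SIZE_N=128 and GROUP_SIZE_M=8,
# num_pid_m=2 and num_pid_n=4, so group_size_m=min(2, 8)=2 and
#   pid 0 -> (pid_m=0, pid_n=0), pid 1 -> (1, 0), pid 2 -> (0, 1), pid 3 -> (1, 1), ...
# Consecutive programs walk down a column of output tiles, so they reuse the
# same tile of `B` from L2 cache instead of re-reading it from memory.
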
d} |k rt }t	|d}|}|| dksJ |} fd	d
}t
| | ||
|| |||| d| d|d|d|
d|
d|d|d|d|d|||dd |
S )a  This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dtype: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    """
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]

    assert A.shape[-1] == B.shape[-1]
    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
    M = A.numel() // A.shape[-1]

    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    N, K = B.shape
    assert triton.cdiv(N, block_n) == Bs.shape[0]
    assert triton.cdiv(K, block_k) == Bs.shape[1]

    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)

    # Tile sizes: cap BLOCK_SIZE_M by M (next power of two, at least 16) and tie
    # BLOCK_SIZE_N/K to the quantization blocks so each K-step needs one scale pair.
    BLOCK_SIZE_M = 128
    if M < BLOCK_SIZE_M:
        BLOCK_SIZE_M = triton.next_power_of_2(M)
        BLOCK_SIZE_M = max(BLOCK_SIZE_M, 16)
    BLOCK_SIZE_K = block_k
    assert block_k % BLOCK_SIZE_K == 0
    BLOCK_SIZE_N = block_n

    def grid(META):
        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)

    _w8a8_block_fp8_matmul[grid](
        A, B, C, As, Bs,
        M, N, K,
        block_n, block_k,
        A.stride(-2), A.stride(-1),
        B.stride(1), B.stride(0),
        C.stride(-2), C.stride(-1),
        As.stride(-2), As.stride(-1),
        Bs.stride(1), Bs.stride(0),
        BLOCK_SIZE_M=BLOCK_SIZE_M,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        BLOCK_SIZE_K=BLOCK_SIZE_K,
        GROUP_SIZE_M=8,
    )

    return C


@torch.compile(dynamic=True)
def w8a8_block_fp8_matmul_compile(
    input_q: torch.Tensor,
    weight_q: torch.Tensor,
    input_scale: torch.Tensor,
    weight_scale: torch.Tensor,
    block_size: Optional[tuple[int, int]] = None,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
    """
    batch_size, seq_len, hidden_dim = input_q.shape if input_q.ndim == 3 else (1, input_q.shape[0], input_q.shape[1])
    out_features = weight_q.shape[0]

    # Flatten the token dimensions so weight blocks can be sliced uniformly.
    input_reshaped = input_q.view(-1, hidden_dim)
    input_scale_reshaped = input_scale.view(input_scale.shape[0], -1)
    num_weight_blocks_m = out_features // block_size[0]
    num_weight_blocks_n = hidden_dim // block_size[1]

    output = torch.zeros((batch_size * seq_len, out_features), dtype=torch.float32, device=input_q.device)

    for i in range(num_weight_blocks_m):
        m_start = i * block_size[0]
        m_end = m_start + block_size[0]

        for j in range(num_weight_blocks_n):
            n_start = j * block_size[1]
            n_end = n_start + block_size[1]

            # Slice out the current blocks and their scales.
            input_block = input_reshaped[:, n_start:n_end]
            weight_block = weight_q[m_start:m_end, n_start:n_end]
            curr_input_scale = input_scale_reshaped[:, j : j + 1]
            curr_weight_scale = weight_scale[i, j]

            # `torch._scaled_mm` expects float32 scale tensors; pass unit scales
            # and apply the per-token and per-block scales afterwards.
            block_result = (
                torch._scaled_mm(
                    input_block,
                    weight_block.t(),
                    scale_a=torch.tensor(1.0, dtype=torch.float32, device=input_q.device),
                    scale_b=torch.tensor(1.0, dtype=torch.float32, device=input_q.device),
                    out_dtype=torch.float32,
                )
                * curr_input_scale
                * curr_weight_scale
            )
            output[:, m_start:m_end] += block_result

    output = output.view(batch_size, seq_len, out_features)
    return output.to(output_dtype)


class FP8Linear(nn.Linear):
    dtype = torch.float8_e4m3fn

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype=None,
        block_size: Optional[tuple[int, int]] = None,
        device=None,
        activation_scheme="dynamic",
    ):
        super().__init__(in_features, out_features)
        self.in_features = in_features
        self.out_features = out_features

        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=FP8Linear.dtype, device=device))

        if self.weight.element_size() == 1:
            # fp8 weights: one inverse scale per (block_size[0] x block_size[1]) weight block.
            scale_out_features = (out_features + block_size[0] - 1) // block_size[0]
            scale_in_features = (in_features + block_size[1] - 1) // block_size[1]
            self.weight_scale_inv = nn.Parameter(
                torch.empty(scale_out_features, scale_in_features, dtype=torch.float32, device=device)
            )
        else:
            self.register_parameter("weight_scale_inv", None)

        self.block_size = block_size
        self.activation_scheme = activation_scheme

        if bias:
            self.bias = nn.Parameter(torch.empty(self.out_features))
        else:
            self.register_parameter("bias", None)

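    # Shape example (illustrative): with in_features=4096, out_features=1024 and
    # block_size=(128, 128), `weight` is a (1024, 4096) fp8 tensor and
    # `weight_scale_inv` is (8, 32) float32, one scale per 128x128 weight block.
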
zFP8Linear.__init__inputr%   c              	   C   s   | j  dkrt|| j | jS t rtj j	nd}t
t|tj}||j  t|| jd \}}t|| j || j| j|jd}W d    n1 sKw   Y  |  | jd ur^|| j }|j|jdS )Nru   cuda)rt   r'   )r   r   Flinearr   r   r0   acceleratorcurrent_acceleratortypegetattrr   r   r5   r$   r   r   r   synchronizer   )r   r   device_typetorch_accelerator_moduleqinputscaler   r    r    r!   forwardK  s&   

zFP8Linear.forward)FNNNr   )__name__
__module____qualname__r0   r2   r   intboolr   tupler   Tensorr   __classcell__r    r    r   r!   r   &  s"    "r   Fc           	         s   |du rg }|   D ]p\}}|| t|tjr_||pg vr_d| t fdd|p-g D s_t # t|j	|j
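
# Illustrative usage sketch (not part of the original module): drive one fp8
# layer end to end; the sizes, CUDA device, and random scales are assumptions.
#
#   layer = FP8Linear(4096, 1024, bias=False, block_size=(128, 128), device="cuda")
#   layer.weight.data = torch.randn(1024, 4096, device="cuda").to(torch.float8_e4m3fn)
#   layer.weight_scale_inv.data = torch.rand(8, 32, device="cuda")
#   out = layer(torch.randn(2, 16, 4096, dtype=torch.bfloat16, device="cuda"))
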
|jdu|jj|jj|j|jd| j|< d}W d   n1 sZw   Y  tt| dkrut||||||d\}}|d	 q
| |fS )
z%Replace Linear layers with FP8Linear.N.c                 3   s    | ]}| v V  qd S )Nr    ).0keycurrent_key_name_strr    r!   	<genexpr>u  s    z+_replace_with_fp8_linear.<locals>.<genexpr>)r   r   r   r   r   r   r$   Tr   )has_been_replacedr&   )named_childrenappend
isinstancer   Linearjoinanyr	   r   r   r   r   r   r   r   r   weight_block_size_modulesr{   listchildren_replace_with_fp8_linearpop)	modeltp_planmodules_to_not_convertcurrent_key_namequantization_configr   namemodule_r    r   r!   r   d  s<   	

	
	r   c                 C   s\   |du rdgn|}|j dur||j  tt|}t| | j||d\} }|s,td | S )z:Helper function to replace model layers with FP8 versions.Nlm_head)r   r   r   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.)r   extendr   setr   _tp_planloggerwarning)r   r   r   r   r    r    r!   replace_with_fp8_linear  s   

r   )r#   )NNNNF)NN)'typingr   utilsr   r   r   r   r0   torch.nnr   r(   triton.languagelanguager   r   r   