o
    
shE                     @   s   d dl mZmZmZ e rddlZddlmZ e r ddlmZ ddlZe	e
Zg dZdd Zd	d
 ZejdddejdedejfddZG dd dejZdd Zdd Zdd Zdd Zdd Z					d#dd Z				d$d!d"ZdS )%   )is_accelerate_availableis_torch_availablelogging    N)nn)init_empty_weights)g        g      ?g      ?g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      c                 C   s.   |j jj}|| tjtjdd\} }| |fS )N   )axis)numerics_detailsmxfpdowncast_to_mxfp_torchtotorchbfloat16uint8)wtriton_kernels_hubr   w_scale r   ]/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/transformers/integrations/mxfp4.pyquantize_to_mxfp43   s   
r   c           
      C   sn   |j j|j j|j j}}}|jj}|jjj}|jdd\}}	||| |d|fi |	} ||||}| |fS )zE
    Changes the layout of the tensors depending on the hardware
    r   )mx_axisdtype)tensorFP4convert_layoutwrap_torch_tensortensor_detailslayoutStridedLayout"make_default_matmul_mxfp4_w_layout)
r   r   r   r   r   r   r   r    value_layoutvalue_layout_optsr   r   r   swizzle_mxfp49   s   

r$   i   )r   rows_per_chunkr   r%   returnc                C   s  ddl }| jstj r|  } | }|tjd }| jdd |jks6J d| jdd d|jtjt	|| j
d}| j^ }}}||| }	| |	|} ||	d}tj|	|d	 || j
d}
td|	|D ]R}t|| |	}| || }||| }|d
@ tj}|d? tj}|
|| }|| |ddddd	f< || |ddddd	f< tj|||d ~~~~~qk|
jg |||d	 R  jg ||| d	 R  }
~ ~~|
dd	 S )zw
    Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
    pass of GPT_OSS.
    r   N   zblocks.shape[:-1]=z does not match scales.shape=)r   devicer   r         )out)mathis_cudar   cudais_availabler   int32shaper   
FP4_VALUESr)   prodreshapeemptyrangeminlongldexpview	transpose
contiguous)blocksscalesr   r%   r-   lutprefix_shapeGB
rows_totalr,   r0r1blkexpidx_loidx_hisubr   r   r   convert_moe_packed_tensorsM   s4   44rL   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Mxfp4GptOssExpertsc                    sR  t    |j| _|j| _|j| _tjtj	| jd| j | jd dtj
ddd| _tjtj	| jd| j | jd tj
ddd| _tjtj	| jd| j tjddd| _tjtj	| j| j| jd dftj
ddd| _tjtj	| j| j| jd tj
ddd| _tjtj	| j| jtjddd| _d| _t|dd	| _d | _d | _t|dd	| _d S )
Nr          r   Frequires_gradgZd;?swiglu_limitg      @)super__init__num_local_expertsnum_expertsintermediate_sizehidden_sizer   	Parameterr   zerosr   gate_up_proj_blocksgate_up_proj_scalesfloat32gate_up_proj_biasdown_proj_blocksdown_proj_scalesdown_proj_biasalphagetattrlimitgate_up_proj_precision_configdown_proj_precision_config)selfconfig	__class__r   r   rT      s>   
"  zMxfp4GptOssExperts.__init__hidden_statesr&   c                 C   s   t jjt jjt jj}}}t jj}tj|j= ||d|d| j	| j
fd}	||| j| jtj||| jd |	d}
||
| j| jtj||| j|jd}W d    |S 1 sYw   Y  |S )Nswiglu)rb   rd   r   )gather_indxprecision_configgammasfused_activation)scatter_indxrn   ro   )r   
matmul_ogsFnSpecsFusedActivationrl   	swiglu_fnr   r/   r)   rb   rd   gate_up_projr^   r   r]   re   	down_projra   rf   	gate_scal)rg   rk   routing_data
gather_idxscatter_idxrs   rt   rr   ru   actintermediate_cache1intermediate_cache3r   r   r   forward   s<   

zMxfp4GptOssExperts.forward)__name__
__module____qualname__rT   r   Tensorr   __classcell__r   r   ri   r   rM      s    $rM   c                 C   s  dd l }tjjtjjtjjtjjf\}}}}tj	| j	 tj
 }t|jdd}d}	| jd }
| jd }|| }|| }|d | }|
| }dd }|| |\}}tj|dd}tj|dd\}}t|d|}|d}tj|||d d	|| }|dtj}d
}t||k ||}tj|ddtj}t|tj}t||k ||	}t||k||	}t||	k|	|}|| }t|| |	k|	|}|| | d}|| | d}||||}|}W d    n1 sw   Y  ||||||||fS )Nr   
LOCAL_RANK0r(   r   c                 S   sF   t j|  dddd d d |f }| }t j| |dd}|| fS )Nr   T)dimstabler   )r   argsortr9   take_along_dimint)valsktk_indxtk_valr   r   r   topk   s   "z routing_torch_dist.<locals>.topkr   )binsmaxi  T)r   )src_indxdst_indx)osr   routing
GatherIndxRoutingDataScatterIndxcompute_expt_data_torchr   r/   r)   distributedget_world_sizer   environgetr2   softmaxsortgatherr5   histcr;   r   r1   wherer   )logitsn_expts_actr   r   r   r   r   
world_sizerankreplace_valuen_tokensn_expts_totn_local_expertslocal_expert_startlocal_expert_endn_gates_padr   	expt_scal	expt_indxsort_indiceshistvar	topk_indx	gate_indxrx   rm   rq   	expt_datahit_expertsr   r   r   routing_torch_dist   sN   



4r   c           
      C   s   dd l m} | r| rt| drt}ntjj}|jd }|	d| j
j}tj|| j
j| j
j}tj|j ||| j
j\}}}W d    n1 sOw   Y  | ||||}	|		|d| j
j}	|	|fS )Nr   
_is_hookedr(   )torch.distributedr   r0   is_initializedhasattrr   r   r   r2   r5   router
hidden_dimr   
functionallinearweightbiasr   r/   r)   top_kexperts)
rg   rk   distr   
batch_sizerouter_logitsry   rz   r{   
routed_outr   r   r   mlp_forward  s   
r   c                    s(   d |  t fdd|D sdS dS )N.c                 3   s0    | ]}t | d  pt |  V  qdS )z\.N)rematch).0keycurrent_key_name_strr   r   	<genexpr>&  s     
z(should_convert_module.<locals>.<genexpr>TF)joinany)current_key_namepatternsr   r   r   should_convert_module$  s   
r   c                 K   s   ddl m} |d}|d}|d}	|d}
|d}|d}d	D ]g}||v r|d ur=||||||	|
||d
d	}| d}| d}t| |ddd | t| |rt| |rtt| |t| |}|dkrvtj	
 rvtj	  t| |tj|| t| | t| | q&d S )Nr   shard_and_distribute_modulemodelempty_paramcasting_dtypeto_contiguousr   device_mesh)rv   rw   F)	set_param_blocks_scalesr   r   cpu)integrations.tensor_parallelr   r   setattrrsplitr   rL   rc   r   r/   r0   empty_cacher   rY   r   delattr)module
param_nameparam_valuetarget_devicedq_param_namekwargsr   r   r   r   r   r   r   projblocks_attrscales_attrdequantizedr   r   r   
dequantize-  sB   










r   c              	   K   sp  |j j|j j|j j}}}ddlm}	 |d}
|d}|d}|d}|d}|d}d	|v rB|d
d dd }d|v rR|d
d dd }|durb|	|
||||||| nt| |	d
dd t
jj|dd | d}| d}t| |}t| |}|jjdkr4|jjdkr6|d}|dkr||| jd d}n
||d| jd }t|d|dkrd}|| }|| }t
j| t|dd|dd|\}}W d   n1 sw   Y  |dkrt
|| j| jd g|_nt
|| j| jg|_t| || t| | d|||| dd t| | t| | ~dS dS dS )zq
    This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`.
    r   r   r   r   r   r   r   r   r>   r   r(   r   r   r?   r   Nr   FrP   metarv   typer   r/   _precision_config)rhs_data)weight_scaleflex_ctx)rr   PrecisionConfigFlexCtx
InFlexDatar   r   r   splitr   r   r   r   rY   rc   r)   r   sizer5   rW   r   r=   r/   r$   r<   SizerX   r2   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r>   r?   local_expertstriton_weight_tensorr   r   r   r   load_and_swizzle_mxfp4Q  sf   






$








r  Fc           
   	   C   s   |d u rg }|   D ]i\}}|| t||s|d q
|jjdkrC|jsCt  t|| j	|< d}W d    n1 s>w   Y  |jjdkrX|jsXddl
m} |t||_tt| dkrnt||||||d\}	}|d q
| |fS )Nr(   GptOssExpertsT	GptOssMLPr   )
MethodType)has_been_replacedrh   )named_childrenappendr   poprj   r   r   r   rM   _modulestypesr  r   r   lenlistchildren_replace_with_mxfp4_linear)
r   modules_to_not_convertr   quantization_configr  rh   namer   r  _r   r   r   r    s4   



r  c                 C   sz   |j r| S ddlm} |da|d u rdgn|}|jd ur#||j tt|}t| ||||d\} }|s;t	
d | S )Nr   )
get_kernelz kernels-community/triton_kernelslm_head)rh   zYou are loading your model using mixed-precision FP4 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r   kernelsr  r   r  extendr  setr  loggerwarning)r   r  r   r  rh   r  r  r   r   r   replace_with_mxfp4_linear  s(   

r  )NNNFN)NNNN)utilsr   r   r   r   r   
accelerater   r   
get_loggerr   r  r3   r   r$   r   r   r   r   rL   ModulerM   r   r   r   r   r  r  r  r   r   r   r   <module>   sJ   

6ID	$E
'