from typing import TYPE_CHECKING, Any, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_kernels_available, is_torch_available, is_triton_available, logging
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    FP4 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config
        self.triton_kernels_hub = None

    def _lazy_import_kernels(self):
        """Lazy import and initialize kernels only when needed"""
        if self.triton_kernels_hub is None:
            try:
                from kernels import get_kernel

                self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
            except ImportError:
                raise ImportError("kernels package is required for MXFP4 quantization")
        return self.triton_kernels_hub

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch"
                "Please install the latest version of torch ( pip install --upgrade torch )"
            )
        if self.quantization_config.dequantize:
            return
        if not torch.cuda.is_available():
            if self.pre_quantized:
                logger.warning_once(
                    "Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")
        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        compute_capability = torch.cuda.get_device_capability()
        gpu_is_supported = compute_capability >= (7, 5)
        kernels_available = is_triton_available("3.4.0") and is_kernels_available()
        if self.pre_quantized:
            # Pre-quantized checkpoints fall back to bf16 dequantization on unsupported setups.
            if not gpu_is_supported:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, "
                    "H100, or B200). We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_available:
                logger.warning_once(
                    "MXFP4 quantization requires triton >= 3.4.0 and kernels installed, we will default to "
                    "dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
        else:
            # Quantizing on the fly has hard requirements, so raise instead of falling back.
            if not gpu_is_supported:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, "
                    "H100, or B200)"
                )
            if not kernels_available:
                raise ValueError("MXFP4 quantization requires triton >= 3.4.0 and kernels installed")
        if not self.pre_quantized:
            self._lazy_import_kernels()

        device_map = kwargs.get("device_map")
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA device available, make sure to set your model "
                "on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. "
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device."
                    "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                    "or remove the CPU or disk device from the device_map."
                )

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype is None:
            dtype = torch.bfloat16
            logger.info(
                "Overriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable model "
                "loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass "
                "dtype=torch.bfloat16 to remove this warning.",
                dtype,
            )
        return dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ):
        from ..integrations import Mxfp4GptOssExperts
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if self.quantization_config.dequantize and ("blocks" in param_name or "scales" in param_name):
            # "_blocks" and "_scales" have the same length, so one slice handles both suffixes.
            module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import (
            Mxfp4GptOssExperts,
            dequantize,
            load_and_swizzle_mxfp4,
            quantize_to_mxfp4,
            swizzle_mxfp4,
        )
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if not self.pre_quantized:
            # On-the-fly quantization: quantize the bf16 expert weight, swizzle it for the triton
            # kernels, and attach the matching precision config to the experts module.
            triton_kernels_hub = self._lazy_import_kernels()
            module, _ = get_module_from_name(model, param_name)
            with torch.device(target_device):
                if isinstance(module, Mxfp4GptOssExperts):
                    triton_weight_tensor, weight_scale = quantize_to_mxfp4(param_value, triton_kernels_hub)
                    PrecisionConfig, FlexCtx, InFlexData = (
                        triton_kernels_hub.matmul_ogs.PrecisionConfig,
                        triton_kernels_hub.matmul_ogs.FlexCtx,
                        triton_kernels_hub.matmul_ogs.InFlexData,
                    )
                    triton_weight_tensor, weight_scale = swizzle_mxfp4(
                        triton_weight_tensor, weight_scale, triton_kernels_hub
                    )
                    proj = "gate_up_proj" if "gate_up_proj" in param_name else "down_proj"
                    setattr(module, proj, triton_weight_tensor)
                    setattr(
                        module,
                        f"{proj}_precision_config",
                        PrecisionConfig(weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())),
                    )
                    delattr(module, f"{proj}_blocks")
                    delattr(module, f"{proj}_scales")
            return

        empty_param = kwargs.get("empty_param")
        casting_dtype = kwargs.get("casting_dtype")
        to_contiguous = kwargs.get("to_contiguous")
        rank = kwargs.get("rank")
        device_mesh = kwargs.get("device_mesh")
        if ("blocks" in param_name or "scales" in param_name) and self.quantization_config.dequantize:
            module, _ = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, _ = get_module_from_name(model, param_name)
        shard_kwargs = {
            "empty_param": empty_param,
            "casting_dtype": casting_dtype,
            "to_contiguous": to_contiguous,
            "rank": rank,
            "device_mesh": device_mesh,
            "model": model,
        }
        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            if self.quantization_config.dequantize:
                dq_param_name = param_name[: -len("_blocks")]
                dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
                return
            load_and_swizzle_mxfp4(
                module, param_name, param_value, target_device, self._lazy_import_kernels(), **shard_kwargs
            )

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        if self.quantization_config.dequantize:
            self.remove_quantization_config(model)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def update_expected_keys(self, model, expected_keys: list[str], checkpoint_keys: list[str]) -> list[str]:
        new_expected_keys = []
        for key in expected_keys:
            if key.endswith(".mlp.experts.gate_up_proj"):
                base = key[: -len("gate_up_proj")]
                new_expected_keys.append(base + "gate_up_proj_blocks")
                new_expected_keys.append(base + "gate_up_proj_scales")
            elif key.endswith(".mlp.experts.down_proj"):
                base = key[: -len("down_proj")]
                new_expected_keys.append(base + "down_proj_blocks")
                new_expected_keys.append(base + "down_proj_scales")
            elif not self.pre_quantized:
                # When quantizing on the fly, the checkpoint only holds plain bf16 expert weights.
                if key.endswith(".mlp.experts.down_proj_blocks"):
                    base = key[: -len("_blocks")]
                    new_expected_keys.append(base)
                elif key.endswith(".mlp.experts.gate_up_proj_blocks"):
                    base = key[: -len("_blocks")]
                    new_expected_keys.append(base)
                elif key.endswith("_scales"):
                    continue
                else:
                    new_expected_keys.append(key)
            else:
                new_expected_keys.append(key)
        return new_expected_keys

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_mxfp4_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )
        use_kernels = kwargs.get("use_kernels", False)
        if use_kernels:
            logger.warning_once(
                "You are using full precision kernels, we will dequantize the model to bf16. To use the quantized "
                "model with quantization kernels, please set use_kernels=False"
            )
            self.quantization_config.dequantize = True
        config = model.config
        model = replace_with_mxfp4_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            config=config,
        )
        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import Mxfp4GptOssExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_tp_plan", None) is not None:
            config.base_model_tp_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def update_param_name(self, param_name: str) -> str:
        if self.quantization_config.dequantize:
            if "_blocks" in param_name:
                return param_name.replace("_blocks", "")
            if "_scales" in param_name:
                return param_name.replace("_scales", "")
        elif not self.pre_quantized:
            if param_name.endswith("gate_up_proj"):
                return param_name.replace("gate_up_proj", "gate_up_proj_blocks")
            if param_name.endswith("down_proj"):
                return param_name.replace("down_proj", "down_proj_blocks")
        return param_name

    def get_state_dict(self, model):
        from ..integrations import Mxfp4GptOssExperts

        state_dict = model.state_dict()
        for name, module in model.named_modules():
            if (
                isinstance(module, Mxfp4GptOssExperts)
                and hasattr(module, "gate_up_proj")
                and hasattr(module, "down_proj")
            ):
                # Unswizzle the triton-kernels storage back into the serialized blocks/scales layout
                # (32 fp4 values per block -> 90 blocks of 16 packed bytes per 2880-wide row).
                state_dict[name + ".gate_up_proj_blocks"] = (
                    module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(32, -1, 90, 16)
                )
                state_dict[name + ".gate_up_proj_scales"] = (
                    module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.gate_up_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
                state_dict[name + ".down_proj_blocks"] = (
                    module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(32, 2880, -1, 16)
                )
                state_dict[name + ".down_proj_scales"] = (
                    module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.down_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
        return state_dict

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        logger.warning_once(
            "MXFP4 quantization don't support training, please consider dequantizing the model first by passing "
            "quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()"
        )
        return False
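
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of this module): how this quantizer is
# normally reached through `from_pretrained`. The checkpoint id below is only
# an example of an MXFP4-quantized gpt-oss checkpoint; `Mxfp4Config` is the
# config class referenced in the `is_trainable` warning above.
#
#     from transformers import AutoModelForCausalLM, Mxfp4Config
#
#     # Pre-quantized checkpoint: the MXFP4 path is used automatically when a
#     # supported GPU, triton >= 3.4.0 and the `kernels` package are available;
#     # otherwise `validate_environment` flips `dequantize` and bf16 weights
#     # are materialized instead.
#     model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", device_map="cuda")
#
#     # Explicit fallback: dequantize the MXFP4 weights to bf16 at load time,
#     # e.g. before fine-tuning (see the `is_trainable` warning).
#     model = AutoModelForCausalLM.from_pretrained(
#         "openai/gpt-oss-20b",
#         quantization_config=Mxfp4Config(dequantize=True),
#         device_map="cuda",
#     )
# ---------------------------------------------------------------------------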
