
from typing import TYPE_CHECKING, Any, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_kernels_available,
    is_torch_available,
    is_triton_available,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    FP4 (MXFP4) quantization, relying on triton kernels fetched from the
    `kernels-community/triton_kernels` hub repository.
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["accelerate"]
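    # Note on naming (grounded in the hooks below): MXFP4 checkpoints such as gpt-oss
    # store each expert weight as a pair of tensors, a packed uint8 `*_blocks` tensor
    # (two FP4 values per byte) and a `*_scales` tensor (one power-of-two scale per
    # block), e.g. "model.layers.0.mlp.experts.gate_up_proj_blocks" together with
    # "model.layers.0.mlp.experts.gate_up_proj_scales". Several methods below
    # translate between these checkpoint names and the plain `gate_up_proj` /
    # `down_proj` parameters exposed by the dequantized module.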

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. "
                "Please install the latest version of torch ( pip install --upgrade torch )"
            )

        if self.quantization_config.dequantize:
            return

        if not torch.cuda.is_available():
            if self.pre_quantized:
                logger.warning_once(
                    "Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        compute_capability = torch.cuda.get_device_capability()
        gpu_is_supported = compute_capability >= (7, 5)
        kernels_available = is_triton_available("3.4.0") and is_kernels_available()

        if self.pre_quantized:
            # For an already-quantized checkpoint we fall back to dequantizing instead of failing the load
            if not gpu_is_supported:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g. T4, A100, L4, H100, or B200). We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_available:
                logger.warning_once(
                    "MXFP4 quantization requires triton >= 3.4.0 and kernels installed, "
                    "we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
        else:
            # Quantizing on the fly has hard requirements, so raise instead of falling back
            if not gpu_is_supported:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g. T4, A100, L4, H100, or B200)"
                )
            if not kernels_available:
                raise ValueError("MXFP4 quantization requires triton >= 3.4.0 and triton_kernels installed")

        if not self.quantization_config.dequantize:
            from kernels import get_kernel

            global triton_kernels_hub
            triton_kernels_hub = get_kernel("kernels-community/triton_kernels")

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, "
                "pass device_map = 'cuda'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                    "or remove the CPU or disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            torch_dtype = torch.bfloat16
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to requirements of the MXFP4 "
                "kernels to enable model loading in fp4. Pass your own torch_dtype to specify the dtype of the "
                "remaining non-linear layers, or pass torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
        return torch_dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ) -> bool:
        from ..integrations import Mxfp4GptOssExperts
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if self.quantization_config.dequantize and ("blocks" in param_name or "scales" in param_name):
            # "_blocks" and "_scales" have the same length, so one slice strips either suffix
            module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            # biases are kept in high precision and go through the regular loading path
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import Mxfp4GptOssExperts, dequantize, load_and_swizzle_mxfp4, quantize_to_mxfp4
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if not self.pre_quantized:
            # quantize bf16 weights to MXFP4 on the fly with the hub triton kernels
            PrecisionConfig, FlexCtx, InFlexData = (
                triton_kernels_hub.matmul_ogs.PrecisionConfig,
                triton_kernels_hub.matmul_ogs.FlexCtx,
                triton_kernels_hub.matmul_ogs.InFlexData,
            )

            module, _ = get_module_from_name(model, param_name)
            with torch.cuda.device(target_device):
                if isinstance(module, Mxfp4GptOssExperts):
                    if "gate_up_proj" in param_name:
                        # pad the weight so its shape matches what the kernels expect
                        right_pad = module.gate_up_proj_right_pad
                        bottom_pad = module.gate_up_proj_bottom_pad
                        loaded_weight = torch.nn.functional.pad(
                            param_value, (0, right_pad, 0, bottom_pad, 0, 0), mode="constant", value=0
                        )
                        triton_weight_tensor, weight_scale = quantize_to_mxfp4(loaded_weight)
                        module.gate_up_proj_precision_config = PrecisionConfig(
                            weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())
                        )
                        module.gate_up_proj = triton_weight_tensor
                        module.gate_up_proj_blocks = torch.nn.Parameter(
                            triton_weight_tensor.storage.data, requires_grad=False
                        )
                    elif "down_proj" in param_name:
                        right_pad = module.down_proj_right_pad
                        bottom_pad = module.down_proj_bottom_pad
                        loaded_weight = torch.nn.functional.pad(
                            param_value, (0, right_pad, 0, bottom_pad, 0, 0), mode="constant", value=0
                        ).to(target_device)
                        triton_weight_tensor, weight_scale = quantize_to_mxfp4(loaded_weight)
                        module.down_proj_precision_config = PrecisionConfig(
                            weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())
                        )
                        module.down_proj = triton_weight_tensor
                        module.down_proj_blocks = torch.nn.Parameter(
                            triton_weight_tensor.storage.data, requires_grad=False
                        )
        else:
            empty_param = kwargs.get("empty_param", None)
            casting_dtype = kwargs.get("casting_dtype", None)
            to_contiguous = kwargs.get("to_contiguous", None)
            rank = kwargs.get("rank", None)
            device_mesh = kwargs.get("device_mesh", None)

            if ("blocks" in param_name or "scales" in param_name) and self.quantization_config.dequantize:
                module, _ = get_module_from_name(model, param_name[: -len("_blocks")])
            else:
                module, _ = get_module_from_name(model, param_name)

            shard_kwargs = {
                "empty_param": empty_param,
                "casting_dtype": casting_dtype,
                "to_contiguous": to_contiguous,
                "rank": rank,
                "device_mesh": device_mesh,
            }

            if isinstance(module, Mxfp4GptOssExperts) or (
                isinstance(module, GptOssExperts) and self.quantization_config.dequantize
            ):
                if self.quantization_config.dequantize:
                    dq_param_name = param_name[: -len("_blocks")]
                    dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
                else:
                    load_and_swizzle_mxfp4(module, param_name, param_value, target_device, **shard_kwargs)
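    # Worked example of the padding above (illustrative shapes only, not taken from a
    # real checkpoint): torch.nn.functional.pad reads pad widths from the last
    # dimension backwards, so (0, right_pad, 0, bottom_pad, 0, 0) pads the columns on
    # the right, the rows at the bottom, and leaves the leading expert dimension alone:
    #
    #     w = torch.zeros(32, 2880, 5760)
    #     padded = torch.nn.functional.pad(w, (0, 64, 0, 128, 0, 0), mode="constant", value=0)
    #     assert padded.shape == (32, 3008, 5824)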
M(c                     U R                   R                  (       a  U R                  U5        [        R                  R                  5       (       a  [        R                  R                  5         g g r   )r   r'   remove_quantization_configr(   r)   r*   empty_cache)r   rD   r   s      r   #_process_model_after_weight_loading4Mxfp4HfQuantizer._process_model_after_weight_loading  sG    ##..++E2::""$$JJ""$ %r   expected_keyscheckpoint_keysc                 z   / nU H  nUR                  S5      (       a9  US [        S5      *  nUR                  US-   5        UR                  US-   5        MR  UR                  S5      (       a9  US [        S5      *  nUR                  US-   5        UR                  US-   5        M  UR                  U5        M     U$ )	Nz.mlp.experts.gate_up_projr^   rz   gate_up_proj_scalesz.mlp.experts.down_projrf   r   down_proj_scales)endswithrT   append)r   rD   r   r   new_expected_keyskeybases          r   update_expected_keys%Mxfp4HfQuantizer.update_expected_keys	  s     C||7881c.112!((0E)EF!((0E)EF677.c+../!((0B)BC!((0B)BC!((- ! ! r   keep_in_fp32_modulesc                 l   SSK Jn  U R                  XR                  R                  U5      U l        UR                  SS5      nU(       a&  [        R                  S5        SU R                  l        UR                  nU" UU R                  U R                  US9nU R                  UR                  l        g )Nr   )replace_with_mxfp4_linearuse_kernelsFzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   config)
rR   r   get_modules_to_not_convertr   r   r3   r,   r-   r'   r   )r   rD   r   r   r   r   r   s          r   $_process_model_before_weight_loading5Mxfp4HfQuantizer._process_model_before_weight_loading  s     	=&*&E&E++BBDX'
# jj6e 37D$$/)#'#>#> $ 8 8	
 ,0+C+C(r   missing_keysprefixc                 Z   SSK Jn  / nUR                  5        Hr  u  pg[        Xt5      (       d  M  U HU  nXh;   d  Xc SU 3;   d  M  UR	                  S5      (       a  M,  UR	                  S5      (       a  MD  UR                  U5        MW     Mt     U V	s/ sH  oU;  d  M
  U	PM     sn	$ s  sn	f )Nr   rI   .z.weightz.bias)rR   rJ   named_modulesr4   r   r   )
r   rD   r   r   rJ   not_missing_keysnamerU   missingks
             r   update_missing_keys$Mxfp4HfQuantizer.update_missing_keys8  s    5!//1LD&55+GDhay4I,I ' 0 0 ; ; ' 0 0 9 9(//8  , 2 (E<a4D+D<EEEs   B(B(c                     SUR                   R                  ;   a.  [        USS 5      b   UR                  R	                  SSSSS.5        U$ )NGptOssConfigbase_model_tp_plangrouped_gemm)z(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrr   update)r   r   s     r   update_tp_planMxfp4HfQuantizer.update_tp_planG  sR    V--666v3T:F))00DRDRAOAO	 r   c                     U R                   R                  (       a0  SU;   a  UR                  SS5      $ SU;   a  UR                  SS5      $ U$ )NrO    _scales)r   r'   replace)r   rF   s     r   update_param_name"Mxfp4HfQuantizer.update_param_nameT  sM    ##..J&!)))R88j(!)))R88r   c                 .    [         R                  S5        g)Nz@MXFP4 quantization is not serializable using safetensors for nowFr,   r-   )r   safe_serializations     r   is_serializable Mxfp4HfQuantizer.is_serializable\  s    ^_r   c                 .    [         R                  S5        g)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()Fr   )r   s    r   is_trainableMxfp4HfQuantizer.is_trainable`  s     x	
 r   )r   r   )rA   torch.dtyper=   r   r   )rD   r	   )r   
__module____qualname____firstlineno____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r;   rB   strr5   r   rW   r   listr   r   r   r   r   r   r   r   propertyboolr   __static_attributes____classcell__)r   s   @r   r   r   &   su    (,$ %7JX
  $ 	
 cN@ 04Y Y $Y 	Y
 &Y cNY "$s),Yv%!*; !DQTI !hlmphq !& 59D D 'tCy1D>FtCy F# FRVWZR[ FC C  d  r   r   )typingr   r   r   r   r   modeling_utilsr	   utilsr
   r   r   r   r   quantizers_utilsr   r(   
get_loggerr   r,   r    r   r   <module>r      sS    0 /  0  3 			H	%{ r   
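# Usage sketch (hedged, not part of the original module): this quantizer is driven by
# `from_pretrained` rather than instantiated directly. For example, to load an MXFP4
# checkpoint and force the bf16 fallback that `dequantize=True` enables (the model id
# below is illustrative only):
#
#     from transformers import AutoModelForCausalLM, Mxfp4Config
#
#     model = AutoModelForCausalLM.from_pretrained(
#         "openai/gpt-oss-20b",
#         quantization_config=Mxfp4Config(dequantize=True),
#         torch_dtype="auto",
#     )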