
    hK                         d dl mZmZ ddlmZ erddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZ  e
       rd dlZ ej                   e      ZdZ G d	 d
e      Zy)    )TYPE_CHECKINGOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameNc                        e Zd ZdZdZdZdgZ fdZd Zd Z	d#d	Z
d
ddedefdZd
ddddeddfdZd$dZd
ddee   dee   fdZ	 d%d
ddeee      fdZdee   dedee   fdZd Zd ZdedefdZd&defd Zd%d!Zedefd"       Z xZS )'Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    TF
acceleratec                 B    t        |   |fi | || _        d | _        y N)super__init__quantization_configtriton_kernels_hub)selfr   kwargs	__class__s      f/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__1   s&    ,77#6 "&    c                     | j                    	 ddlm}  |d      | _         | j                   S | j                   S # t        $ r t        d      w xY w)z3Lazy import and initialize kernels only when neededr   )
get_kernelz kernels-community/triton_kernelsz2kernels package is required for MXFP4 quantization)r   kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels6   s]    ""*X.*45W*X' &&&t&&&  X!"VWWXs	   9 Ac                 x   t               st        d      | j                  j                  ry t        j
                  j                         s\t        j                  j                         s>| j                  r't        j                  d       d| j                  _        y t        d      t               st        d      t        j                  j                         rd}t        d      xr
 t               }n:t        j
                  j                         }|dk\  }t        d      xr
 t               }| j                  rR|s't        j                  d	       d| j                  _        y |sAt        j                  d
       d| j                  _        y |st!        d      |st!        d      | j                  s| j#                          |j%                  d      }|t        j                  d       y |N| j                  sAt'        |t(              r0d|j+                         v sd|j+                         v rt!        d      y y y y )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z^Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16Tz-Quantizing a model using MXFP4 requires a GPUz9Using mxfp4 requires Accelerate: `pip install accelerate`z3.5.0)      z3.4.0u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) zuMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0
device_mapzYou have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or device_map = 'xpu'. cpudiskzYou are attempting to load an FP4 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   r    r   
dequantizetorchcudais_availablexpupre_quantizedloggerwarning_onceRuntimeErrorr	   r   r
   get_device_capability
ValueErrorr!   get
isinstancedictvalues)r   argsr   gpu_is_supportedkernels_availablecompute_capabilityr%   s          r   validate_environmentz%Mxfp4HfQuantizer.validate_environmentA   s#   !#] 
 ##..

'')UYY-C-C-E!!##t 7;((3"#RSS&(YZZ99!!## 3G < WAUAW!&!A!A!C1V; 3G < WAUAW###I 7;((3$##  7;((3! r  # H  !!%%'ZZ-
V #&&z40j//11Vz?P?P?R5R n  6S 1 ' $r   returnc                 V    |&t         j                  }t        j                  d|       |S )NzOverriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.)r)   bfloat16r.   info)r   dtypes     r   update_dtypezMxfp4HfQuantizer.update_dtype   s.    =NNEKK@  r   modelr   
param_namec                    ddl m} ddlm} | j                  j
                  r%d|v sd|v rt        ||d t        d              \  }}nt        ||      \  }}t        ||      s"t        ||      r| j                  j
                  r|dv ryy	y)
Nr   Mxfp4GptOssExpertsGptOssExpertsblocksscales_blocks)down_proj_biasgate_up_proj_biasFT)	integrationsrF   models.gpt_oss.modeling_gpt_ossrH   r   r(   r   lenr4   )r   rB   rC   r   rF   rH   moduletensor_names           r   param_needs_quantizationz)Mxfp4HfQuantizer.param_needs_quantization   s    5C ##..H
4JhZdNd"6ujIZCPYN?>["\FK"6uj"IFKf01v}-$2J2J2U2UEEr   param_valueztorch.Tensortarget_deviceztorch.devicec                    ddl m}m}m}m}	m}
 ddlm} | j                  s| j                         }t        ||      \  }}t        j                  |      5  t        ||      r |	||      \  }}|j                  j                  |j                  j                   |j                  j"                  }}} |
|||      \  }}d|v rdnd}t%        |||       t%        || d || | |                          t'        || d	       t'        || d
       d d d        y |j)                  d      }|j)                  d      }|j)                  d      }|j)                  d      }|j)                  d      }d|v sd|v r3| j*                  j                  rt        ||d t-        d	              \  }}nt        ||      \  }}||||||d}t        ||      s"t        ||      rf| j*                  j                  rO| j*                  j                  r|d t-        d	        } ||||||fi | y  |||||| j                         fi | y y y # 1 sw Y   y xY w)Nr   )rF   r(   load_and_swizzle_mxfp4quantize_to_mxfp4swizzle_mxfp4rG   gate_up_proj	down_proj_precision_config)rhs_data)weight_scaleflex_ctxrK   _scalesempty_paramcasting_dtypeto_contiguousrankdevice_meshrI   rJ   )ra   rb   rc   rd   re   rB   )rN   rF   r(   rW   rX   rY   rO   rH   r-   r!   r   r)   devicer4   
matmul_ogsPrecisionConfigFlexCtx
InFlexDatasetattrdelattrr3   r   rP   )r   rB   rT   rC   rU   r   rF   r(   rW   rX   rY   rH   r   rQ   _triton_weight_tensorr^   rh   ri   rj   projra   rb   rc   rd   re   shard_kwargsdq_param_names                               r   create_quantized_paramz'Mxfp4HfQuantizer.create_quantized_param   sq   	
 	
 	D!!!%!:!:!<,UJ?IFAm,f&899J;Xj9k6(,*55EE*55==*55@@ /9WO
 :G,l<N:6(, .<z-I>{DFD*>?& 12'\G]g]iLjk FtfG$45FtfG$45+ -,4 !**]3K"JJ7M"JJ7M::f%D **]3KJ&(j*@dF^F^FiFi0
CTc)n_8UV	0
C	  +!.!.*L &"456=1d6N6N6Y6Y++66 %//@#i.$AMvz;}m`lm*"#%113 ' 7Z1_ -,s   B?IIc                 F   | j                   j                  r| j                  |       t        j                  j                         rt        j                  j                          y t        j                  j                         rt        j                  j                          y y r   )r   r(   remove_quantization_configr)   r*   r+   empty_cacher,   )r   rB   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading  sd    ##..++E2::""$JJ""$YY##%II!!# &r   expected_keyscheckpoint_keysc                    g }|D ]C  }|j                  d      r8|d t        d        }|j                  |dz          |j                  |dz          M|j                  d      r8|d t        d        }|j                  |dz          |j                  |dz          | j                  s|j                  d	      r$|d t        d        }|j                  |dz          |j                  d
      r%|d t        d        }|j                  |dz          |j                  d      r |j                  |       3|j                  |       F |S )Nz.mlp.experts.gate_up_projrZ   gate_up_proj_blocksgate_up_proj_scalesz.mlp.experts.down_projr[   down_proj_blocksdown_proj_scalesz.mlp.experts.down_proj_blocksz .mlp.experts.gate_up_proj_blocksrJ   )endswithrP   appendr-   )r   rB   rw   rx   new_expected_keyskeybases          r   update_expected_keysz%Mxfp4HfQuantizer.update_expected_keys  sN    C||781c.112!((0E)EF!((0E)EF67.c+../!((0B)BC!((0B)BC''<< ?@9#&8"9!9:D%,,TK-?@\\"DE<#&;"<!<=D%,,TN-BC\\(+%,,S1!((-/ !0 ! r   keep_in_fp32_modulesc                 j   ddl m} | j                  || j                  j                  |      | _        |j                  dd      }|r&t        j                  d       d| j                  _        |j                  } ||| j                  | j                  |      }| j                  |j                  _        y )Nr   )replace_with_mxfp4_linearuse_kernelsFzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   config)
rN   r   get_modules_to_not_convertr   r   r3   r.   r/   r(   r   )r   rB   r   r   r   r   r   s          r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading(  s     	=&*&E&E4++BBDX'
# jj6e 37D$$/)#'#>#> $ 8 8	
 ,0+C+C(r   missing_keysprefixc                 $   ddl m} g }|j                         D ]\  \  }}t        ||      s|D ]E  }||v s
|| d| v s|j	                  d      r#|j	                  d      r5|j                  |       G ^ |D 	cg c]	  }	|	|vs|	 c}	S c c}	w )Nr   rE   .z.weightz.bias)rN   rF   named_modulesr4   r~   r   )
r   rB   r   r   rF   not_missing_keysnamerQ   missingks
             r   update_missing_keysz$Mxfp4HfQuantizer.update_missing_keysG  s    5!//1LD&&"45+GDvhay4I,I ' 0 0 ; ' 0 0 9(//8  , 2 (E<a14D+D<EEEs   <	BBc                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrr   updater   r   s     r   update_tp_planzMxfp4HfQuantizer.update_tp_planV  R    V--666v3T:F))00DRDRAOAO	 r   c                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )Nr   base_model_ep_planr   r   )r   r   r   r   r   r   s     r   update_ep_planzMxfp4HfQuantizer.update_ep_planc  r   r   c                 2   | j                   j                  r.d|v r|j                  dd      S d|v r|j                  dd      S |S | j                  sF|j	                  d      r|j                  dd      S |j	                  d      r|j                  dd      S |S )NrK    r`   rZ   rz   r[   r|   )r   r(   replacer-   r~   )r   rC   s     r   update_param_namez"Mxfp4HfQuantizer.update_param_namep  s    ##..J&!)))R88j(!)))R88  ##"">2!)).:OPP"";/!))+7IJJr   safe_serializationc                 l   ddl m} |j                         }|j                         D ]  \  }}t	        ||      st        |d      s!t        |d      s.|j                  j                  j                  j                  |j                  j                  j                        j                  dd      j                  dddd	      || d
<   |j                  j                  j                  j                  j                  |j                  j                  j                  j                        j                  dd      || d<   |j                  j                  j                  j                  |j                  j                  j                        j                  dd      j                  dddd      || d<   |j                   j                  j                  j                  j                  |j                   j                  j                  j                        j                  dd      || d<    i }||fS )Nr   rE   rZ   r[       Z      z.gate_up_proj_blocksz.gate_up_proj_scalesi@  z.down_proj_blocksz.down_proj_scales)rN   rF   
state_dictr   r4   hasattrrZ   storagelayoutunswizzle_datadata	transposereshapegate_up_proj_precision_configr^   r[   down_proj_precision_config)r   rB   r   rF   r   r   rQ   metadatas           r   get_state_dict_and_metadataz,Mxfp4HfQuantizer.get_state_dict_and_metadata}  s   5%%'
!//1LD&6#56FN3FK0 ''//66EEfFYFYFaFaFfFfgYr2&WRR, dV#789 88EEMMTTcc<<IIQQVViB' dV#789 $$,,33BB6CSCSC[C[C`C`aYr2&WRr2. dV#456 55BBJJQQ``99FFNNSSiB' dV#456+ 26 8##r   c                      y)NT )r   r   s     r   is_serializablez Mxfp4HfQuantizer.is_serializable  s    r   c                 .    t         j                  d       y)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r.   r/   )r   s    r   is_trainablezMxfp4HfQuantizer.is_trainable  s     x	
 r   )r@   torch.dtyper<   r   )rB   r   r   )F)r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r!   r;   rA   strboolrS   rr   rv   listr   r   r   r   r   r   r   r   r   propertyr   __classcell__)r   s   @r   r   r   '   sI    (,$ %'
	'M^
.? S _c "R R $R 	R
 &Rh$!*; !DQTI !hlmphq !@ 59D D 'tCy1D>FtCy F# FRVWZR[ FC C !$T !$F d  r   r   )typingr   r   r   r   modeling_utilsr   utilsr	   r
   r   r   r   quantizers_utilsr   r)   
get_loggerr   r.   r   r   r   r   r   <module>r      sU    +  0  3 			H	% A{ Ar   