
from typing import TYPE_CHECKING, Any, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_kernels_available,
    is_torch_available,
    is_triton_available,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    FP4 (MXFP4) quantization, relying on triton kernels fetched from the
    `kernels-community/triton_kernels` hub repository.
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["accelerate"]
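    # Note on naming (grounded in the hooks below): MXFP4 checkpoints such as gpt-oss
    # store each expert weight as a pair of tensors, a packed uint8 `*_blocks` tensor
    # (two FP4 values per byte) and a `*_scales` tensor (one power-of-two scale per
    # block), e.g. "model.layers.0.mlp.experts.gate_up_proj_blocks" together with
    # "model.layers.0.mlp.experts.gate_up_proj_scales". Several methods below
    # translate between these checkpoint names and the plain `gate_up_proj` /
    # `down_proj` parameters exposed by the dequantized module.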

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. "
                "Please install the latest version of torch ( pip install --upgrade torch )"
            )

        if self.quantization_config.dequantize:
            return

        if not torch.cuda.is_available():
            if self.pre_quantized:
                logger.warning_once(
                    "Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        compute_capability = torch.cuda.get_device_capability()
        gpu_is_supported = compute_capability >= (7, 5)
        kernels_available = is_triton_available("3.4.0") and is_kernels_available()

        if self.pre_quantized:
            # For an already-quantized checkpoint we fall back to dequantizing instead of failing the load
            if not gpu_is_supported:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g. T4, A100, L4, H100, or B200). We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_available:
                logger.warning_once(
                    "MXFP4 quantization requires triton >= 3.4.0 and kernels installed, "
                    "we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
        else:
            # Quantizing on the fly has hard requirements, so raise instead of falling back
            if not gpu_is_supported:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g. T4, A100, L4, H100, or B200)"
                )
            if not kernels_available:
                raise ValueError("MXFP4 quantization requires triton >= 3.4.0 and triton_kernels installed")

        if not self.quantization_config.dequantize:
            from kernels import get_kernel

            global triton_kernels_hub
            triton_kernels_hub = get_kernel("kernels-community/triton_kernels")

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, "
                "pass device_map = 'cuda'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                    "or remove the CPU or disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            torch_dtype = torch.bfloat16
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to requirements of the MXFP4 "
                "kernels to enable model loading in fp4. Pass your own torch_dtype to specify the dtype of the "
                "remaining non-linear layers, or pass torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
        return torch_dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ) -> bool:
        from ..integrations import Mxfp4GptOssExperts
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if self.quantization_config.dequantize and ("blocks" in param_name or "scales" in param_name):
            # "_blocks" and "_scales" have the same length, so one slice strips either suffix
            module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            # biases are kept in high precision and go through the regular loading path
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import Mxfp4GptOssExperts, dequantize, load_and_swizzle_mxfp4, quantize_to_mxfp4
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if not self.pre_quantized:
            # quantize bf16 weights to MXFP4 on the fly with the hub triton kernels
            PrecisionConfig, FlexCtx, InFlexData = (
                triton_kernels_hub.matmul_ogs.PrecisionConfig,
                triton_kernels_hub.matmul_ogs.FlexCtx,
                triton_kernels_hub.matmul_ogs.InFlexData,
            )

            module, _ = get_module_from_name(model, param_name)
            with torch.cuda.device(target_device):
                if isinstance(module, Mxfp4GptOssExperts):
                    if "gate_up_proj" in param_name:
                        # pad the weight so its shape matches what the kernels expect
                        right_pad = module.gate_up_proj_right_pad
                        bottom_pad = module.gate_up_proj_bottom_pad
                        loaded_weight = torch.nn.functional.pad(
                            param_value, (0, right_pad, 0, bottom_pad, 0, 0), mode="constant", value=0
                        )
                        triton_weight_tensor, weight_scale = quantize_to_mxfp4(loaded_weight)
                        module.gate_up_proj_precision_config = PrecisionConfig(
                            weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())
                        )
                        module.gate_up_proj = triton_weight_tensor
                        module.gate_up_proj_blocks = torch.nn.Parameter(
                            triton_weight_tensor.storage.data, requires_grad=False
                        )
                    elif "down_proj" in param_name:
                        right_pad = module.down_proj_right_pad
                        bottom_pad = module.down_proj_bottom_pad
                        loaded_weight = torch.nn.functional.pad(
                            param_value, (0, right_pad, 0, bottom_pad, 0, 0), mode="constant", value=0
                        ).to(target_device)
                        triton_weight_tensor, weight_scale = quantize_to_mxfp4(loaded_weight)
                        module.down_proj_precision_config = PrecisionConfig(
                            weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())
                        )
                        module.down_proj = triton_weight_tensor
                        module.down_proj_blocks = torch.nn.Parameter(
                            triton_weight_tensor.storage.data, requires_grad=False
                        )
        else:
            empty_param = kwargs.get("empty_param", None)
            casting_dtype = kwargs.get("casting_dtype", None)
            to_contiguous = kwargs.get("to_contiguous", None)
            rank = kwargs.get("rank", None)
            device_mesh = kwargs.get("device_mesh", None)

            if ("blocks" in param_name or "scales" in param_name) and self.quantization_config.dequantize:
                module, _ = get_module_from_name(model, param_name[: -len("_blocks")])
            else:
                module, _ = get_module_from_name(model, param_name)

            shard_kwargs = {
                "empty_param": empty_param,
                "casting_dtype": casting_dtype,
                "to_contiguous": to_contiguous,
                "rank": rank,
                "device_mesh": device_mesh,
            }

            if isinstance(module, Mxfp4GptOssExperts) or (
                isinstance(module, GptOssExperts) and self.quantization_config.dequantize
            ):
                if self.quantization_config.dequantize:
                    dq_param_name = param_name[: -len("_blocks")]
                    dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
                else:
                    load_and_swizzle_mxfp4(module, param_name, param_value, target_device, **shard_kwargs)
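    # Worked example of the padding above (illustrative shapes only, not taken from a
    # real checkpoint): torch.nn.functional.pad reads pad widths from the last
    # dimension backwards, so (0, right_pad, 0, bottom_pad, 0, 0) pads the columns on
    # the right, the rows at the bottom, and leaves the leading expert dimension alone:
    #
    #     w = torch.zeros(32, 2880, 5760)
    #     padded = torch.nn.functional.pad(w, (0, 64, 0, 128, 0, 0), mode="constant", value=0)
    #     assert padded.shape == (32, 3008, 5824)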
M(c                     U R                   R                  (       a  U R                  U5        [        R                  R                  5       (       a  [        R                  R                  5         g g r   )r   r'   remove_quantization_configr(   r)   r*   empty_cache)r   rD   r   s      r   #_process_model_after_weight_loading4Mxfp4HfQuantizer._process_model_after_weight_loading  sG    ##..++E2::""$$JJ""$ %r   expected_keyscheckpoint_keysc                 z   / nU H  nUR                  S5      (       a9  US [        S5      *  nUR                  US-   5        UR                  US-   5        MR  UR                  S5      (       a9  US [        S5      *  nUR                  US-   5        UR                  US-   5        M  UR                  U5        M     U$ )	Nz.mlp.experts.gate_up_projr^   rz   gate_up_proj_scalesz.mlp.experts.down_projrf   r   down_proj_scales)endswithrT   append)r   rD   r   r   new_expected_keyskeybases          r   update_expected_keys%Mxfp4HfQuantizer.update_expected_keys	  s     C||7881c.112!((0E)EF!((0E)EF677.c+../!((0B)BC!((0B)BC!((- ! ! r   keep_in_fp32_modulesc                 l   SSK Jn  U R                  XR                  R                  U5      U l        UR                  SS5      nU(       a&  [        R                  S5        SU R                  l        UR                  nU" UU R                  U R                  US9nU R                  UR                  l        g )Nr   )replace_with_mxfp4_linearuse_kernelsFzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   config)
rR   r   get_modules_to_not_convertr   r   r3   r,   r-   r'   r   )r   rD   r   r   r   r   r   s          r   $_process_model_before_weight_loading5Mxfp4HfQuantizer._process_model_before_weight_loading  s     	=&*&E&E++BBDX'
# jj6e 37D$$/)#'#>#> $ 8 8	
 ,0+C+C(r   missing_keysprefixc                 Z   SSK Jn  / nUR                  5        Hr  u  pg[        Xt5      (       d  M  U HU  nXh;   d  Xc SU 3;   d  M  UR	                  S5      (       a  M,  UR	                  S5      (       a  MD  UR                  U5        MW     Mt     U V	s/ sH  oU;  d  M
  U	PM     sn	$ s  sn	f )Nr   rI   .z.weightz.bias)rR   rJ   named_modulesr4   r   r   )
r   rD   r   r   rJ   not_missing_keysnamerU   missingks
             r   update_missing_keys$Mxfp4HfQuantizer.update_missing_keys8  s    5!//1LD&55+GDhay4I,I ' 0 0 ; ; ' 0 0 9 9(//8  , 2 (E<a4D+D<EEEs   B(B(c                     SUR                   R                  ;   a.  [        USS 5      b   UR                  R	                  SSSSS.5        U$ )NGptOssConfigbase_model_tp_plangrouped_gemm)z(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrr   update)r   r   s     r   update_tp_planMxfp4HfQuantizer.update_tp_planG  sR    V--666v3T:F))00DRDRAOAO	 r   c                     U R                   R                  (       a0  SU;   a  UR                  SS5      $ SU;   a  UR                  SS5      $ U$ )NrO    _scales)r   r'   replace)r   rF   s     r   update_param_name"Mxfp4HfQuantizer.update_param_nameT  sM    ##..J&!)))R88j(!)))R88r   c                 .    [         R                  S5        g)Nz@MXFP4 quantization is not serializable using safetensors for nowFr,   r-   )r   safe_serializations     r   is_serializable Mxfp4HfQuantizer.is_serializable\  s    ^_r   c                 .    [         R                  S5        g)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()Fr   )r   s    r   is_trainableMxfp4HfQuantizer.is_trainable`  s     x	
 r   )r   r   )rA   torch.dtyper=   r   r   )rD   r	   )r   
__module____qualname____firstlineno____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r;   rB   strr5   r   rW   r   listr   r   r   r   r   r   r   r   propertyboolr   __static_attributes____classcell__)r   s   @r   r   r   &   su    (,$ %7JX
  $ 	
 cN@ 04Y Y $Y 	Y
 &Y cNY "$s),Yv%!*; !DQTI !hlmphq !& 59D D 'tCy1D>FtCy F# FRVWZR[ FC C  d  r   r   )typingr   r   r   r   r   modeling_utilsr	   utilsr
   r   r   r   r   quantizers_utilsr   r(   
get_loggerr   r,   r    r   r   <module>r      sS    0 /  0  3 			H	%{ r   
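# Usage sketch (hedged, not part of the original module): this quantizer is driven by
# `from_pretrained` rather than instantiated directly. For example, to load an MXFP4
# checkpoint and force the bf16 fallback that `dequantize=True` enables (the model id
# below is illustrative only):
#
#     from transformers import AutoModelForCausalLM, Mxfp4Config
#
#     model = AutoModelForCausalLM.from_pretrained(
#         "openai/gpt-oss-20b",
#         quantization_config=Mxfp4Config(dequantize=True),
#         torch_dtype="auto",
#     )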