
    h\P                     x   d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 ddlmZ ddlmZ erdd	lmZ d d
lmZ ddlmZmZmZ  e       r
d dlZd dlmZ  e       rPd dlZ e
j6                  e j8                  j                  d             e
j6                  d      k\  rd dlmZmZ d dl m!Z!  ejD                  e#      Z$de%dee%   fdZ&d Z'd Z( e       rWejR                  jT                  ejR                  jV                  gZ, e
j6                  e j8                  j                  d            Z- G d de      Z.y)    N)defaultdict)TYPE_CHECKINGOptionalUnion)version   )HfQuantizer)get_module_from_name   )PreTrainedModel)	safe_open)is_torch_availableis_torchao_availableloggingtorchao0.14.0)flatten_tensor_state_dictunflatten_tensor_state_dict)is_metadata_torchaoconfig_namereturnc                 v    | j                         } t        j                  d|       }|r|j                  d      S y)z
    Extract the size digit from strings like "4weight", "8weight".
    Returns the digit as an integer if found, otherwise None.
    z
(\d)weightr   N)lowerresearchgroup)r   	str_matchs     h/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_torchao.pyfuzzy_match_sizer   4   s7    
 ##%K		-5Iq!!    c                    ddl m} ddlm} t	        | |      r*| j
                  j                   d| j                          dS t	        | |      r<| j
                  j                   d| j                   dt        | j                         dS y )Nr   )AffineQuantizedTensor)LinearActivationQuantizedTensor()z(activation=	, weight=)
torchao.dtypesr"   7torchao.quantization.linear_activation_quantized_tensorr#   
isinstance	__class____name___quantization_typeinput_quant_funcoriginal_weight_tensor)weightr"   r#   s      r   r,   r,   C   s    4g&/0""++,Af.G.G.I-J!LL&9:""++,L9P9P8QQZ[mnt  oL  oL  \M  [N  NO  P  	P ;r    c                    t        | j                        }|7d| j                  j                  d    d| j                  j                  d    dS d| j                  j                  d    d| j                  j                  d    d| S )Nzin_features=r   z, out_features=r   z, weight=Noner&   )r,   r/   shape)selfr/   s     r   _linear_extra_reprr3   N   s    ,F~dkk//23?4;;CTCTUVCWBXXeffdkk//23?4;;CTCTUVCWBXXabhaijjr    c                   P    e Zd ZdZdZdZdgZ fdZd Zd Z	d"de
e   fd	Zd#dZdeeeeef   f   d
eeeeef   f   fdZ	 d$ddde
ee      fdZdee   d
ee   fdZddded
efdZdddddeddfdZd Zd$d
efdZd Zed
efd       Zed
efd       Zd ee   fd!Z xZS )%TorchAoHfQuantizerz?
    Quantizer for torchao: https://github.com/pytorch/ao/
    TFr   c                 d   t        |   |fi | t        | j                  j                  t
              rd| j                  j                  v }n8| j                  j                  j                  j                  }t        |      dk(  }|r
g d| _	        n	ddg| _	        | j                  dgz   | _
        y )Nint44)qdatascale
zero_pointr9   r:   _data)super__init__r)   quantization_config
quant_typestrr*   r+   r   weight_ao_keysfull_ao_keys)r2   r?   kwargsis_int_4r   r*   s        r   r>   zTorchAoHfQuantizer.__init__h   s    ,77d..993?!9!9!D!DDH22==GGPPK'4;H "BD#*G"4D //7);r    c                 6   t               st        d      d| _        |j                  d      }t	        |t
              rbd|j                         v sd|j                         v r>t        |      dkD  r0d| _        | j                  rd|j                         v rt        d      | j                  rn|j                  d	      }|rZt        j                  t        j                  j                  d
            }|t        j                  d      k  rt        d| d      y y y )NzSLoading an torchao quantized model requires torchao library (`pip install torchao`)F
device_mapdiskcpur   TzYou are attempting to perform disk offload with a pre-quantized torchao model This is not supported yet . Please remove the disk device from the device_map.weights_onlytorchz2.5.0zlIn order to use torchao pre-quantized model, you need to have torch>=2.5.0. However, the current version is zc. You can also set with `weights_only=False` in `from_pretrained` if you don't want to update torch)r   ImportErroroffloadgetr)   dictvalueslenpre_quantized
ValueErrorr   parse	importlibmetadataRuntimeError)r2   argsrD   rG   rJ   torch_versions         r   validate_environmentz'TorchAoHfQuantizer.validate_environmenty   s   #%sttZZ-
j$'*++--*:K:K:M1MSVWaSbefSf#%%&J4E4E4G*G$i  !::n5L 'i.@.@.H.H.Q R 7==#99& G  HU  GV V} ~  :  r    c                 b   | j                   j                  dk(  rU|,|t        j                  k7  rt        j                  d| d       |%t        j                  d       t        j                  }| j                   j                  dk(  r'|%t        j                  d       t        j                  }|S )Nint4_weight_onlyzSetting dtype to zo for int4_weight_only quantization, but only bfloat16 is supported right now. Please set the dtype to bfloat16.zSetting dtype to torch.bfloat16 for int4_weight_only quantization since only bfloat16 is supported right now. Please set dtype=torch.bfloat16 to remove this warning.#int8_dynamic_activation_int8_weightzSetting dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no dtype was specified in from_pretrained)r?   r@   rK   bfloat16loggerwarning_onceinfofloat32)r2   dtypes     r   update_dtypezTorchAoHfQuantizer.update_dtype   s    ##..2DD Uenn%<##'w  /^  _ }## | ##..2WW} W r    safe_serializationc                     t        | j                  j                        t        v rI|rGt        t        j                  d      k\  rt        |j                               S t        dt               di fS )z
        If the model is safe serializable, we flatten the state dict of tensor subclasses so that it is compatible with
        the safetensors format.
        r   zaIn order to use safetensors with torchao, please use torchao version >= 0.14.0. Current version: N)
typer?   r@   $SUPPORTED_SAFE_SERIALIZATION_CONFIGSTORCHAO_VERSIONr   rT   r   
state_dictrW   )r2   modelre   s      r   get_state_dict_and_metadataz.TorchAoHfQuantizer.get_state_dict_and_metadata   su    
 ((3348\\as'--"9901A1A1CDD"w  yH  xI  J  8Or    r   c                 t   t        j                  t        j                  j                  d            t        j                  d      kD  rddlm} | j                  j                         t        j                  d      kD  rjddl	m
} | j                  j                  }t        ||      rB|j                  j                  }t        |      }|dk(  r|j                   S t"        j$                  S |j                   t"        j$                  t"        j$                  d d}|| j                  j                     S t'        d	      )
N
acceleratez0.19.0r   )CustomDtype0.9.0AOBaseConfigr8   r\   int8_weight_onlyr]   	autoquantzYou are using `device_map='auto'` on a torchao quantized model. To automatically compute the appropriate device map, you should upgrade your `accelerate` library with `pip install --upgrade accelerate`)r   rT   rU   rV   accelerate.utilsro   r?   _get_ao_versionVersiontorchao.core.configrr   r@   r)   r*   r+   r   INT4rK   int8rS   )r2   rc   ro   rr   r@   r   
size_digitmap_to_target_dtypes           r   adjust_target_dtypez&TorchAoHfQuantizer.adjust_target_dtype   s    ==++33LABW]]S[E\\4 ''779GOOG<TT<!55@@
j,7","6"6"?"?K!1+!>J "S(*///  %zz) %0$4$4$)JJ7<zz!	# 't'?'?'J'JKK5 r    
max_memoryc                 ^    |j                         D ci c]  \  }}||dz   }}}|S c c}}w )Ng?)items)r2   r   keyvals       r   adjust_max_memoryz$TorchAoHfQuantizer.adjust_max_memory   s9    5?5E5E5GH5Gcc39n5G
H Is   )rk   r   keep_in_fp32_modulesc                     | j                  || j                  j                  |      | _        | j                  j                  r|j	                         }|j                         D cg c]  \  }}t        |      t        |      k(  s|! }}}|j                         }|j                         D cg c]  \  }}t        |      t        |      k(  s|! }	}}| j                  D 
cg c]  }
|
||	z   vs|
 c}
| _        y c c}}w c c}}w c c}
w N)get_modules_to_not_convertr?   modules_to_not_convertinclude_input_output_embeddingsget_input_embeddingsnamed_modulesidget_output_embeddings)r2   rk   r   rD   	input_embnamemoduleinput_emb_names
output_emboutput_emb_namesxs              r   $_process_model_before_weight_loadingz7TorchAoHfQuantizer._process_model_before_weight_loading   s    '+&E&E4++BBDX'
# ##CC224I8=8K8K8Mm8MfQSTZQ[_abk_lQlt8MOm446J9>9L9L9No9NvRTU[R\`bcm`nRn9No66+6a!?UeCe:e6+D' 	 no+s$   &C?C?0DD%D2Dunexpected_keysc                 n    |D cg c]#  t        fd| j                  D              r"% c}S c c}w )Nc              3   @   K   | ]  }j                  |        y wr   endswith).0r   ks     r   	<genexpr>z<TorchAoHfQuantizer.update_unexpected_keys.<locals>.<genexpr>   s     5_M^ajjmM^s   )anyrC   )r2   rk   r   r   s      `r   update_unexpected_keysz)TorchAoHfQuantizer.update_unexpected_keys   s.    *`?a#5_TM^M^5_2_?```s   #22
param_namec                    | j                   j                  dk(  ryt        fd| j                  D              ryt        fd| j                  D              ryt        |      \  }}t        j                  j                  g}| j                   j                  r)|j                  t        j                  j                         t        |t        |            xr |dk(  S )Nru   Fc              3   :   K   | ]  }|d z   v xs |k(    yw).N )r   r   r   s     r   r   z>TorchAoHfQuantizer.param_needs_quantization.<locals>.<genexpr>   s*     cGbsSyJ&;#*;;Gbs   c              3   F   K   | ]  }j                  d |         yw):Nr   )r   r   r   s     r   r   z>TorchAoHfQuantizer.param_needs_quantization.<locals>.<genexpr>   s%     I7H!$$qW-7Hs   !Tr/   )r?   r@   r   r   rC   r
   rK   nnLinearr   append	Embeddingr)   tuple)r2   rk   r   rD   r   tensor_name_QUANTIZABLEs     `    r   param_needs_quantizationz+TorchAoHfQuantizer.param_needs_quantization   s    ##..+= ctGbGbccIt7H7HII #7uj"IFK!HHOO,L''GG##EHH$6$67feL&9:V{h?VVr    param_valueztorch.Tensortarget_deviceztorch.devicec                    ddl m} |}d|v r|j                  dd      d   }t        ||      \  }}	| j                  rd|v}
|	dk(  s|
rHt
        j                  j                  |j                  |      |j                        |j                  |	<   yt        t        j                  d      k\  rt        | j                        st!        d	      t#        | d
      st%        t&              | _        | j(                  |   j+                  ||i       t-        | j(                  |         t-        | j.                        k(  rzt1        | j(                  |   | j                        |   }t
        j                  j                  |j                  |      |j                        |j                  |	<   | j(                  |= t3        |t        j4                        r t7        j8                  t:        |      |_        yyt
        j                  j                  ||j                        j                  |      |j                  |	<   |j?                         }| j@                  jB                  rNtE        |      tE        |      k(  r7|jG                          tI        |jJ                  jM                  d      dd       | j@                  jO                         t        jP                  d      k\  rddl m)} | j@                  jU                         }t3        ||      r`|j                  dd      \  }}d}||jV                  v r|jV                  |   }n|jV                  jY                  dd      }| |||d        y ||| j@                  jU                                y)z
        Each nn.Linear layer that needs to be quantized is processed here.
        First, we set the value the weight tensor, then we move it to the target device. Finally, we quantize the module.
        r   )	quantize_r   r   bias)requires_gradNr   zOTo use `safetensors` serialization, you should have `torchao>=0.14.0` installed	ao_paramsT)decodertie_word_embeddingsFz0.12.0)ModuleFqnToConfigr   _defaultc                      yNTr   )r   fqns     r   <lambda>z;TorchAoHfQuantizer.create_quantized_param.<locals>.<lambda>Q  s    dr    )	filter_fn)-torchao.quantizationr   rsplitr
   rR   rK   r   	Parametertor   _parametersri   r   rT   r   rV   rS   hasattrr   rO   r   updaterQ   rB   r   r)   r   types
MethodTyper3   
extra_reprr   r?   untie_embedding_weightsr   tie_weightssetattrconfigget_text_configrw   rx   r   get_apply_tensor_subclassmodule_fqn_to_configrN   )r2   rk   r   r   r   rD   r   	full_namer   r   is_unsafe_serialization	new_paraminput_embedr   r   
module_fqn_cs                     r   create_quantized_paramz)TorchAoHfQuantizer.create_quantized_param  s    	3	*#**3215J25*E '*&:#f$(?27((2D2DNN=1AZAZ 3E 3"";/ %x)@@EXY]YfYfEg !rss 4-!,T!2NN:&--y+.FG 4>>*-.#d6I6I2JJ7z8RTXTaTabcmn	27((2D2DLL/y?V?V 3E 3"";/
 NN:. &")),$)$4$45G$P! - /4hh.@.@;+D+D /A /b {+  446K''??BvJRTU`RaDa!!#44T4BDY[`a ''779W__X=VVB11KKMf&78$.$5$5c1$=MJA!V%@%@@"77
C"77;;JM}!&!7JKfd66PPRSr    c                     | j                   j                  dk(  rEddlm} ddlm} t        j                  |d      } ||f|dd| j                   j                  }|S y	)
z/No process required for torchao quantized modelru   r   )ru   )ALL_AUTOQUANT_CLASS_LISTzmax-autotune)modeF)qtensor_class_listset_inductor_configN)	r?   r@   r   ru   r   r   rK   compilequant_type_kwargs)r2   rk   rD   ru   r   s        r   #_process_model_after_weight_loadingz6TorchAoHfQuantizer._process_model_after_weight_loadingV  sd    ##..+=)EMM%n=E#;$) **<<	E Lr    c           	      N   |rt        | j                  j                        t        v xr t        t        j                  d      k\  }|sDt        j                  dt         dt        | j                  j                         dt         d       |S t        j                  t        j                  j                  d            t        j                  d      k\  }|st        j                  d       | j                  r,| j                  j                  t        j                  d	       y
|S )Nr   z=torchao quantized model only supports safe serialization for zv,                     and torchao version >= 0.14.0, please set `safe_serialization` to False for                     z and r   huggingface_hubz0.25.0zMtorchao quantized model is only serializable after huggingface_hub >= 0.25.0 a  The model contains offloaded modules and these modules are not quantized. We don't recommend saving the model as we won't be able to reload them.If you want to specify modules to not quantize, please specify modules_to_not_convert in the quantization_config.F)rg   r?   r@   rh   ri   r   rT   r_   warningrU   rV   rM   r   )r2   re   _is_torchao_serializables      r   is_serializablez"TorchAoHfQuantizer.is_serializablef  s   '+((33(5(6 (e:IW]][cMd:d % ,STxSy z$22==>?u_DUUVX
 ,+#*==1C1C1K1KL]1^#_cjcpcpd
 $
  (NNjk<<D44KKSNND ''r    c                 B   | j                   j                         t        j                  d      kD  rPddlm} | j                   j                  }t        ||      r(|j                  j                  }t        |      }|dk(  ryyddddd}|| j                   j                     S )a9  
        This factor is used in caching_allocator_warmup to determine how many bytes to pre-allocate for accelerator warmup.
        - A factor of 2 means we pre-allocate the full memory footprint of the model.
        - A factor of 4 means we pre-allocate half of that, and so on

        However, when using TorchAO, calculating memory usage with param.numel() * param.element_size() doesn't give the correct size for quantized weights (like int4 or int8)
        That's because TorchAO internally represents quantized tensors using subtensors and metadata, and the reported element_size() still corresponds to the dtype
        not the actual bit-width of the quantized data.

        To correct for this:
        - Use a division factor of 8 for int4 weights
        - Use a division factor of 4 for int8 weights
        rp   r   rq   r8         rs   )r?   rw   r   rx   ry   rr   r@   r)   r*   r+   r   )r2   rr   r@   r   r|   r}   s         r   get_accelerator_warm_up_factorz1TorchAoHfQuantizer.get_accelerator_warm_up_factor  s     ##3358PP811<<J*l3(22;;-k:
$ !" !34	
 #4#;#;#F#FGGr    c                 :    ddg}| j                   j                  |v S )Nrt   r]   )r?   r@   )r2   "supported_quant_types_for_trainings     r   is_trainablezTorchAoHfQuantizer.is_trainable  s,     1.
* ''226XXXr    c                      yr   r   )r2   s    r   is_compileablez!TorchAoHfQuantizer.is_compileable  s    r    checkpoint_filesc                     |d   j                  d      rLi }|D ]=  }t        |d      5 }|j                         xs i }|j                  |       d d d        ? || _        y y # 1 sw Y   RxY w)Nr   z.safetensorspt)	framework)r   r   rV   r   )r2   r   rV   
checkpointf	metadata_s         r   set_metadatazTorchAoHfQuantizer.set_metadata  sk    A''7H.
zT:a !

 2IOOI. ;: /
 %DM 8 ;:s   &A""A+	)F)rc   torch.dtyper   r   r   ) r+   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr>   rZ   rd   r   boolrl   r~   rO   rA   r   intr   listr   r   r   r   r   r   r   propertyr   r   r   __classcell__)r*   s   @r   r5   r5   _   s{    (,$ "<"0(Xd^ "HDeCHo1E,F 4PSUZ[^`c[cUdPdKe  UY&>FtCy>Q aT#Y a4PS9 aW.? WS W_c W"ST ST $ST 	ST
 &STj ($ (4%HN Yd Y Y   %T#Y %r    r5   )/rU   r   r   collectionsr   typingr   r   r   	packagingr   baser	   quantizers_utilsr
   modeling_utilsr   safetensorsr   utilsr   r   r   rK   torch.nnr   r   rT   rV   1torchao.prototype.safetensors.safetensors_supportr   r   /torchao.prototype.safetensors.safetensors_utilsr   
get_loggerr+   r_   rA   r   r,   r3   quantizationFloat8WeightOnlyConfig)Float8DynamicActivationFloat8WeightConfigrh   ri   r5   r   r    r   <module>r     s    	  # 1 1   2 0 ! E E w}}Y''//	:;}w}}X?VV	
 	X 
		H	%# (3- Pk 33FF,(
 $gmmI$6$6$>$>y$IJO\% \%r    