from typing import TYPE_CHECKING, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_torch_available, is_vptq_available, logging
from ..utils.quantization_config import QuantizationConfigMixin


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class VptqHfQuantizer(HfQuantizer):
    """
    Quantizer of the VPTQ method. Enables the loading of prequantized models.
    """

    requires_calibration = True
    required_packages = ["vptq"]

    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_accelerate_available():
            raise ImportError("Using `vptq` quantization requires Accelerate: `pip install accelerate`")

        if not is_vptq_available():
            raise ImportError("Using `vptq` quantization requires VPTQ>=0.0.4: `pip install -U vptq`")

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype is None:
            if torch.cuda.is_available():
                dtype = torch.float16
                logger.info(
                    "CUDA available. Assuming VPTQ inference on GPU and loading the model in `torch.float16`. "
                    "To overwrite it, set `dtype` manually."
                )
            else:
                import vptq

                device_availability = getattr(vptq, "device_availability", lambda device: False)
                if device_availability("cpu") is True:
                    raise RuntimeError("No GPU found. Please wait for the next release of VPTQ to use CPU inference")
                dtype = torch.float32
                logger.info("No GPU found. Assuming VPTQ inference on CPU and loading the model in `torch.float32`.")
        return dtype

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        """
        We don't have a param like `modules_to_not_convert` to indicate which layers should not be quantized,
        because `quantization_config` includes the layers that should be quantized.
        """
        from ..integrations import replace_with_vptq_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        replace_with_vptq_linear(
            model,
            quantization_config=self.quantization_config,
            modules_to_not_convert=self.modules_to_not_convert,
        )
        model.config.quantization_config = self.quantization_config

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    @property
    def is_trainable(self) -> bool:
        return False

    def is_serializable(self, safe_serialization=None):
        return True
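# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of this module). Loading a
# checkpoint whose config carries a VPTQ `quantization_config` routes through
# VptqHfQuantizer automatically; the model id below is a hypothetical
# placeholder, not a specific published repo.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   model_id = "some-org/some-vptq-prequantized-model"  # hypothetical checkpoint
#   model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
#   tokenizer = AutoTokenizer.from_pretrained(model_id)
# ---------------------------------------------------------------------------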