
import importlib
from collections import defaultdict
from functools import cached_property
from typing import TYPE_CHECKING, Optional, Union

from packaging import version

from .base import HfQuantizer
from .quantizers_utils import get_module_from_name


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    ACCELERATE_MIN_VERSION,
    is_accelerate_available,
    is_bitsandbytes_available,
    is_torch_available,
    is_torch_hpu_available,
    is_torch_npu_available,
    is_torch_xpu_available,
    logging,
)


if is_torch_available():
    import torch

    from ..pytorch_utils import Conv1D


logger = logging.get_logger(__name__)


class Bnb4BitHfQuantizer(HfQuantizer):
    """
    4-bit quantization from the bitsandbytes quantization method:
        before loading: converts transformer layers into `Linear4bit`
        during loading: loads the 16-bit weights and passes them to the layer object
        after loading: quantizes the individual weights in `Linear4bit` into 4-bit at the first `.cuda()` call
        saving:
            from state dict, as usual; saves weights and `quant_state` components
        loading:
            need to locate `quant_state` components and pass to the `Param4bit` constructor
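
    Example (an illustrative sketch rather than part of this module; the checkpoint name is a placeholder and the
    NF4/double-quant settings are just one possible configuration):

        ```python
        import torch
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        # `from_pretrained` instantiates this quantizer internally whenever a 4-bit config is supplied.
        model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config)
        ```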
    TFbitsandbytes
acceleratec                 2   t        |   |fi | | j                  j                  | j                  j                  | _        d| j                  j
                   ddg| _        | j                  j                  r| j                  j                  ddg       y y )Nzquant_state.bitsandbytes__absmax	quant_mapnested_absmaxnested_quant_map)	super__init__quantization_configllm_int8_skip_modulesmodules_to_not_convertbnb_4bit_quant_typebnb_keysbnb_4bit_use_double_quantextend)selfr"   kwargs	__class__s      i/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_bnb_4bit.pyr!   zBnb4BitHfQuantizer.__init__A   s    ,77##99E*.*B*B*X*XD' ))A)A)U)U(VW

 ##==MM  /3E!FG >    c                 r   t               st        dt         d      t        d      st        d      t	               st        d      t        j                  t        j                  j                  d            t        j                  d      k  r)t        j                  j                         st        d	      d
dlm} d
dlm}  |       } |d       |j!                  dd      s|j!                  dd      rt#        d      |j!                  d      }|t%        |t&              r| j(                  j*                  sr|D ci c]  }|| j,                  vs|||    }}t/        |j1                               dhk(  r|ry d|j1                         v sd|j1                         v rt#        d      y y y y c c}w )NzWUsing `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>=z'`T)check_library_onlyzrUsing `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`zThe bitsandbytes library requires PyTorch but it was not found in your environment. You can install it with `pip install torch`.r   z0.43.1zThe installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.r   )!validate_bnb_backend_availability)'is_bitsandbytes_multi_backend_available)raise_exceptionfrom_tfF	from_flaxzConverting into 4-bit or 8-bit weights from tf/flax weights is currently not supported, please make sure the weights are in PyTorch format.
device_mapcpudiska  Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. )r   ImportErrorr   r   r   r   parse	importlibmetadatatorchcudais_availableintegrationsr0   utilsr1   get
ValueError
isinstancedictr"    llm_int8_enable_fp32_cpu_offloadr$   setvalues)	r)   argsr*   r0   r1   bnb_multibackend_is_enabledr5   keydevice_map_without_lm_heads	            r,   validate_environmentz'Bnb4BitHfQuantizer.validate_environmentP   s   &(i  kA  jB  BD  E  )DA E  "#?  ==++33NCDw}}U]G^^::**,!m 
 	EC&M&O#)$?::i'6::k5+I; 
 ZZ-
":t,,,MM 1;*0:cIdId>dZ_$
 ' * :$$&'E727R4;;==KeKlKlKnAn )  Bo N - #*s   F4F4returnc                    t        j                  t        j                  j                  d            t        j                  d      kD  r:ddlm} |t        j                  k7  rt        j                  d       |j                  S t        d      )Nr   z0.19.0r   )CustomDtypezXtarget_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantizationaU  You are using `device_map='auto'` on a 4bit loaded version of the model. To automatically compute the appropriate device map, you should upgrade your `accelerate` library,`pip install --upgrade accelerate` or install it from source to support fp4 auto device mapcalculation. You may encounter unexpected behavior, or pass your own device map)r   r9   r:   r;   accelerate.utilsrO   r<   int8loggerinfoINT4rB   )r)   target_dtyperO   s      r,   adjust_target_dtypez&Bnb4BitHfQuantizer.adjust_target_dtype   si    ==++33LABW]]S[E\\4uzz)vw###b r-   unexpected_keysc                 n    |D cg c]#  t        fd| j                  D              r"% c}S c c}w )Nc              3   @   K   | ]  }j                  |        y wNendswith).0xks     r,   	<genexpr>z<Bnb4BitHfQuantizer.update_unexpected_keys.<locals>.<genexpr>   s     5[]ajjm]   )anyr&   )r)   modelrW   r_   s      `r,   update_unexpected_keysz)Bnb4BitHfQuantizer.update_unexpected_keys   s,    *\?a#5[T]]5[2[?\\\s   #22rc   r   
param_namec                     dd l }t        fd| j                  D              ryt        |      \  }}t	        ||j
                  j                        xr |dk7  S )Nr   c              3   @   K   | ]  }j                  |        y wrZ   r[   r]   r^   re   s     r,   r`   z>Bnb4BitHfQuantizer.param_needs_quantization.<locals>.<genexpr>   s     =}!z""1%}ra   Tbias)r   rb   r&   r   rC   nn
Linear4bit)r)   rc   re   r*   bnbmodulenames     `    r,   param_needs_quantizationz+Bnb4BitHfQuantizer.param_needs_quantization   sM    " =t}}==+E:>&#&&"3"34GGr-   param_valueztorch.Tensortarget_deviceztorch.devicec                    dd l }t        fd| j                  D              }}|r.dvrj                  dd      d   nj                  dd      d   t	        |      \  }	}
t        |t              rt               rd| }| j                  rj                  dd      d   }t        | d      st        t              | _        | j                  |   j                  ||i       t        | j                  |         t        | j                        dz   k(  ri }| j                  r|	|d	<   | j                  |   j!                  | d
      } |j"                  j$                  j&                  d|| j                  |   d|d|}||	j(                  |
<   | j                  |= y y |j+                  d      }t-        |	|
      }t/        |	j0                  t2              r|j4                  }|j6                  }|j!                  dd         |j"                  j$                  |fddi|j+                  |      }||	j(                  |
<   y )Nr   c              3   @   K   | ]  }j                  |        y wrZ   r[   rh   s     r,   r`   z<Bnb4BitHfQuantizer.create_quantized_param.<locals>.<genexpr>   s     JMqJ//2Mra   zquant_state..r	   r   npu:param_quant_statsrm   z.weightF)dataquantized_statsrequires_graddevicer6   _is_hf_initializedry    )r   rb   r&   rsplitr   rC   intr   pre_quantizedhasattrr   rD   rv   updatelen$is_bnb_supports_quant_storage_modulepoprj   
Params4bitfrom_prequantized_parameterstogetattr
issubclass
source_clsr   T__dict__)r)   rc   rp   re   rq   r*   rl   is_quant_stat	full_namerm   tensor_namemodule_nameparam_kwargsweight	new_value	old_values      `            r,   create_quantized_paramz)Bnb4BitHfQuantizer.create_quantized_param   s?    	#JDMMJJ	0>j0P
!!#q)!,V`VgVghkmnVopqVr  35*E mS).D.F"=/2M $++C3A6K4!45)4T):&"";/66	;7OP 4))+673t}};MPQ;QQ!<<-3L*//<@@K=PWAXY?CFF--?? $($:$:;$G"'(	
 #	 3<"";/**;7! R$ $u-I4I &++V4%KK	''FJJ+T2))))S5SFSVVWdeI.7F{+r-   
max_memoryc                 ^    |j                         D ci c]  \  }}||dz   }}}|S c c}}w )Ng?)items)r)   r   rJ   vals       r,   adjust_max_memoryz$Bnb4BitHfQuantizer.adjust_max_memory   s9    6@6F6F6HI6H(#sc3:o6H
I Js   )c                 V    |&t         j                  d|       t        j                  }|S )NzOverriding dtype=%s with `dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.float16 to remove this warning.)rR   rS   r<   float16)r)   dtypes     r,   update_dtypezBnb4BitHfQuantizer.update_dtype   s-    =KK?  MMEr-   c                    |t         j                  j                         r!dt         j                  j                         i}nt	               r$ddt         j
                  j                          i}n]t               r$ddt         j                  j                          i}n/t               r!dt         j                  j                         i}nddi}t        j                  d| d       |S )N ru   zhpu:r6   z:The device_map was not initialized. Setting device_map to zL. If you want to use the model for inference, please set device_map ='auto' )r<   r=   r>   current_devicer   npur   hpur   xpurR   rS   )r)   r5   s     r,   update_device_mapz$Bnb4BitHfQuantizer.update_device_map   s    zz&&( %**";";"=>
') D)A)A)C(D"EF
') D)A)A)C(D"EF
') %))":":"<=
 %[
KK))3 5]]
 r-   keep_in_fp32_modulesc                 &   ddl m} | j                  j                  }| j	                  || j                  j
                  |      | _        t        |t              ryt        |j                               dkD  r]|j                         D cg c]  \  }}|dv s| }	}}t        |	      dkD  r|st        d      | j                  j                  |	        ||| j                  | j                        }| j                  |j                  _        y c c}}w )Nr   )replace_with_bnb_linearr	   )r7   r6   r   zIf you want to offload some keys to `cpu` or `disk`, you need to set `llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be  converted to 8-bit but kept in 32-bit.)r$   r"   )r?   r   r"   rE   get_modules_to_not_convertr#   r$   rC   rD   r   keysr   rB   r(   config)
r)   rc   r5   r   r*   r   rE   rJ   valuekeys_on_cpus
             r,   $_process_model_before_weight_loadingz7Bnb4BitHfQuantizer._process_model_before_weight_loading
  s     	;+/+C+C+d+d(&*&E&E4++AACW'
#
 j$'C
0A,BQ,F1;1A1A1C`1C:3uP_G_31CK`;!#,L > 
 ''..{;'$*E*E[_[s[s
 ,0+C+C( as   DDc                 >    d|_         | j                         |_        |S NT)is_loaded_in_4bitis_serializableis_4bit_serializable)r)   rc   r*   s      r,   #_process_model_after_weight_loadingz6Bnb4BitHfQuantizer._process_model_after_weight_loading,  s     "&%)%9%9%;"r-   c                     t        j                  t        j                  j                  d            t        j                  d      k\  }|st        j                  d       yy)Nr   z0.41.3zYou are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.FT)r   r9   r:   r;   rR   warning)r)   safe_serialization_is_4bit_serializables      r,   r   z"Bnb4BitHfQuantizer.is_serializable1  sQ     'i.@.@.H.H.X Y]d]j]jks]t t$NNh r-   c                     t        j                  t        j                  j                  d            t        j                  d      k\  S )z
        determines if the current version of bitsandbytes supports
        the `module` parameter in `Params4bit.from_prequantized`
        :return:
        """
        return version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.43.3")

    @property
    def is_trainable(self) -> bool:
        return True

    def _dequantize(self, model):
        from ..integrations import dequantize_and_replace

        model = dequantize_and_replace(
            model, self.modules_to_not_convert, quantization_config=self.quantization_config
        )
        return model