"""AWQ (Activation aware Weight Quantization) integration file"""

import importlib

from packaging import version

from ..activations import ACT2FN
from ..modeling_utils import PreTrainedModel
from ..utils import is_auto_awq_available, is_ipex_available, is_torch_available, logging
from ..utils.quantization_config import (
    AwqBackendPackingMethod,
    AwqConfig,
    AWQLinearVersion,
    ExllamaVersion,
)


if is_torch_available():
    import torch
    import torch.nn as nn


logger = logging.get_logger(__name__)

# Per-architecture module names used when fusing the attention, MLP and layernorm blocks.
AWQ_FUSED_MAPPINGS = {
    "mistral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "mixtral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["w1", "w3", "w2"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
        "rope_theta": 1000000.0,
    },
    "llama": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "llava": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
}

# Per-architecture activation module and the linear layer placed right before it, used to insert
# `ScaledActivation` wrappers for AWQ checkpoints that rely on activation scaling.
AWQ_SCALES_MAPPINGS = {
    "starcoder2": {"act": "act", "layer_before_act": "c_fc"},
    "RefinedWebModel": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "falcon": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "mpt": {"act": "act", "layer_before_act": "up_proj"},
    "gptj": {"act": "act", "layer_before_act": "fc_in"},
    "gpt_neox": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "gpt_bigcode": {"act": "act", "layer_before_act": "c_fc"},
    "bloom": {"act": "gelu_impl", "layer_before_act": "dense_h_to_4h"},
}


def replace_quantization_scales(model, model_type):
    from awq.modules.act import ScaledActivation

    if model_type not in AWQ_SCALES_MAPPINGS:
        return model
    for name, module in model.named_children():
        act_name = AWQ_SCALES_MAPPINGS[model_type]["act"]
        layer_before_act_name = AWQ_SCALES_MAPPINGS[model_type]["layer_before_act"]
        if name == act_name and hasattr(model, layer_before_act_name):
            layer_before_act = getattr(model, AWQ_SCALES_MAPPINGS[model_type]["layer_before_act"])
            size = layer_before_act.out_features
            scale_like = torch.ones(size)
            model._modules[name] = ScaledActivation(module, scale_like)
        _ = replace_quantization_scales(module, model_type)
    return model


def replace_with_awq_linear(
    model,
    modules_to_not_convert=None,
    quantization_config=None,
    current_key_name=None,
    has_been_replaced=False,
) -> bool:
    """
    Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
    conversion has been successful or not.

    During the module replacement, we also infer the backend to use through the `quantization_config` object.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`AwqConfig`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
        current_key_name (`list`, *optional*):
            A list that contains the current key name. This is used for recursion and should not be passed by the user.
        has_been_replaced (`bool`, *optional*):
            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
            should not be passed by the user.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = []

    backend = quantization_config.backend
    if not is_auto_awq_available():
        raise ValueError(
            "AWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or"
            " check out the installation guide in https://github.com/mit-han-lab/llm-awq"
        )

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        if quantization_config.version == AWQLinearVersion.GEMM:
            from awq.modules.linear.gemm import WQLinear_GEMM

            target_cls = WQLinear_GEMM
        elif quantization_config.version == AWQLinearVersion.GEMV:
            from awq.modules.linear.gemv import WQLinear_GEMV

            target_cls = WQLinear_GEMV
        elif quantization_config.version == AWQLinearVersion.EXLLAMA:
            if quantization_config.exllama_config["version"] == ExllamaVersion.ONE:
                from awq.modules.linear.exllama import WQLinear_Exllama

                target_cls = WQLinear_Exllama
            elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO:
                from awq.modules.linear.exllamav2 import WQLinear_ExllamaV2

                target_cls = WQLinear_ExllamaV2
            else:
                raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}")
        elif quantization_config.version == AWQLinearVersion.IPEX:
            from awq.modules.linear.gemm_ipex import WQLinear_IPEX

            target_cls = WQLinear_IPEX
        else:
            raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}")
    else:
        from awq.quantize.qmodule import WQLinear

        target_cls = WQLinear

    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check that the current key is not part of `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                in_features = module.in_features
                out_features = module.out_features

                model._modules[name] = target_cls(
                    w_bit=quantization_config.bits,
                    group_size=quantization_config.group_size,
                    in_features=in_features,
                    out_features=out_features,
                    bias=module.bias is not None,
                    dev=module.weight.device,
                )
                has_been_replaced = True

                # Force requires_grad to False to avoid unexpected errors
                model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_awq_linear(
                module,
                modules_to_not_convert=modules_to_not_convert,
                current_key_name=current_key_name,
                quantization_config=quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced
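
# Illustrative usage sketch (not part of the original module; the checkpoint name and config values
# are assumptions chosen only for the example). The AWQ quantizer calls `replace_with_awq_linear` on
# an empty (meta-device) model before the quantized state dict is loaded, roughly like:
#
#   from transformers import AutoConfig, AutoModelForCausalLM, AwqConfig
#
#   quantization_config = AwqConfig(bits=4, group_size=128)
#   config = AutoConfig.from_pretrained("org/llama-awq-checkpoint")  # hypothetical repo id
#   with torch.device("meta"):
#       model = AutoModelForCausalLM.from_config(config)
#   model, has_been_replaced = replace_with_awq_linear(
#       model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
#   )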


def get_modules_to_fuse(model, quantization_config):
    """
    Returns the fusing mapping given the quantization config and the model

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`~transformers.quantization_config.AWQConfig`):
            The quantization configuration to use.
    """
    if not isinstance(model, PreTrainedModel):
        raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}")

    # Always default to `quantization_config.modules_to_fuse` when it is provided
    if quantization_config.modules_to_fuse is not None:
        current_fused_mapping = quantization_config.modules_to_fuse
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    elif model.config.model_type in AWQ_FUSED_MAPPINGS:
        current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type]

        # Handle hidden_size, num_attention_heads and num_key_value_heads on our own,
        # also covering models that keep them on a nested `text_config`.
        config = model.config.get_text_config(decoder=True)
        hidden_size = config.hidden_size
        num_attention_heads = config.num_attention_heads
        num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads)

        # Fill `current_fused_mapping` with the expected values
        current_fused_mapping["hidden_size"] = hidden_size
        current_fused_mapping["num_attention_heads"] = num_attention_heads
        current_fused_mapping["num_key_value_heads"] = num_key_value_heads
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    else:
        raise ValueError(
            "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please"
            " pass a `fused_mapping` argument in the `quantization_config` or raise an issue on transformers"
            " https://github.com/huggingface/transformers to add its support."
        )
    return current_fused_mapping
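
# Illustrative example (the numeric values are assumptions; at runtime they are read from the model
# config and `fuse_max_seq_len`): for a Llama-style checkpoint the mapping returned above looks
# roughly like
#
#   {
#       "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
#       "mlp": ["gate_proj", "up_proj", "down_proj"],
#       "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
#       "use_alibi": False,
#       "hidden_size": 4096,
#       "num_attention_heads": 32,
#       "num_key_value_heads": 32,
#       "max_seq_len": 2048,
#   }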


def fuse_awq_modules(model, quantization_config):
    """
    Optionally fuse some modules in the model to speedup inference.

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`Union[AwqConfig, dict]`):
            The quantization configuration to use.
    """
    # We need to convert the quantization config from a dict to an `AwqConfig` object,
    # otherwise fields such as `backend` would not be available.
    if isinstance(quantization_config, dict):
        quantization_config = AwqConfig.from_dict(quantization_config)
    backend = quantization_config.backend

    modules_to_fuse = get_modules_to_fuse(model, quantization_config)
    modules_to_not_convert = getattr(quantization_config, "modules_to_not_convert", None)

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        from awq.modules.fused.attn import QuantAttentionFused
        from awq.modules.fused.mlp import QuantFusedMLP
        from awq.modules.fused.norm import FasterTransformerRMSNorm
    else:
        raise ValueError("Fusing is only supported for the AutoAWQ backend")

    fused_attention_modules = []

    for name, module in model.named_modules():
        if modules_to_not_convert is not None:
            if any(module_name_to_not_convert in name for module_name_to_not_convert in modules_to_not_convert):
                continue

        # Replace layer norms
        _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)

        # Replace MLP layers if the AWQ version is not ipex
        if quantization_config.version != "ipex":
            _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
        else:
            logger.info("The IPEX version AWQ does not support fuse mlp for now.")

        # Replace attention layers
        attention_has_been_fused = _fuse_awq_attention_layers(
            model, module, modules_to_fuse, name, QuantAttentionFused
        )

        if attention_has_been_fused:
            fused_attention_modules.append(name.split(".")[0])

    # For fused attention we set `config._attn_implementation` to "custom" so that the fused modules
    # receive a `None` attention mask and handle masking themselves.
    if len(fused_attention_modules) > 0:
        for module_name, module in model.named_modules():
            if any(
                fused_attention_parent_module in module_name
                for fused_attention_parent_module in fused_attention_modules
            ):
                if hasattr(module, "config") and hasattr(module.config, "_attn_implementation"):
                    module.config._attn_implementation = "custom"
    return model
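
# Illustrative usage sketch (not part of the original module): fusing is normally triggered by the
# quantizer when the quantization config requests it; a direct call would look roughly like the
# following, with the config values chosen only for the example.
#
#   quantization_config = AwqConfig(bits=4, do_fuse=True, fuse_max_seq_len=2048)
#   model = fuse_awq_modules(model, quantization_config)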


def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
    """
    Fuse the LayerNorm layers into a target class using autoawq

    Args:
        fuse_module_names (`List[str]`):
            The list of module names to fuse
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.FasterTransformerRMSNorm`):
            The `FasterTransformerRMSNorm` class as it only supports that class
            for now.
    """
    for module_name in fuse_module_names:
        if hasattr(module, module_name):
            old_module = getattr(module, module_name)
            # Swap the eager norm for the fused kernel, keeping its weight and epsilon
            module._modules[module_name] = target_cls(
                old_module.weight,
                old_module.variance_epsilon,
            ).to(old_module.weight.device)
            del old_module
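
# Illustrative call (an assumption, for a Llama-style decoder layer):
#   _fuse_awq_layernorm(
#       ["input_layernorm", "post_attention_layernorm", "norm"], decoder_layer, FasterTransformerRMSNorm
#   )
# Each matching norm child of `decoder_layer` is replaced in place by its fused counterpart.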


def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls):
    """
    Fuse the MLP layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        current_module_name (`str`):
            The current submodule name
        fuse_module_names (`List[str]`):
            The list of module names to fuse. For the MLP layers it has to be an array of length 3
            that consists of the 3 MLP layers in the order: gate (dense layer post-attention), up, down.
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.QuantFusedMLP`):
            The `QuantFusedMLP` class as it only supports that class
            for now.
    """
    if len(fuse_module_names) == 0:
        return

    if hasattr(module, fuse_module_names[0]):
        gate_proj = getattr(module, fuse_module_names[0])
        up_proj = getattr(module, fuse_module_names[1])
        down_proj = getattr(module, fuse_module_names[2])

        previous_device = gate_proj.qweight.device

        # Also handle models that keep the activation on a nested `text_config`
        config = model.config.get_text_config(decoder=True)
        hidden_act = config.hidden_act
        activation_fn = ACT2FN[hidden_act]
        new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn)

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, new_module.to(previous_device))

        del gate_proj, up_proj, down_proj
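
# Illustrative note (an assumption about typical module paths): the `rsplit` / `get_submodule` /
# `setattr` pattern above re-parents the fused module. For example,
#   "model.layers.0.mlp".rsplit(".", 1)  # -> ("model.layers.0", "mlp")
# resolves the parent block so that its `mlp` attribute can be rebound to the fused module.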


def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls):
    """
    Fuse the Attention layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        modules_to_fuse (`Dict[str, Any]`):
            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
            in the correct order: q, k, v, o layer
        current_module_name (`str`):
            The current submodule name
        target_cls (`~autoawq.QuantAttentionFused`):
            The `QuantAttentionFused` class as it only supports that class
            for now.
    """
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV

    module_has_been_fused = False

    if len(modules_to_fuse["attention"]) == 0:
        return module_has_been_fused

    if hasattr(module, modules_to_fuse["attention"][0]):
        # First, we pack the QKV layers together
        q_proj = getattr(module, modules_to_fuse["attention"][0])

        if isinstance(q_proj, WQLinear_GEMV):
            linear_target_cls = WQLinear_GEMV
            cat_dim = 0
        elif isinstance(q_proj, WQLinear_GEMM):
            linear_target_cls = WQLinear_GEMM
            cat_dim = 1
        elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"):
            from awq.modules.linear import WQLinear_IPEX

            if isinstance(q_proj, WQLinear_IPEX):
                linear_target_cls = WQLinear_IPEX
                cat_dim = 1
        else:
            raise ValueError(f"Unsupported q_proj type: {type(q_proj)}")

        previous_device = q_proj.qweight.device

        k_proj = getattr(module, modules_to_fuse["attention"][1])
        v_proj = getattr(module, modules_to_fuse["attention"][2])
        o_proj = getattr(module, modules_to_fuse["attention"][3])

        bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None

        qkv_layer = linear_target_cls(
            q_proj.w_bit,
            q_proj.group_size,
            q_proj.in_features,
            q_proj.out_features + k_proj.out_features + v_proj.out_features,
            q_proj.bias is not None,
            next(iter(module.state_dict().values())).device,
        )

        qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim)
        qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim)
        qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim)

        if isinstance(qkv_layer, WQLinear_GEMV):
            qkv_layer.split_k_iters = q_proj.split_k_iters

        qkv_layer.bias = bias

        fused_attention_layer = target_cls(
            modules_to_fuse["hidden_size"],
            modules_to_fuse["num_attention_heads"],
            modules_to_fuse["num_key_value_heads"],
            qkv_layer,
            o_proj,
            previous_device,
            modules_to_fuse["max_seq_len"],
            use_alibi=modules_to_fuse["use_alibi"],
            # The default value in autoawq is set to 10000.0
            rope_theta=modules_to_fuse.get("rope_theta", 10000.0),
        )

        fused_attention_layer.is_hf_transformers = True

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, fused_attention_layer.to(previous_device))

        del q_proj, k_proj, v_proj, o_proj
        module_has_been_fused = True

    return module_has_been_fused
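
# Illustrative note on the packing above (qualitative, not executed here): the quantized q/k/v
# projections are concatenated into a single linear layer. For GEMM-packed weights the concatenation
# runs along the output dimension (cat_dim=1), while GEMV-packed weights are concatenated along
# dim 0, which is why `cat_dim` is chosen from the concrete `WQLinear_*` type of `q_proj`.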


def post_init_awq_exllama_modules(model, exllama_config):
    """
    Runs post init for Exllama layers which performs:
        - Weights unpacking, reordering and repacking
        - Devices scratch space allocation
    """
    if exllama_config["version"] == ExllamaVersion.ONE:
        from awq.modules.linear.exllama import exllama_post_init

        model = exllama_post_init(model)
    elif exllama_config["version"] == ExllamaVersion.TWO:
        from awq.modules.linear.exllamav2 import exllamav2_post_init

        model = exllamav2_post_init(
            model,
            max_input_len=exllama_config["max_input_len"],
            max_batch_size=exllama_config["max_batch_size"],
        )
    else:
        raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}")

    return model


def post_init_awq_ipex_modules(model):
    """
    Runs post init for IPEX layers which performs:
        - Weights packing, reordering and repacking
    """
    from awq.modules.linear.gemm_ipex import ipex_post_init

    model = ipex_post_init(model)

    return model
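
# End-to-end sketch (illustrative only; the entry points and values are assumptions about how the
# AWQ quantizer drives this file): loading an AWQ checkpoint roughly follows
#
#   config = AwqConfig(bits=4, do_fuse=True, fuse_max_seq_len=2048)
#   model, _ = replace_with_awq_linear(model, quantization_config=config)    # nn.Linear -> WQLinear*
#   # ... the quantized state dict is loaded into the replaced modules ...
#   model = fuse_awq_modules(model, config)                                  # optional kernel fusion
#   if config.version == AWQLinearVersion.EXLLAMA:
#       model = post_init_awq_exllama_modules(model, config.exllama_config)  # scratch-space allocation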