from ..activations import ACT2FN
from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging


if is_torch_available():
    import torch
    from torch import nn

if is_accelerate_available():
    from accelerate import init_empty_weights

if is_fbgemm_gpu_available():
    import fbgemm_gpu.experimental.gen_ai  # noqa: F401

logger = logging.get_logger(__name__)


class FbgemmFp8Linear(torch.nn.Linear):
    def __init__(self, in_features, out_features, bias, weight_dtype=torch.float32):
        super().__init__(in_features, out_features, bias)
        self.in_features = in_features
        self.out_features = out_features

        # FP8 weight with one scale per output row (per-channel quantization)
        self.weight = torch.nn.Parameter(torch.zeros((out_features, in_features), dtype=torch.float8_e4m3fn))
        self.weight_scale = torch.nn.Parameter(torch.zeros((out_features, 1), dtype=weight_dtype))
        self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False)

        if bias:
            self.bias = torch.nn.Parameter(torch.zeros((self.out_features), dtype=weight_dtype))
        else:
            self.bias = None

    def forward(self, x):
        # quantize_fp8_per_row squashes the leading dimensions, so save the desired output shape first
        output_shape = (*x.shape[:-1], -1)
        x_quantized, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
            x.view(-1, x.shape[-1]).contiguous(), scale_ub=self.input_scale_ub
        )
        weight_scale_float32 = self.weight_scale.to(torch.float32)
        output = torch.ops.fbgemm.f8f8bf16_rowwise(
            x_quantized, self.weight, x_scale, weight_scale_float32, use_fast_accum=True
        )
        output = output + self.bias if self.bias is not None else output
        # Move the result back to the device of x and restore the original leading dimensions
        output = output.to(x.device)
        output = output.reshape(output_shape)
        del x_quantized, x_scale
        return output
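
# Illustrative sketch, not part of the original module: how a regular nn.Linear
# could be converted into the FbgemmFp8Linear above. This mirrors what the
# FbgemmFp8 quantizer does at load time; the helper name and the exact bias
# handling are assumptions for the example.
#
#   def quantize_linear(linear: nn.Linear) -> FbgemmFp8Linear:
#       fp8_linear = FbgemmFp8Linear(linear.in_features, linear.out_features, linear.bias is not None)
#       # quantize_fp8_per_row returns the FP8 weight and one scale per output row
#       w_fp8, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(linear.weight.data)
#       fp8_linear.weight = torch.nn.Parameter(w_fp8)
#       fp8_linear.weight_scale = torch.nn.Parameter(w_scale.view(w_scale.shape[0], 1))
#       if linear.bias is not None:
#           fp8_linear.bias = torch.nn.Parameter(linear.bias.data.to(torch.float32))  # assumed dtype
#       return fp8_linear
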
class FbgemmFp8Llama4TextExperts(nn.Module):
    def __init__(self, config, dtype=torch.float32):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size
        self.act_fn = ACT2FN[config.hidden_act]
        self.gate_up_proj = torch.nn.Parameter(
            torch.zeros((self.num_experts, self.hidden_size, 2 * self.expert_dim), dtype=torch.float8_e4m3fn)
        )
        self.gate_up_proj_scale = torch.nn.Parameter(
            torch.zeros((self.num_experts, 1, self.expert_dim * 2), dtype=torch.float32)
        )
        self.down_proj = torch.nn.Parameter(
            torch.zeros((self.num_experts, self.expert_dim, self.hidden_size), dtype=torch.float8_e4m3fn)
        )
        self.down_proj_scale = torch.nn.Parameter(
            torch.zeros((self.num_experts, self.hidden_size, 1), dtype=torch.float32)
        )
        self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False)

    def forward(self, hidden_states):
        """
        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
        Returns:
            torch.Tensor: (batch_size * token_num, hidden_size)
        """
        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
        num_tokens = None
        next_states = torch.empty_like(hidden_states)
        for i in range(self.num_experts):
            expert_hidden = hidden_states[i]
            expert_hidden_reshaped = expert_hidden.reshape(-1, self.hidden_size)
            expert_quantized, expert_scale = torch.ops.fbgemm.quantize_fp8_per_row(
                expert_hidden_reshaped, num_tokens, self.input_scale_ub
            )
            sharded_expert_dim = self.gate_up_proj.shape[-1] // 2
            gate_up_proj_scale_float32 = self.gate_up_proj_scale.to(torch.float32)
            gate = torch.ops.fbgemm.f8f8bf16_rowwise(
                expert_quantized,
                self.gate_up_proj[i].transpose(0, 1)[:sharded_expert_dim].contiguous(),
                expert_scale,
                gate_up_proj_scale_float32[i][0][:sharded_expert_dim].view(-1).contiguous(),
                use_fast_accum=True,
            )
            up = torch.ops.fbgemm.f8f8bf16_rowwise(
                expert_quantized,
                self.gate_up_proj[i].transpose(0, 1)[sharded_expert_dim:].contiguous(),
                expert_scale,
                gate_up_proj_scale_float32[i][0][sharded_expert_dim:].view(-1).contiguous(),
                use_fast_accum=True,
            )
            activated = up * self.act_fn(gate)
            activated_quantized, activated_scale = torch.ops.fbgemm.quantize_fp8_per_row(
                activated, num_tokens, self.input_scale_ub
            )
            down_proj_scale_float32 = self.down_proj_scale.to(torch.float32)
            expert_output = torch.ops.fbgemm.f8f8bf16_rowwise(
                activated_quantized,
                self.down_proj[i].transpose(0, 1).contiguous(),
                activated_scale,
                down_proj_scale_float32[i].view(-1).contiguous(),
                use_fast_accum=True,
            )
            next_states[i] = expert_output
        next_states = next_states.to(hidden_states.device)
        return next_states.view(-1, self.hidden_size)
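
# Note (inferred from the forward above, not original commentary): hidden_states
# must already arrive grouped by expert, since it is viewed as
# (num_experts, tokens_per_expert, hidden_size) before the per-expert loop, and
# the fused gate_up_proj weight is split in half along its last dimension to
# recover the separate gate and up projections.
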
g }|  D ]\}	}
||	 t|
tjrp|	|vrpd| t fdd|D sptdd$ |
j	}|
j
}t|||
jdu| j|	< d}| j|	 d W d   n1 s]w   Y  tj|jgtjd	| j|	 _|
jjd
kr|	|vrd| t fdd|D stdd d||dd d < t|j| j|	< W d   n1 sw   Y  tj|jgtjd	| j|	 _tt|
 dkrt|
|||||||d\}}|d q| |fS )z
    Private method that wraps the recursion for module replacement.

    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
    r   N.c                 3   $    | ]}|d   v p| kV  qdS ra   Nr#   .0keycurrent_key_name_strr#   r$   	<genexpr>       
z2_replace_with_fbgemm_fp8_linear.<locals>.<genexpr>T)include_buffersFr   Llama4TextExpertsc                 3   rb   rc   r#   rd   rg   r#   r$   ri      rj   z\d+*z.down_proj_scale)has_been_replacedpre_quantizedrL   tp_planr%   )renamed_childrenappend
isinstancer   Linearjoinanyr	   r   r   r   r   _modulesrequires_grad_r   tensoractivation_scale_ubr   r   r"   r<   subr@   text_configlenlistchildren_replace_with_fbgemm_fp8_linearpop)modelmodules_to_not_convertcurrent_key_namequantization_configrn   ro   rL   rp   rq   namemoduler   r   _r#   rg   r$   r      sh   





r   c              	   C   s`   |du rdgn|}|j dur||j  tt|}t| ||||||d\} }|s.td | S )a  
def replace_with_fbgemm_fp8_linear(
    model,
    modules_to_not_convert=None,
    current_key_name=None,
    quantization_config=None,
    pre_quantized=False,
    config=None,
    tp_plan=None,
):
    """
    A helper function to replace all `torch.nn.Linear` modules by `FbgemmFp8Linear` modules.
    This enables running your models with the high-performance FP8 kernels from the FBGEMM library.

    The function is run recursively and replaces all `torch.nn.Linear` modules except `lm_head`, which should be kept
    as a `torch.nn.Linear` module. The replacement is done under the `init_empty_weights` context manager, so no
    CPU/GPU memory is required to run this function. Each weight is quantized along the channel.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`List[str]`, *optional*, defaults to `["lm_head"]`):
            Names of the modules to not convert to `FbgemmFp8Linear`. In practice we keep the `lm_head` in full
            precision for numerical stability reasons.
        current_key_name (`List[str]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (or part
            of it) is in the list of modules to not convert (for instance, modules that are offloaded to `cpu` or
            `disk`).
    """
    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert

    if quantization_config.modules_to_not_convert is not None:
        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
    modules_to_not_convert = list(set(modules_to_not_convert))
    model, has_been_replaced = _replace_with_fbgemm_fp8_linear(
        model,
        modules_to_not_convert,
        current_key_name,
        quantization_config,
        pre_quantized=pre_quantized,
        config=config,
        tp_plan=tp_plan,
    )
    if not has_been_replaced:
        logger.warning(
            "You are loading your model using FP8 quantization but no linear modules were found in your model."
            " Please double check your model architecture, or submit an issue on github if you think this is"
            " a bug."
        )

    return model
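
# End-to-end usage sketch (illustration only, not part of the original module).
# In practice replace_with_fbgemm_fp8_linear is driven by the FbgemmFp8 quantizer
# when a model is loaded with an FbgemmFp8Config; the checkpoint name below is a
# placeholder.
#
#   from transformers import AutoModelForCausalLM, FbgemmFp8Config
#
#   quantization_config = FbgemmFp8Config()  # requires fbgemm-gpu and FP8-capable hardware
#   model = AutoModelForCausalLM.from_pretrained(
#       "meta-llama/Meta-Llama-3-8B",  # placeholder checkpoint
#       torch_dtype="bfloat16",
#       quantization_config=quantization_config,
#   )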