from typing import List, Optional, Tuple

from ..utils import is_accelerate_available, is_torch_available, logging


if is_torch_available():
    import torch
    from torch import nn
    from torch.nn import functional as F

    import triton
    import triton.language as tl

if is_accelerate_available():
    from accelerate import init_empty_weights


logger = logging.get_logger(__name__)
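
# Scheme note: the helpers below implement fine-grained block FP8 as described
# in the docstrings that follow — activations are quantized dynamically per
# (1 x block_size) token group and weights per (block_size x block_size) tile,
# each block carrying one float32 scale; block_size is typically 128.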


@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    # 448.0 is the largest finite value representable in float8_e4m3fn
    s = tl.max(tl.abs(x)) / 448.0
    y = x / s
    y = y.to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)


def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    assert x.is_contiguous()
    assert x.shape[-1] % block_size == 0
    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)

    def grid(meta):
        return (triton.cdiv(x.numel(), meta["BLOCK_SIZE"]),)

    act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
    return y, s
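
# Illustrative usage sketch (the shapes are assumptions for the example, not
# values taken from this module): quantizing one activation tensor.
#
#     x = torch.randn(4, 1024, device="cuda", dtype=torch.bfloat16).contiguous()
#     x_fp8, x_scale = act_quant(x, block_size=128)
#     # x_fp8:   (4, 1024) float8_e4m3fn, quantized per 1x128 group
#     # x_scale: (4, 8)    float32, one scale per group (1024 / 128 = 8)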
 |#dddf |   }$||#dddf | |"dddf |   }%||!|  }&|"| }'||'|  }(t j||ft jd})tdt ||D ]h}*t j|$|#dddf ||*|  k dd}+t j|%|#dddf ||*|  k dd},|*| }-|-|	 }.t |&|.|  }/t |(|.|  }0|)t 	|+|,|/dddf  |0dddf  7 })|$|| 7 }$|%|| 7 }%q|j
jt jkr|)t j}1n|j
jt jkr%|)t j}1n|)t j}1|| t d| }2| | t d| }3|||2dddf   ||3dddf   }4|2dddf |k |3dddf |k @ }5t j|4|1|5d dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
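
# Scale-table geometry sketch (illustrative sizes, not library defaults): with
# K = N = 4096 and 128x128 weight blocks, `Bs` holds (4096/128) x (4096/128) =
# 32 x 32 scales. In the K-loop above, iteration k reads scale index
# offs_ks = (k * BLOCK_SIZE_K) // group_k, so every 128-wide step along K
# advances both scale tables by exactly one entry.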


def w8a8_block_fp8_matmul_triton(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: List[int],
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dtype: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    """
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]

    assert A.shape[-1] == B.shape[-1]
    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
    M = A.numel() // A.shape[-1]

    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    N, K = B.shape
    assert triton.cdiv(N, block_n) == Bs.shape[0]
    assert triton.cdiv(K, block_k) == Bs.shape[1]

    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)

    BLOCK_SIZE_M = 128
    if M < BLOCK_SIZE_M:
        BLOCK_SIZE_M = triton.next_power_of_2(M)
        BLOCK_SIZE_M = max(BLOCK_SIZE_M, 16)
    BLOCK_SIZE_K = block_k
    assert block_k % BLOCK_SIZE_K == 0
    BLOCK_SIZE_N = block_n

    def grid(META):
        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)

    _w8a8_block_fp8_matmul[grid](
        A,
        B,
        C,
        As,
        Bs,
        M,
        N,
        K,
        block_n,
        block_k,
        A.stride(-2),
        A.stride(-1),
        B.stride(1),
        B.stride(0),
        C.stride(-2),
        C.stride(-1),
        As.stride(-2),
        As.stride(-1),
        Bs.stride(1),
        Bs.stride(0),
        BLOCK_SIZE_M=BLOCK_SIZE_M,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        BLOCK_SIZE_K=BLOCK_SIZE_K,
        GROUP_SIZE_M=8,
    )

    return C


@torch.compile
def w8a8_block_fp8_matmul_compile(
    input_q: torch.Tensor,
    weight_q: torch.Tensor,
    input_scale: torch.Tensor,
    weight_scale: torch.Tensor,
    block_size: Optional[Tuple[int, int]] = None,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
    """
    batch_size, seq_len, hidden_dim = input_q.shape if input_q.ndim == 3 else (1, input_q.shape[0], input_q.shape[1])
    out_features = weight_q.shape[0]

    # Flatten batch and sequence dimensions for block processing
    input_reshaped = input_q.view(-1, hidden_dim)
    input_scale_reshaped = input_scale.view(input_scale.shape[0], -1)

    num_weight_blocks_m = out_features // block_size[0]
    num_weight_blocks_n = hidden_dim // block_size[1]

    output = torch.zeros((batch_size * seq_len, out_features), dtype=torch.float32, device=input_q.device)

    for i in range(num_weight_blocks_m):
        m_start = i * block_size[0]
        m_end = m_start + block_size[0]

        for j in range(num_weight_blocks_n):
            n_start = j * block_size[1]
            n_end = n_start + block_size[1]

            # Current activation column slice and weight tile
            input_block = input_reshaped[:, n_start:n_end]
            weight_block = weight_q[m_start:m_end, n_start:n_end]

            # Scales for the current blocks
            curr_input_scale = input_scale_reshaped[:, j : j + 1]
            curr_weight_scale = weight_scale[i, j]

            # torch._scaled_mm takes tensor-wide scales, so pass an identity
            # scale_a and apply the per-token-group input scale afterwards
            block_result = (
                torch._scaled_mm(
                    input_block,
                    weight_block.t(),
                    scale_a=torch.tensor(1.0, dtype=torch.float32, device=input_q.device),
                    scale_b=curr_weight_scale,
                    out_dtype=output_dtype,
                )
                * curr_input_scale
            )
            output[:, m_start:m_end] += block_result

    output = output.view(batch_size, seq_len, out_features)
    return output.to(output_dtype)


class FP8Linear(nn.Linear):
    dtype = torch.float8_e4m3fn

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype=None,
        block_size: Optional[Tuple[int, int]] = None,
        device=None,
        activation_scheme="dynamic",
    ):
        super().__init__(in_features, out_features)
        self.in_features = in_features
        self.out_features = out_features

        self.weight = nn.Parameter(torch.empty(out_features, in_features, dtype=FP8Linear.dtype, device=device))

        # An FP8 weight has element_size() == 1; only then are block scales carried
        if self.weight.element_size() == 1:
            scale_out_features = (out_features + block_size[0] - 1) // block_size[0]
            scale_in_features = (in_features + block_size[1] - 1) // block_size[1]
            self.weight_scale_inv = nn.Parameter(
                torch.empty(scale_out_features, scale_in_features, dtype=torch.float32, device=device)
            )
        else:
            self.register_parameter("weight_scale_inv", None)

        self.block_size = block_size
        self.activation_scheme = activation_scheme

        if bias:
            self.bias = nn.Parameter(torch.empty(self.out_features))
        else:
            self.register_parameter("bias", None)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if self.weight.element_size() > 1:
            return F.linear(input, self.weight, self.bias)
        else:
            # Run the quantized matmul on the device that owns the input
            with torch.cuda.device(input.device):
                qinput, scale = act_quant(input, self.block_size[1])
                output = w8a8_block_fp8_matmul_triton(
                    qinput,
                    self.weight,
                    scale,
                    self.weight_scale_inv,
                    self.block_size,
                    output_dtype=input.dtype,
                )
            # Blocks the CPU until all CUDA operations on the device are complete
            torch.cuda.synchronize()
            if self.bias is not None:
                output = output + self.bias
            return output.to(dtype=input.dtype)


def _replace_with_fp8_linear(
    model,
    tp_plan=None,
    modules_to_not_convert=None,
    current_key_name=None,
    quantization_config=None,
    has_been_replaced=False,
):
    """Replace Linear layers with FP8Linear."""
    if current_key_name is None:
        current_key_name = []

    for name, module in model.named_children():
        current_key_name.append(name)

        if isinstance(module, nn.Linear) and name not in (modules_to_not_convert or []):
            current_key_name_str = ".".join(current_key_name)
            if not any(key in current_key_name_str for key in (modules_to_not_convert or [])):
                with init_empty_weights():
                    model._modules[name] = FP8Linear(
                        in_features=module.in_features,
                        out_features=module.out_features,
                        bias=module.bias is not None,
                        device=module.weight.device,
                        dtype=module.weight.dtype,
                        activation_scheme=quantization_config.activation_scheme,
                        block_size=quantization_config.weight_block_size,
                    )
                    has_been_replaced = True

        if len(list(module.children())) > 0:
            _, has_been_replaced = _replace_with_fp8_linear(
                module,
                tp_plan,
                modules_to_not_convert,
                current_key_name,
                quantization_config,
                has_been_replaced=has_been_replaced,
            )
        current_key_name.pop(-1)

    return model, has_been_replaced


def replace_with_fp8_linear(
    model,
    modules_to_not_convert=None,
    quantization_config=None,
):
    """Helper function to replace model layers with FP8 versions."""
    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert

    if quantization_config.modules_to_not_convert is not None:
        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
    modules_to_not_convert = list(set(modules_to_not_convert))

    model, has_been_replaced = _replace_with_fp8_linear(
        model,
        tp_plan=model._tp_plan,
        modules_to_not_convert=modules_to_not_convert,
        quantization_config=quantization_config,
    )

    if not has_been_replaced:
        logger.warning(
            "You are loading your model using fp8 but no linear modules were found in your model. "
            "Please double check your model architecture."
        )

    return model
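
# Illustrative usage sketch (assuming a quantization config object that exposes
# `modules_to_not_convert`, `activation_scheme` and `weight_block_size`, as used
# above): swap a model's Linear layers for FP8Linear before loading fp8 weights.
#
#     model = replace_with_fp8_linear(
#         model,
#         modules_to_not_convert=["lm_head"],
#         quantization_config=quantization_config,
#     )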