o
    nhK                     @   s  d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlZd	efd
dZe defddZ e de!fddZ"e de!fddZ#e de!fddZ$de!fddZ%e de!fddZ&eddd Z'de!fddZ(ed d!G d"d# d#Z)G d$d% d%eZ*dS )&    )BaseBackend	GPUTarget)irpassesllvmnvidia)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 C   s   dt tttf fdd}|S )Nreturnc                 S   s0   | j j}|j j}||ksJ d|dkrdS dS )Nz%lhs and rhs bitwidth must be the same   )   r       )r   r   r   )scalarprimitive_bitwidth)lhs_typerhs_typelhs_bitwidthrhs_bitwidth r   s/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibility   s   z-min_dot_size.<locals>.check_dot_compatibility)r   int)r   r   r   r   r   min_dot_size   s   	r   binaryc                 C   s   | t d7 } tjd|   ddtjtjt	d| g}|D ]5}tj
|rWtj|rWtj|dgtjd}|d urWtjd|d	tjd
}|d urW||df  S q"td|  )NEXETRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )	sysconfigget_config_varosenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r    pathsr2   resultversionr   r   r   _path_to_binary!   s   rD   archc                 C   s   | dkrdnd}t |S )Nd   zptxas-blackwellptxas)rD   )rE   namer   r   r   	get_ptxas3   s   rI   c                 C   s8   t jd}|d ur|S tt| d dgd}|S )NTRITON_MOCK_PTX_VERSIONr   r&   r(   )r.   r/   r0   r8   r9   rI   r=   )rE   mock_verrC   r   r   r   get_ptxas_version9   s
   rL   r   c                 C   sr   t | tsJ tt| d\}}|dkr#|dk rd| S d| d S |dkr+d| S |dkr3d	| S td
|  )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P   r+      F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr   splitr@   )cuda_versionmajorminorr   r   r   ptx_get_versionB   s   r\   c                 C   s&   | j }|d u rt|\}}t|}|S N)ptx_versionrI   r\   )optionsrE   r^   _rY   r   r   r   get_ptx_version_from_optionsU   s
   ra   c                 C   s"   t | |}td|}d| }|S )NV   z+ptx)ra   min)r_   rE   r^   llvm_ptx_versionfeaturesr   r   r   get_features]   s   


rf   c                 C   s@   t | d}t|  W  d    S 1 sw   Y  d S )Nrb)openhashlibsha256read	hexdigest)r2   fr   r   r   	file_hashk   s   $rn   
capabilityc                 C   s   | dkrdnd}d|  | S )NZ   ar$   sm_r   )ro   suffixr   r   r   sm_arch_from_capabilityq   s   rt   T)frozenc                   @   s6  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed	< dZ
eed
< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< d Zeed!< dZeed"< dZ eed#< d$d% Z!d&d' Z"dS )(CUDAOptions   	num_warpsr+   num_ctas   
num_stagesr   num_buffers_warp_specnum_consumer_groupsreg_dec_producerreg_inc_consumerNmaxnreg)r+   r+   r+   cluster_dimsr^   Tenable_fp_fusionFlaunch_cooperative_grid)fp8e5fp8e4b15supported_fp8_dtypesr   deprecated_fp8_dtypestf32default_dot_input_precision)r   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowrE   c                 C   s   t tjd }| jd u ri nt| j}|dd s%tdt|d |d< t	
| dt|  | jdkr?| j| jd @ dksCJ dd S )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcr   r   r+   znum_warps must be a power of 2)r   r5   parentr   dictr0   r.   getenvrV   object__setattr__tupleitemsrx   )selfdefault_libdirr   r   r   r   __post_init__   s    zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s     | ]\}}|t |fV  qd S r]   )rn   ).0kvr   r   r   	<genexpr>   s    z#CUDAOptions.hash.<locals>.<genexpr>r   r`   c                 S   s   g | ]\}}| d | qS )-r   )r   rH   valr   r   r   
<listcomp>   s    z$CUDAOptions.hash.<locals>.<listcomp>r(   )
r   __dict__r   sortedr3   r   ri   rj   encoderl   )r   	hash_dictkeyr   r   r   hash   s   
zCUDAOptions.hash)#__name__
__module____qualname__rx   r   __annotations__ry   r{   r|   r}   r~   r   r   r   r   r   r^   r   boolr   r   r   rV   r   r   r   r   r   r   r   r   r   rE   r   r   r   r   r   r   rv   w   s2   
 	rv   c                       s   e Zd ZedefddZdd Zdeddf fdd	Zdefd
dZ	dd Z
dd Zdeeef fddZdd Zedd Zedd Zdd Zdd Zdd Zdd Ze d d! Z  ZS )"CUDABackendr   c                 C   s
   | j dkS )Nr   )backend)r   r   r   r   supports_target   s   
zCUDABackend.supports_targetc                 C   s0   d}t ||}|std| t|dS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r+   )r;   	fullmatch
ValueErrorr   r?   )r   rE   patternmatchr   r   r   _parse_arch   s
   zCUDABackend._parse_archr   Nc                    s   t  | d| _d S )Ncubin)super__init__
binary_ext)r   r   	__class__r   r   r      s   
zCUDABackend.__init__c                    s   dt dd| jj i}| fddtj D  t| 	|d }d|vr?t
tj}|dkr7|d tt||d< d	|vrK|d
krKd|d	< d|vrYt dddk|d< |d
kr_dnd|d< tdi |S )NrE   TRITON_OVERRIDE_ARCHsmc                    s*   i | ]}| v r | d ur| | qS r]   r   )r   r   optsr   r   
<dictcomp>   s   * z-CUDABackend.parse_options.<locals>.<dictcomp>r   Y   fp8e4nvr   rp   )r   r   TRITON_DEFAULT_FP_FUSION1i   @r   r   r   )r.   r   r   rE   updaterv   __dataclass_fields__keysr   r   setr   addr   r   )r   r   argsro   r   r   r   r   parse_options   s   

zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r+      )rx   ry   sharedr   )r   metadatar   r   r   pack_metadata   s   zCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t| |j}|dkr|jn|jt	| j
d}|S )Nr   rP   )convert_custom_typesr   )triton.language.extra.cudalanguageextrar   r   r   rE   convert_custom_float8_sm80convert_custom_float8_sm70r   r   )r   r_   r   ro   codegen_fnsr   r   r   get_codegen_implementation   s   z&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   r   r   r   get_module_map   s   zCUDABackend.get_module_mapc                 C   s   t | d S r]   )r   load_dialects)r   ctxr   r   r   r      s   zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| tj	| tj
| tj| tj| tj| tj| ||  | S r]   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optpmr   r   r   	make_ttir   s   
zCUDABackend.make_ttirc                 C   s  t  }|jd ur|jd |_|jd |_|jd |_t| j}|	 }t
j|d| |jd|j t
j| |d dkrFt
j| t j
j|| t
j| t
j| t
j| t
j| t
j||dk t
j| |d d	v rt
j| t
j| t
j| t
j| t
j| t
j| t
j||j  t
j!||j  t
j"||j  t
j#||j$|j |j%|j& t
j'||j(| t
j)||j  t
j*||j  nw|d dkrPt
j| t
j| t
j| t
j| t
j||j  t
j!||j  t
j"||j  t
j#||j$|j |j%|j& t
j'||j(| t
j| t j
j+| t j
j,| t
j*||j  t
j| nt
j| t
j-| t
j||dk t
j.| t
j| t
j/| t
j0| t
j| t
j1| |d d
krt j
j2| t j
j3| t
j| |d d
krt
j4||j  |5|  |j|j|jf|d< | S )Nr   r+   r   zcuda:r   rS   r   rP   )r   	   r   r   )6r   ClusterInfor   clusterDimXclusterDimYclusterDimZr   r   r   r   r   r   add_convert_to_ttgpuirrx   ry   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r   add_fuse_nested_loopsr   add_licmadd_optimize_accumulator_init add_combine_tensor_select_and_ifadd_ws_task_partitionr}   add_taskid_propagateadd_ws_data_partitionadd_ws_code_partitionr|   r~   r   add_pipeliner{   add_ping_pong_syncadd_ws_loweringadd_promote_lhs_to_tmemadd_keep_acc_in_tmemadd_prefetchadd_coalesce_async_copyadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringadd_ws_canonicalizationr   )r   r   r   ro   cluster_infor   dump_enabledr   r   r   
make_ttgir   s   

zCUDABackend.make_ttgirc                 C   s@  t || jj}|}t|j}|  tjj	
| tj| tj| tj| tj| tjj	| tj| tjj||| tj| tj| tjj	| tjj	| tj| tj| tj| tjdddkrtj| || t !  t  }tjdddkrt"dt #||}	t$|}
t%|| jj}d}t &|	||
| t'|	 |j(d ur|	) D ]}|* s|+ r|,|j( q|j-rdd |j-D }t .|	| t /|	t j0 |1d	}|d ur||d
< |1d|d< |1d|d< |1d|d< |1d|d< t2|	}~	~|S )NTRITON_DISABLE_LINE_INFO0TRITON_ENABLE_ASANr   zYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudac                 S   s   g | ]\}}|qS r   r   )r   rH   r2   r   r   r   r   j      z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsrx   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_align)3ra   r   rE   r   r   r   r   r   r   r   add_lower_mmar   r  add_allocate_warp_groupsconvertadd_scf_to_cfadd_allocate_shared_memoryadd_allocate_tensor_memory"add_allocate_global_scratch_memoryadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   r.   r/   r0   llvmiradd_di_scoper   r   init_targetsr@   	to_modulert   rf   attach_datalayoutset_nvvm_reflect_ftzr   get_functionsis_declarationis_external_linkageset_nvvm_maxnregr   link_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrrV   )r   srcr   r_   ro   r^   r   r   r   llvm_modprocre   tripler   rA   total_num_warpsretr   r   r   	make_llir<  sl   



zCUDABackend.make_llirc              	   C   s   t || jj}d}t|}t|| jj}t||||dg|jd}	t	d|	}
t
|
dks/J |
d |d< |d  d	|d  }tjd
d| |	tjd}	tjdd| |	tjd}	tdd|	}	tjdddkrrtd t|	 |	S )Nr  znvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r+   r   rH   rS   rM   z\.version \d+\.\d+z	.version r)   z\.target sm_\d+z.target sm_z,\s*debug|debug,\s*r$   NVPTX_ENABLE_DUMPr  r   z // -----// NVPTX Dump //----- //)ra   r   rE   rt   rf   r   translate_to_asmr   r;   findalllensubr>   r.   r/   r0   print)r   r7  r   r   ro   r^   r:  r9  re   r<  namesr   r   r   make_ptx}  s    zCUDABackend.make_ptxc                 C   s~  t | jj\}}tjdddd#}tjdddd }|| |  |jd }	tj	
dd	d
kr7ddgndg}
|jr?g ndg}t|}tj	
dd	d
krSdd	gng }|g|
|d|d| |jd|	}z%tj|dd|d tj|jrt|j tj|jrt|j W n\ tjy } zOt|j}| }W d    n1 sw   Y  tj|jrt|j |jdkrd}n|jdtj krd}nd|j }t| d| dd| dd }~ww t|	d}| }W d    n	1 sw   Y  tj|	rt|	 W d    n1 sw   Y  W d    |S W d    |S 1 s8w   Y  |S )NFwz.ptx)deletemoders   rz.logz.or  r  r   z	-lineinfoz-suppress-debug-infoz--fmad=falseDISABLE_PTXAS_OPTz--opt-levelz-vz--gpu-name=z-oT)check	close_fdsr'      z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command:  
rg   )rI   r   rE   tempfileNamedTemporaryFilewriteflushrH   r.   r/   r0   r   rt   r8   r   r2   r6   removeCalledProcessErrorrh   rk   
returncodesignalSIGSEGVr   r3   )r   r7  r   r   ro   rG   r`   fsrcflogfbin	line_infofmadrE   	opt_level	ptxas_cmdelog_filelogerrorrm   r   r   r   r   
make_cubin  sp   

.




*'''zCUDABackend.make_cubinc                    sn    j fdd|d<  fdd|d<  fdd|d< fdd|d	< fd
d|d< d S )Nc                    s    | | S r]   )r   r7  r   r_   r   r   r   <lambda>  s    z(CUDABackend.add_stages.<locals>.<lambda>r   c                        | | S r]   )r  rf  ro   r_   r   r   r   rh        ttgirc                    ri  r]   )r=  rf  rj  r   r   rh    rk  llirc                        | | jjS r]   )rE  r   rE   rf  rg  r   r   rh    r  ptxc                    rn  r]   )re  r   rE   rf  rg  r   r   rh    r  r   )r   rE   )r   stagesr_   r   rj  r   
add_stages  s   zCUDABackend.add_stagesc                 C   s   t | jj}| d| jj S )Nr   )rL   r   rE   )r   rC   r   r   r   r     s   zCUDABackend.hash)r   r   r   staticmethodr   r   r   r   r
   r   r   r   r   rV   r   r   r   r   r  r=  rE  re  rq  	functools	lru_cacher   __classcell__r   r   r   r   r      s(    



HA+r   )+triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   triton.runtime.errorsr   dataclassesr	   rs  typingr
   r   r   r   typesr   ri   r;   rQ  rX  r.   r8   pathlibr   r,   r   rt  rV   rD   r   rI   rL   r\   ra   rf   rn   rt   rv   r   r   r   r   r   <module>   sB    
*