U
    Î@·f¾3  ã                   @   s  d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ e
 ¡ edœdd	„ƒZe
 ¡ d
d„ ƒZe
 ¡ edœdd„ƒZe
 d¡dd„ ƒZe	ddG dd„ dƒƒZG dd„ deƒZdS )é    )ÚBaseBackendÚ	GPUTarget)ÚirÚpassesÚllvmÚnvidia)Ú	dataclassN)ÚAnyÚTupleÚOptional)ÚPath)Úbinaryc                 C   s´   t j d|  ¡ › dd¡t j t j t¡d| ¡g}|D ]j}t j |¡r6t j 	|¡r6t
j|dgt
jd}|d k	r6tjd| d¡tjd	}|d k	r6|| d
¡f  S q6td| › ƒ‚d S )NZTRITON_Z_PATHÚ Úbinú	--version)Ústderrz.*release (\d+\.\d+).*úutf-8©Úflagsé   zCannot find )ÚosÚenvironÚgetÚupperÚpathÚjoinÚdirnameÚ__file__ÚexistsÚisfileÚ
subprocessÚcheck_outputÚSTDOUTÚreÚsearchÚdecodeÚ	MULTILINEÚgroupÚRuntimeError)r   Úpathsr   ÚresultÚversion© r,   úC/tmp/pip-unpacked-wheel-lsp6gkeu/triton/backends/nvidia/compiler.pyÚ_path_to_binary   s    þr.   c                  C   s    t  tdƒd dg¡ d¡} | S )NÚptxasr   r   r   )r    r!   r.   r%   )r+   r,   r,   r-   Úget_ptxas_version!   s    r0   ©Úreturnc                 C   s^   t | tƒst‚tt|  d¡ƒ\}}|dkr2d| S |dkrBd| S |dkrRd| S tdƒ‚d	S )
zK
    Get the highest PTX version supported by the current CUDA driver.
    Ú.é   éP   é   éF   é
   é?   z'Triton only support CUDA 10.0 or higherN)Ú
isinstanceÚstrÚAssertionErrorÚmapÚintÚsplitr(   )Úcuda_versionÚmajorÚminorr,   r,   r-   Úptx_get_version'   s    rC   c              
   C   s4   t | dƒ }t | ¡ ¡ ¡ W  5 Q R £ S Q R X d S )NÚrb)ÚopenÚhashlibÚsha256ÚreadÚ	hexdigest)r   Úfr,   r,   r-   Ú	file_hash7   s    rK   T)Úfrozenc                   @   sÚ   e Zd ZU dZeed< dZeed< dZeed< dZe	e ed< d	Z
eed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dd„ Zdd„ ZdS )ÚCUDAOptionsé   Ú	num_warpsr   Únum_ctasé   Ú
num_stagesNÚmaxnreg)r   r   r   Úcluster_dimsÚptx_versionTÚenable_fp_fusionFÚallow_fp8e4nvÚallow_fp8e4b15Útf32Údefault_dot_input_precision)rY   Ztf32x3ZieeeÚallowed_dot_input_precisionsÚmax_num_imprecise_acc_defaultÚextern_libsÚdebugÚcudaÚbackend_namec                 C   sŠ   t tƒjd }| jd kri nt| jƒ}| dd ¡sJt dt|d ƒ¡|d< t	 
| dt| ¡ ƒ¡ | jdkr~| j| jd @ dks†tdƒ‚d S )	NÚlibZ	libdeviceZTRITON_LIBDEVICE_PATHzlibdevice.10.bcr]   r   r   znum_warps must be a power of 2)r   r   Úparentr]   Údictr   r   Úgetenvr;   ÚobjectÚ__setattr__ÚtupleÚitemsrO   r<   )ÚselfZdefault_libdirr]   r,   r,   r-   Ú__post_init__Q   s     ÿzCUDAOptions.__post_init__c                 C   sX   t | jƒ}tdd„ t|d ƒD ƒƒ|d< d dd„ t| ¡ ƒD ƒ¡}t | d¡¡ 	¡ S )Nc                 s   s   | ]\}}|t |ƒfV  qd S ©N)rK   )Ú.0ÚkÚvr,   r,   r-   Ú	<genexpr>\   s     z#CUDAOptions.hash.<locals>.<genexpr>r]   Ú_c                 S   s   g | ]\}}|› d |› ‘qS )ú-r,   )rl   ÚnameÚvalr,   r,   r-   Ú
<listcomp>]   s     z$CUDAOptions.hash.<locals>.<listcomp>r   )
rc   Ú__dict__rg   Úsortedr   rh   rF   rG   ÚencoderI   )ri   Z	hash_dictÚkeyr,   r,   r-   ÚhashZ   s    
zCUDAOptions.hash)Ú__name__Ú
__module__Ú__qualname__rO   r>   Ú__annotations__rP   rR   rS   r   rT   rg   rU   rV   ÚboolrW   rX   rZ   r;   r[   r
   r\   r]   rc   r^   r`   rj   ry   r,   r,   r,   r-   rM   =   s"   
	rM   c                       s°   e Zd Zeedœdd„ƒZeddœ‡ fdd„Zedœd	d
„Zdd„ Z	dd„ Z
dd„ Zedd„ ƒZedd„ ƒZedd„ ƒZedd„ ƒZedd„ ƒZdd„ Ze ¡ dd„ ƒZ‡  ZS )ÚCUDABackend©Útargetc                 C   s
   | j dkS )Nr_   )Úbackendr€   r,   r,   r-   Úsupports_targetc   s    zCUDABackend.supports_targetN)r   r2   c                    s.   t ƒ  |¡ |j| _t| jtƒs$t‚d| _d S )NÚcubin)ÚsuperÚ__init__ÚarchÚ
capabilityr:   r>   r<   Z
binary_ext)ri   r   ©Ú	__class__r,   r-   r†   g   s    zCUDABackend.__init__r1   c                    sT   ‡ fdd„t j ¡ D ƒ}| jdk|d< | jdk |d< | jdkrBdnd|d	< t f |ŽS )
Nc                    s   i | ]}|ˆ kr|ˆ | “qS r,   r,   )rl   rm   ©Úoptsr,   r-   Ú
<dictcomp>n   s       z-CUDABackend.parse_options.<locals>.<dictcomp>éY   rW   éZ   rX   i   @r   r\   )rM   Z__dataclass_fields__Úkeysrˆ   )ri   rŒ   Úargsr,   r‹   r-   Úparse_optionsm   s
    zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r   é   )rO   rP   ÚsharedrT   )ri   Úmetadatar,   r,   r-   Úpack_metadatat   s    úzCUDABackend.pack_metadatac                 C   s6   dd l m  m  m} d| jdkr*|jn|ji}|S )Nr   Zconvert_custom_typesr5   )Ztriton.language.extra.cudaÚlanguageÚextrar_   rˆ   Zconvert_custom_float8_sm80Zconvert_custom_float8_sm70)ri   r_   Zcodegen_fnsr,   r,   r-   Úget_codegen_implementation~   s
    þz&CUDABackend.get_codegen_implementationc                 C   s   t  |¡ d S rk   )r   Úload_dialects)ri   Úctxr,   r,   r-   rš   †   s    zCUDABackend.load_dialectsc                 C   s‚   t  | j¡}| ¡  tj |¡ tj |¡ tj 	|¡ tj 
|¡ tj |¡ tj |¡ tj |¡ tj |¡ | | ¡ | S rk   )r   Úpass_managerÚcontextÚenable_debugr   ÚcommonZadd_inlinerÚttirZadd_rewrite_tensor_pointerZadd_combineÚadd_canonicalizerZadd_reorder_broadcastÚadd_cseZadd_licmÚadd_symbol_dceÚrun)Úmodr•   ÚoptÚpmr,   r,   r-   Ú	make_ttir‰   s    
zCUDABackend.make_ttirc                 C   sÆ  t  ¡ }|jd k	r6|jd |_|jd |_|jd |_t | j¡}| 	¡  t
j |d|› |jd|j¡ t
j |¡ |d dkrŒt
j |¡ t j
j ||¡ t
j |¡ t
j |¡ t
j |¡ t
j |¡ t
j ||dk¡ t
j |¡ |d dkrt
j |¡ t
j ||j¡ t
j |¡ t
j ||dk¡ t
j |¡ t
j |¡ t
j |¡ t
j |¡ t
j  |¡ |d d	kr˜t j
j !|¡ t j
j "|¡ t
j #|¡ | $| ¡ |j|j|jf|d
< | S )Nr   r   r“   zcuda:é    r8   é   r5   é	   rT   )%r   ZClusterInforT   ZclusterDimXZclusterDimYZclusterDimZr   rœ   r   rž   r   r    Zadd_convert_to_ttgpuirrO   rP   ÚttgpuirZadd_coalesceZadd_f32_dot_tcÚ	ttnvgpuirZadd_plan_ctaZadd_remove_layout_conversionsZadd_optimize_thread_localityZadd_accelerate_matmulZadd_optimize_dot_operandsrŸ   r¢   Ú add_combine_tensor_select_and_ifZadd_pipelinerR   Zadd_prefetchZadd_reduce_data_duplicationZadd_reorder_instructionsr£   Zadd_fence_insertionZadd_tma_loweringr¡   r¤   )r¥   r•   r¦   rˆ   Zcluster_infor§   r,   r,   r-   Ú
make_ttgir˜   sF    

zCUDABackend.make_ttgirc                 C   s   |   d¡}|d k	r"|d  |9  < | }t |j¡}| ¡  tjj |¡ tj 	|¡ tj
 |¡ tj
 |¡ tj |¡ tjj ||¡ tjj |¡ tj
 |¡ tj |¡ tj |¡ tj |¡ tj dd¡dkrätj |¡ | |¡ t ¡  t ¡ }t ||¡}t |¡ |j d k	rP| !¡ D ]&}	|	 "¡ s(|	 #¡ r(|	 $|j ¡ q(|j%rtdd„ |j%D ƒ}
t &||
¡ t '|tj(¡ |   d¡|d< t)|ƒ}~~|S )	Nz"triton_gpu.num-warp-groups-per-ctarO   ÚTRITON_DISABLE_LINE_INFOÚ0c                 S   s   g | ]\}}|‘qS r,   r,   )rl   rr   r   r,   r,   r-   rt   æ   s     z)CUDABackend.make_llir.<locals>.<listcomp>ztriton_gpu.sharedr”   )*Zget_int_attrr   rœ   r   rž   r   r   r¬   Z%add_decompose_unsupported_conversionsr®   ÚconvertZadd_scf_to_cfZadd_index_to_llvmirZadd_allocate_shared_memoryZadd_to_llvmirr­   Zadd_nvgpu_to_llvmZadd_arith_to_llvmirrŸ   r¡   r¢   r£   r   r   r   ZllvmirZadd_di_scoper¤   r   Zinit_targetsZ	to_moduleZset_nvvm_reflect_ftzrS   Zget_functionsZis_declarationZis_external_linkageZset_nvvm_maxnregr]   Zlink_extern_libsZoptimize_moduleZOPTIMIZE_O3r;   )Úsrcr•   Úoptionsrˆ   Znum_warp_groupsr¥   r§   r   Zllvm_modrm   r)   Úretr,   r,   r-   Ú	make_llirÁ   sJ    


zCUDABackend.make_llirc              	   C   sö   |j }|d kr"tdƒ\}}t|ƒ}td|ƒ}d}|dkr<dnd|› }	d|› }
t | ||	|
dg|jd	¡}t d
|¡}t	|ƒdks†t
‚|d |d< |d › d|d › }tjdd|› |tjd}t dd|¡}tj dd¡dkròtdƒ t|ƒ |S )Nr/   éS   znvptx64-nvidia-cudar   Zsm_90aZsm_z+ptxznvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   rr   r8   r3   z\.version \d+\.\d+z	.version r   z,\s*debug|debug,\s*r   ZNVPTX_ENABLE_DUMPr±   Ú1z // -----// NVPTX Dump //----- //)rU   r.   rC   Úminr   Ztranslate_to_asmrV   r#   ÚfindallÚlenr<   Úsubr&   r   r   r   Úprint)r³   r•   r¦   rˆ   rU   rp   r@   Zllvm_ptx_versionZtripleÚprocÚfeaturesrµ   Únamesr,   r,   r-   Úmake_ptxò   s&    

zCUDABackend.make_ptxc                 C   s6  t dƒ\}}tjdddd}tjddddð}| | ¡ | ¡  |jd }tj d	¡r`d
nd}	|j	rnd
nd}
|dkr~dnd}tj dd¡dkrÄ|› |	› |
› d|› |› |j› d|› d|j› }n.|› |	› |
› d|› |› |j› d|› d|j› }zºztj|ddd W n  tjk
r¨ } z~t|jƒ}| ¡ }W 5 Q R X |jdkrXtd|› ƒ‚n@|jdtj kr‚td|j› d|› ƒ‚ntd|j› d|› ƒ‚W 5 d }~X Y nX W 5 tj
 |j¡rÊt |j¡ tj
 |j¡ræt |j¡ X t|d ƒ}| ¡ }W 5 Q R X tj
 |¡rt |¡ W 5 Q R X W 5 Q R X |S )!Nr/   FÚwz.ptx)ÚdeleteÚmodeÚsuffixÚrz.logz.or°   r   z
 -lineinfoz --fmad=falser   za ú ZDISABLE_PTXAS_OPTr±   r¸   z  -v --opt-level 0 --gpu-name=sm_z -o z 2> z -v --gpu-name=sm_T)ÚshellÚcheckéÿ   z$Internal Triton PTX codegen error: 
é€   zPlease run `ptxas z+` to confirm that this is a bug in `ptxas`
z`ptxas` failed with error code z: 
rD   )r.   ÚtempfileÚNamedTemporaryFileÚwriteÚflushrr   r   r   r   rV   r   r   Úremover    r¤   ÚCalledProcessErrorrE   rH   Ú
returncoder(   ÚsignalÚSIGSEGV)r³   r•   r¦   rˆ   r/   rp   ÚfsrcZflogZfbinZ	line_infoZfmadrÅ   ÚcmdÚeZlog_fileÚlogrJ   r„   r,   r,   r-   Ú
make_cubin  sF    ÿ

0.ÿ,zCUDABackend.make_cubinc                    s^   ‡ ‡fdd„|d< ‡ ‡fdd„|d< ‡ ‡fdd„|d< ‡ ‡fdd„|d	< ‡ ‡fd
d„|d< d S )Nc                    s   ˆ  | |ˆ ¡S rk   )r¨   ©r³   r•   ©r´   ri   r,   r-   Ú<lambda><  ó    z(CUDABackend.add_stages.<locals>.<lambda>r    c                    s   ˆ  | |ˆ ˆj¡S rk   )r¯   rˆ   rÚ   rÛ   r,   r-   rÜ   =  rÝ   Zttgirc                    s   ˆ  | |ˆ ˆj¡S rk   )r¶   rˆ   rÚ   rÛ   r,   r-   rÜ   >  rÝ   Zllirc                    s   ˆ  | |ˆ ˆj¡S rk   )rÁ   rˆ   rÚ   rÛ   r,   r-   rÜ   ?  rÝ   Zptxc                    s   ˆ  | |ˆ ˆj¡S rk   )rÙ   rˆ   rÚ   rÛ   r,   r-   rÜ   @  rÝ   r„   r,   )ri   Zstagesr´   r,   rÛ   r-   Ú
add_stages;  s
    zCUDABackend.add_stagesc                 C   s   t ƒ }|› d| j› S )Nrq   )r0   rˆ   )ri   r+   r,   r,   r-   ry   B  s    zCUDABackend.hash)rz   r{   r|   Ústaticmethodr   rƒ   r†   r	   r’   r–   r™   rš   r¨   r¯   r¶   rÁ   rÙ   rÞ   Ú	functoolsÚ	lru_cachery   Ú__classcell__r,   r,   r‰   r-   r   a   s(   


(
0

(r   ) Ztriton.backends.compilerr   r   Ztriton._C.libtritonr   r   r   r   Zdataclassesr   rà   Útypingr	   r
   r   rF   r#   rÌ   rÓ   r   r    Úpathlibr   rá   r;   r.   r0   r>   rC   rK   rM   r   r,   r,   r,   r-   Ú<module>   s,   

#