
    3jR                     >   S SK r S SKrS SKJr  S SKr S SKJr  SrS SK
Jr  S\ R                  4S jrS\ R                  4S	 jrS\ R                  4S
 jrS\SS4S jrS\S\4S jrS\ R                  4S jrS\ R                  4S jrS\ R                  4S jrS\\   4S jr    S$S\S\S\S-  S\S-  S\S-  S\S\\\4   4S jjr " S S5      r " S S5      r S%S\\-  S\\   S-  S\\ \S4   -  4S jjr! S&S \S!\S"\S\4S# jjrg! \	 a    SrSr Nf = f)'    N)Any)runtimeTF)_get_device_indexreturnc                  
    SS K n [        R                  " [        U R	                  S5      S   5      5      nUR                  Ul        UR                  Ul        UR                   Ul        UR$                  Ul        UR(                  Ul        U$ ! [
        [        4 ad    [        R                  S:X  a7  [        R                  " S[        R                  R                  S    S35      n N[        R                  " S5      n Nf = f)Nr   amdhip64win32	amdhip64_.dllzlibamdhip64.so)rocm_sdkctypesCDLLstrfind_librariesImportError
IndexErrorsysplatformtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)r   libs     K/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/cuda/_utils.py_get_hip_runtime_libraryr$      s    	0kk#h55jA!DEF 00C00C!66C22C 44CJ $ 0<<7"++	%--*;*;A*>)?tDEC++./C	0s   5B AD)DDc                      [         R                  S:X  a  [        R                  " S5      $ [        R                  " S5      $ )Nr	   z
nvcuda.dllzlibcuda.so.1)r   r   r   r        r#   _get_cuda_libraryr(   -   s,    
||w{{<(({{>**r'   c                  h    [         R                  R                  (       a
  [        5       $ [	        5       $ N)r   r   r   r$   r(   r&   r'   r#   _get_gpu_runtime_libraryr+   5   s!    }}')) ""r'   resultc                    U S:X  a  g [         R                  " 5       n[        5       nUR                  U [         R                  " U5      5        UR
                  b  UR
                  R                  5       OSn[        SU 35      e)Nr   Unknown CUDA errorCUDA error: )r   c_char_pr+   r   byrefvaluedecodeRuntimeError)r,   err_strlibcudaerror_messages       r#   _check_cudar8   =   sn    {ooG&(GVV\\'%:;")--";AU  m_5
66r'   c                 Z   [         (       d  [        S5      eU tpU[        R                  R                  :w  aO  [        R
                  " U5      u  p4[        U[        5      (       a  UR                  5       n[        SU SU S35      e[        U5      S:X  a  g[        U5      S:X  a  US   $ U$ )a  Check a cuda.bindings (cuda-python) call result for errors.

All cuda.bindings runtime calls return ``(error, *outputs)``.  This
helper unpacks the tuple, raises on non-success, and returns the
outputs (``None`` for zero outputs, scalar for one, tuple otherwise).
zcuda.bindings is not availabler/   z ()r   N   )
_HAS_CUDA_BINDINGSr4   _cuda_bindings_runtimecudaError_tcudaSuccesscudaGetErrorString
isinstancebytesr3   len)r,   errout_r5   s        r#   _check_cuda_bindingsrG   I   s     ;<<IC!--99	: #55 	
 gu%%nn&G\#b	;<<
3x1}
3x1}1vJr'   c                      SS K n [        R                  " [        U R	                  S5      S   5      5      nUR                  Ul        UR                  Ul        UR"                  Ul        UR&                  Ul        UR*                  Ul        UR.                  Ul        UR2                  Ul        UR6                  Ul        UR:                  Ul        UR>                  Ul         U$ ! [
        [        4 a    [        R                  S:X  ah  SR                  S[        R                  R                  S   S[        R                  R                  S   /5      n[        R                  " SU S35      n GN8[        R                  " S5      n GNQf = f)	Nr   hiprtcr	    0   r   zlibhiprtc.so)!r   r   r   r   r   r   r   r   r   joinr   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetCUBINSizehiprtcGetCodenvrtcGetCUBINhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)r   r"   version_strs      r#   _get_hiprtc_libraryrc   f   sA   .kk#h55h?BCD "66C 44C!66C!66C11C))C!$!<!<C 44C!$!<!<C!66CJ) $ .<<7"''emm''*C1B1B11EFK ++{m489C++n-C.s   5C# #B	F	/F	F	c                  *   [        [        R                  R                  R	                  S5      S   5      n [
        R                  S:X  a  SU  S3/nOSU  3S/nU H  n [        R                  " U5      s  $    [        S5      e! [         a     M6  f = f)	N.r   r	   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr   r   r   r   OSError)major_version
nvrtc_libslib_names      r#   _get_nvrtc_libraryrn      s    **005a89M
||w}oW-


 =/*

 	;;x(( 
 4
55  		s   B
BBc                  h    [         R                  R                  (       a
  [        5       $ [	        5       $ r*   )r   r   r   rc   rn   r&   r'   r#   _get_gpu_rtc_libraryrp      s#     }}"$$!##r'   c                      SSK Jn Jn  S1nU Vs/ s H  o3U;  d  M
  UPM     nn[        R                  R
                  (       a  UR                  U 5        U$ s  snf )z
Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

Returns:
    List of HIPCC/NVCC flags that can be safely used with NVRTC.
r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexpr)torch.utils.cpp_extensionrr   rs   r   r   r   extend)rr   rs   nvrtc_unsupported_flagsflagcompatible_flagss        r#   _get_gpu_rtc_compatible_flagsry      sc     P 	# +*:Q.Q*   }} 23s
   	AAkernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc           
      b	  ^^ SSK n[        5       mSmS[        SS4UU4S jjnU R                  S5      nUcv  UR                  R                  UR                  R                  5       5      n	UR                  R                  (       a  U	R                   nOU	R                   U	R                   3n/ n
UR                  R                  (       a#  U
R                  SU 3R                  5       5        O"U
R                  SU 3R                  5       5        SS	KJn  U" S
5      nU H%  nU
R                  SU 3R                  5       5        M'     U(       a+  U H%  nU
R                  SU 3R                  5       5        M'     U(       a[  [        UR                  R                  5      S:  a"  [!        SUR                  R                   35      eUc  / nUR                  S5        U(       a)  U H#  nU
R                  UR                  S5      5        M%     [#        5       nU
R%                  U Vs/ s H  nUR                  S5      PM     sn5        ['        U
5      n[(        R*                  U-  " U
6 n[(        R,                  " 5       nU" TR/                  [(        R0                  " U5      UU S3R                  5       SSS5      5        UR                  S5      nU" TR3                  UU5      5        TR5                  UUU5      nUT:w  a  [(        R6                  " 5       nTR9                  U[(        R0                  " U5      5        [(        R:                  " UR<                  5      nTR?                  UU5        [A        SUR<                  RC                  5        35      e[(        R6                  " 5       nU" TRE                  U[(        R0                  " U5      5      5        [(        R:                  " UR<                  5      nU" TRG                  UU5      5        [(        R*                  " 5       nU" TRI                  UU[(        R0                  " U5      5      5        UR<                  b  UR<                  RC                  5       nOSnTRK                  [(        R0                  " U5      5        URL                  U4$ s  snf )a  
Compiles a CUDA kernel using NVRTC and returns the PTX code.

Args:
    kernel_source (str): The CUDA kernel source code as a string
    kernel_name (str): The name of the kernel function to compile
    compute_capability (str, None): The compute capability to target (e.g., "86").
                                       If None, will detect from current device.
    cuda_include_dirs (list, None): List of directories containing CUDA headers
    nvcc_options (list, None): Additional options to pass to NVRTC
    auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

Returns:
    Tuple[bytes, str]: The compiled PTX code and mangled kernel name
r   Nr,   r   c                    > U T:w  ar  [         R                  " 5       nTR                  U [         R                  " U5      5        UR                  b  UR                  R                  5       OSn[        SU 35      eg )Nr.   r/   )r   r0   rO   r1   r2   r3   r4   )r,   r5   r7   NVRTC_SUCCESSlibnvrtcs      r#   check_nvrtc#_nvrtc_compile.<locals>.check_nvrtc   so    ]"oo'G((g1FG ==, $$&) 
 m_=>> #r'   utf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsrh   z-Iz12.8zPCH requires CUDA 12.8+, got z--pchz.cuzKernel compilation failed:
rJ   )'
torch.cudarp   rg   encoderh   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendrt   r   r   AssertionErrorry   ru   rC   r   r0   c_void_prQ   r1   r_   rU   c_size_tr[   create_string_bufferr2   r]   r4   r3   rW   rY   ra   rS   raw)rz   r{   r|   r}   r~   r   r   r   source_bytespropsoptionsr   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsrw   num_optionsoptions_arrayprogc_kernel_namereslog_sizelogbinary_sizebinaryc_mangled_namemangled_namer   r   s                                @@r#   _nvrtc_compiler      s   0  $%H M	?C 	?D 	? 	? !''0L !

001J1J1LM==$)$5$5#6$)KK=!> G}});(<=DDFG/0B/CDKKMN 8&v.'	I;'..01 ( *INNR	{+2245 + u}}!!"V+ #@ASAS@T!UVVLG$ "FNN6==12 # ;<NN5KL5KTDKK(5KLM g,K__{2W=M ??D##LLm3&&(	
	  &&w/M//mDE 
&
&t[-
HC m??$''fll8.DE))(..9##D#.9#)):J:J:L9MNOO //#K**4k1JKL(():):;F&&tV45 __&N$$T=&,,~:VW '%++224  d!34 ::|##o Ms   &R,c                   L    \ rS rSrS\R
                  SS4S jrS\SS4S jrS	r	g)
_CudaModuleiI  moduler   Nc                     Xl         0 U l        g r*   )_module_kernels)selfr   s     r#   __init___CudaModule.__init__J  s    02r'   name_CudaKernelc           	         XR                   ;   a  U R                   U   $ SSKJn  U" 5       n[        R                  " 5       n [        UR                  [        R                  " U5      U R                  UR                  S5      5      5        [        X@R                  5      nXPR                   U'   U$ ! [         a  n[        SU S35      UeS nAff = f)Nr   )r+   r   zNo kernel named 'z' in this module)r   torch.cuda._utilsr+   r   r   r8   r   r1   r   r   r   r4   AttributeError)r   r   r+   r6   funckernelrD   s          r#   __getattr___CudaModule.__getattr__N  s    == ==&& 	?*, 	V++LL&dkk'6J
 !||4F"(MM$M 	V #4TF:J!KLRUU	Vs   A-B0 0
C:C

C)r   r   )
__name__
__module____qualname____firstlineno__r   r   r   r   r   __static_attributes__r&   r'   r#   r   r   I  s/    3v 34 3V V Vr'   r   c                       \ rS rSrSrS\R                  S\R                  SS4S jr     SS\\	\	\	4   S	\\	\	\	4   S
\
S-  S\	S\S-  SS4S jjrS\	SS4S jrSrg)r   ig  zL
Represents a compiled CUDA kernel that can be called with PyTorch tensors.
r   r   r   Nc                 *    Xl         X l        SU l        g )Nr   )r   r   _max_shared_mem_bytes)r   r   r   s      r#   r   _CudaKernel.__init__l  s    	%&"r'   gridblockargs
shared_memstreamc                    SSK nUR                  R                  R                  5       nU(       d  / n/ n/ n	U GHv  n
[	        XR
                  5      (       a  U
R                  (       d1  U
R                  (       a  U
R                  5       (       d  [        S5      e[        R                  " U
R                  5       5      nUR                  U5        U	R                  [        R                  " U5      5        M  [	        U
[        5      (       a>  [        R                   " U
5      nU	R                  [        R                  " U5      5        GM  [	        U
["        5      (       a>  [        R$                  " U
5      nU	R                  [        R                  " U5      5        GMb  ['        S[)        U
5       35      e   [        R                  [+        U	5      -  " 5       n[-        U	5       H,  u  p[        R.                  " U
[        R                  5      X'   M.     Uc  SSKnUR                  R3                  5       nUS:  aS  U R4                  S:X  d  X@R4                  :  a4  U R4                  S:X  a  SOSU R4                   S3n[7        S	U S
U S35      e[9        UR;                  U R<                  US   US   US   US   US   US   UUR>                  US5      5        g)a  
Call the compiled CUDA kernel

Args:
    grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
    block (tuple): Block dimensions (block_x, block_y, block_z)
    args (list): List of arguments to pass to the kernel.
                 PyTorch tensor arguments will be automatically converted to pointers.
    shared_mem (int): Shared memory size in bytes
    stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.r;   rL   ) r   rh   _utilsr+   rA   Tensoris_cudais_cpu	is_pinned
ValueErrorr   r   data_ptrr   r1   rg   c_intfloatc_double	TypeErrortyperC   	enumeratecastr   current_streamr   r4   r8   r   r   _as_parameter_)r   r   r   r   r   r   r   r6   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgs                    r#   __call___CudaKernel.__call__q  sT   & 	**##<<>D 13C#||,,{{CJJ3==??$Y  ooclln5%%c*fll3/0C%%S)fll512C''!??3/fll845"=d3i[ IJJ+ 0 #f+58'FA$kk#v?LO ( >ZZ..0F "&&!+z<V<V/V --2 !T7788IJ 
 ":, /%& '33  	""		QQQaaa%%	
r'   shared_mem_bytesc                 z   US:  a  Xl         g [        5       n[        R                  R	                  5       n[        R
                  R                  (       a  UR                  S:w  a  SOSnO[        USS5      nX:  a  [        SU SU S35      eS	n[        UR                  U R                  UU5      5        Xl         g )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r+   r   rh   r   r   r   r   getattrr4   r8   r!   r   )r   r   r6   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizes         r#   set_shared_memory_config$_CudaKernel.set_shared_memory_config  s    i')9&*, zz779== &11X=:  %=uN ,+,<+= >!!/ 0 1GG  783&&		; 	
 &6"r'   )r   r   r   )r;   r;   r;   r   Nr   N)r   r   r   r   __doc__r   r   r   tuplerg   listr   r   r   r   r&   r'   r#   r   r   g  s    'V__ 'foo '$ ' &/&/ !_
CcM"_
 S#s]#_
 Tk	_

 _
 d
_
 
_
B(6 (6 (6r'   r   ptxkernel_namesc           
      \   SSK n[        5       n[        U [        5      (       a  U R	                  S5      n [
        R                  " 5       nUR                  R                  5       nU   [        UR                  [
        R                  " U5      U 5      5        SSS5        U(       d  [        U5      $ 0 nU Hc  n[
        R                  " 5       n[        UR                  [
        R                  " U5      XGR	                  S5      5      5        [        X5      Xg'   Me     U$ ! , (       d  f       N= f)a  
Loads a CUDA module from PTX code and returns a module object that can access kernels.

Args:
    ptx (bytes or str): The PTX code to load
    kernel_names (list, optional): List of kernel names to extract from the module.
                                  If None, will return a module object with __getattr__.

Returns:
    object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
           If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
r   Nr   )r   r+   rA   r   r   r   r   rh   r   r8   r   r1   r   r   r   )	r   r   r   r6   r   r   kernelsr   r   s	            r#   _cuda_load_moduler     s       '(G #sjj! __FZZ&&(F	G,,V\\&-A3GH 
 6"" G ''T"FKK,@	

 $D1  N! 
s   &0D
D+deviceoptional	allow_cpuc                    [        U [        5      (       a  U $ [        U [        5      (       a  [        R                  " U 5      n [        U [        R                  5      (       aD  U(       a  U R
                  S;  a  [        SU  35      eOU R
                  S:w  a  [        SU  35      e[        R                  R                  5       (       d5  [        U [        R                  R                  5      (       a  U R                  $ [        XU5      $ )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

If :attr:`device` is a torch.device object, returns the device index if it
is a CUDA device. Note that for a CUDA device without a specified index,
i.e., ``torch.device('cuda')``, this will return the current default CUDA
device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
CPU devices will be accepted and ``-1`` will be returned in this case.

If :attr:`device` is a Python integer, it is returned as is.

If :attr:`device` is ``None``, this will return the current default CUDA
device if :attr:`optional` is ``True``.
)rh   cpuz(Expected a cuda or cpu device, but got: rh   z!Expected a cuda device, but got: )rA   rg   r   r   r   r   r   jitis_scriptingrh   idx_torch_get_device_index)r   r   r   s      r#   r   r   -  s      &#&#f%&%,,''{{/1 #KF8!TUU 2[[F"@IJJ99!!##fejj//00::"6Y??r'   )NNNFr*   )FF)"r   r   typingr   r   cuda.bindingsr   r=   r<   r   torch._utilsr   r   r   r$   r(   r+   rg   r8   rG   rc   rn   rp   r   r   ry   boolr   rB   r   r   r   dictr   r&   r'   r#   <module>r     s    
    F&++ .+6;; +#&++ #	7 	7 	7  :V[[ :6FKK 6&$fkk $tCy 6 &*%) $O$O$O$ d
O$ d{	O$
 +O$ O$ 5#:O$dV V<S6 S6n 8<-	u-$(I$4-4]*++-b <A@@@48@@  !s   D 
DD