
    3j                       S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKr	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJrJrJr  S SKJrJrJr  S SKJrJrJrJr  S SKJ r J!r!J"r"  S SK#r#S SK$r#S SK%J&r&  S SK'J(r(  S S	K)J*r*  S S
K+J,r,J-r-J.r.J/r/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6J7r7J8r8J9r9  S SK:J;r;  S SK<J=r=  \>" \
R~                  R                  SS5      5      rA\"(       a  S SKBJCrC  S SKDJErEJFrFJGrG  SSKHJIrI  SSKJJKrK  SSKLJMrM  SrN\;" \OS5      rP " S S\Q5      rR " S S5      rS " S S5      rT\*R                  \*R                  -  rW\R                   " S S 5      5       rY\R                   " S! S"5      5       rZ " S# S$\Z5      r[ " S% S&5      r\ " S' S(5      r] " S) S*\Z5      r^ " S+ S,\\\^5      r_ " S- S.\]\^5      r` " S/ S0\Z5      ra " S1 S2\\\a5      rb " S3 S4\]\a5      rc " S5 S6\Z5      rd " S7 S8\\\d5      re " S9 S:\]\d5      rf " S; S<\\\Z5      rg " S= S>\]\Z5      rh " S? S@\\\Z5      ri\R                  SLSA j5       rk    SMSB jrl " SC SD5      rmSNSE jrnSOSF jro    SPSG jrp " SH SI5      rq " SJ SK5      rrg)Q    )annotationsN)CallableIterableSequence)FutureProcessPoolExecutorThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyIOTYPE_CHECKING)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCacheXPUCodeCache)Timer)do_bench_using_profilingget_gpu_typeget_ld_library_pathis_gpupython_subprocess_env)getArtifactLogger)
OrderedSet.TORCHINDUCTOR_AUTOTUNE_POOL_INACTIVITY_TIMEOUT600)
ModuleType)ChoiceCallerPartialRenderTritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      \ rS rSrSrg)!NonzeroWorkspaceNotSupportedErrorK    N__name__
__module____qualname____firstlineno____static_attributes__r1       Z/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/_inductor/autotune_process.pyr/   r/   K       r8   r/   c                      \ rS rSrSr\SS j5       r\ S       SS jj5       r\SS j5       rSS jr	S r
SS	 jrSSS
 jjrSSS jjrSSS jjrSS jrSS jrSS jrSS jrSrg)TuningProcessO   z>
Class to launch and interact with a benchmarking subprocess.
c                   ^ ^ [         R                  S[        R                  " 5       [        R                  R                  [        5      5        U U4S jn U" 5         g! [         a     gf = f)z$
Entry point for the child process.
z3Started autotune subprocess %s. Visible devices: %sc                    >  [         R                  T5      u  pU c  g  U(       a  [        R                  R	                  U5        U " 5       n[         R                  UT5        Ma  ! [
         a  nUn S nAN)S nAff = fN)r<   recvosenvironupdate	Exceptionsend)job	extra_envresulte	read_pipe
write_pipes       r9   workloop,TuningProcess.process_main.<locals>.workloop_   so    !.!3!3I!>; 

)))4 UF ""6:6  ! Fs   -A$ $
A:.A55A:N)autotuning_logdebugrB   getpidrC   getr,   EOFError)rK   rL   rM   s   `` r9   process_mainTuningProcess.process_mainT   sQ    
 	AIIKJJNN/0	
	7	J 		s   A 
A+*A+Nc                T    [         R                  " X4U5        UR                  5         g r@   )pickledumpflush)objrL   rH   s      r9   rF   TuningProcess.sends   s!     	S$j1r8   c                .    [         R                  " U 5      $ r@   )rW   load)rK   s    r9   rA   TuningProcess.recvz   s    {{9%%r8   c                0    Xl         U R                  5         g r@   )devicestart)selfr`   s     r9   __init__TuningProcess.__init__~   s    

r8   c                   [         R                  R                  [         R                  R                  [        5      S5      n[         R
                  " 5       u  p#[         R
                  " 5       u  pE[         R                  " US5      U l        [         R                  " US5      U l        [        R                  " 5       U l        U R                  R                  U R                  [        R                  5        [        R                  US[         R                   " 5        3S[#        U5       3S[#        U5       3/n0 [%        5       ES['        5       [(        R*                  (       a  SOSS	.EnU R,                  b  [#        U R,                  5      U[.        '   [0        R2                  " UUX%4S9U l        [         R6                  " U5        [         R6                  " U5        SU l        g
)z$
Start the benchmarking subprocess.
z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=01)TORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)rB   pathjoindirname__file__pipefdopenrL   rK   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerQ   strr   r   r)   /profile_bandwidth_with_do_bench_using_profilingr`   r,   
subprocessPopenprocesscloserunning)rb   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmdrm   s           r9   ra   TuningProcess.start   so    RWW__X68NO$&GGI!$&GGI!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01

#%
  #24 EE DG
 ;;"(+DKK(8C$%!''%8

 	!
!"r8   c                `    U R                   =(       a    U R                  R                  5       SL $ )z*
True if the subprocess is still running.
N)r   r   pollrb   s    r9   aliveTuningProcess.alive   s%     ||; 1 1 3t ;;r8   c                    U R                  5       (       d  U R                  5         [        R                  XR                  US9  g)z(
Push a work item to the child process.
rH   N)r   ra   r<   rF   rL   )rb   reqrH   s      r9   putTuningProcess.put   s/     zz||JJL39Er8   c                    U R                   R                  U5      (       d"  [        SU R                  R                   35      e[
        R                  U R                  5      u  p#[        U[        5      (       a  UeU$ ! [         a    U R                  5         e [         a    U R                  5         e [         a<    [        R                  SU R                  R                  5        U R                  5         e f = f)zs
Get a response from the child process. Raises TimeoutError on timeout;
raises EOFError if the subprocess crashes.
zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)rw   selectTimeoutErrorr   pidr<   rA   rK   killrS   r   rE   rO   	exception
isinstance)rb   timeoutrI   _s       r9   rR   TuningProcess.get   s    
	==''00"%DT\\EUEUDV#WXX%**4>>:IF fi((L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   A#A> >A:C8c                    U R                  5       (       a   [        R                  SU R                  5        U(       a  U R	                  5         gg)z3
Signal the child process to shut down gracefully.
N)r   r<   rF   rL   waitrb   r   s     r9   shutdownTuningProcess.shutdown   s4     ::<<tT__5IIK r8   c                    U R                  5       (       a  U R                  R                  5         U R                  5         g)z%
Wait for the child process to exit.
N)r   r   r   r   r   s    r9   r   TuningProcess.wait   s(     ::<<LL

r8   c                    U R                   R                  5         U R                  R                  5         U R                  R                  5         SU l        g)z
Close resources.
FN)rw   r   rK   rL   r   r   s    r9   r   TuningProcess.close   s;     	r8   c                    U R                  5       (       aD  [        R                  SU R                  R                  5        U R                  R                  5         U R                  5         g)z&
Send a SIGKILL to the child process.
z)Sending SIGKILL to autotune subprocess %dN)r   rO   errorr   r   r   r   r   s    r9   r   TuningProcess.kill   sH     ::<<  ;   LL

r8   c                B    U R                  SS9  U R                  5         g)z(
Gracefully restarts the child process.
Tr   N)r   ra   r   s    r9   restartTuningProcess.restart   s     	4 

r8   )r`   r   rK   r   rw   rL   )rK   	IO[bytes]rL   r   returnNoner@   )rZ   r   rL   r   rH   dict[str, str] | Noner   r   )rK   r   r   r   )r`   
int | Noner   bool)r   r   rH   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r3   r4   r5   r6   __doc__staticmethodrT   rF   rA   rc   ra   r   r   rR   r   r   r   r   r   r7   r1   r8   r9   r<   r<   O   s      < LP'4I	  & &+Z<F6
r8   r<   c                  \    \ rS rSrSrS
S jr\SS j5       rS
S jrSS jr	    SS jr
Srg	)TuningProcessPooli  z
Maintains a pool of TuningProcesses to benchmark kernels in parallel
across devices. By default, we create one TuningProcess per device and
set the sub-process environment to make only that device visible.
c                V   U R                  5       n[        R                  SU5        U Vs/ s H  n[        US9PM     snU l        [
        R                  " 5       U l        U R                   H  nU R                  R                  U5        M      [        [        U5      S9U l        gs  snf )z
Start the child processes.
z$Sub-process autotune device list: %sr`   max_workersN)get_device_listrO   rP   r<   	processesqueueQueueprocess_queuer   r	   lenexecutor)rb   devicesr`   ps       r9   rc   TuningProcessPool.__init__  s     &&(CWM FMMW6-v6WM9>A""1%   +s7|D Ns   B&c                    [         R                  (       d  S/$ [        5       n [        U 5      nUR	                  5       n[
        [        R                  ;   aR  [        R                  [
           R                  S5       Vs/ s H  n[        U5      PM     nn[        U5      U::  d   eU$ [        [        U5      5      $ s  snf )z4
Gather the list of devices to be used in the pool.
N,)r)   autotune_multi_devicer   r   device_countr,   rB   rC   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r9   r   !TuningProcessPool.get_device_list   s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS'R!s1v'RGSw<5(((NE%L!!	 Ts   >B<c                    U R                   R                  5         U R                   H  nUR                  SS9  M     U R                   H  nUR                  5         M     g)z%
Signal all child processes to exit.
Fr   N)r   r   r   r   )rb   r   s     r9   r   TuningProcessPool.shutdown5  sG     	 AJJEJ"  AFFH  r8   c                h   UR                   c   eSS/nU Vs0 s H,  o3[        R                  ;   d  M  U[        R                  U   _M.     nnU R                  R	                  5       nUR                  UR                   R                  US9   UR	                  [        R                  5      U R                  R                  U5        $ s  snf ! [         aC    [        R                  " SU S35        [        S5      s U R                  R                  U5        $ [         aq  n[        R                  " SU S35        [        U5      nS	U;   d  S
U;   a  UR                  5         [        S5      s SnAU R                  R                  U5        $ SnAff = f! U R                  R                  U5        f = f)z
Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
remove it from the queue, execute the benchmark in that subprocess, and return
the TuningProcess to the queue.
NTORCHINDUCTOR_CACHE_DIRTRITON_CACHE_DIRr   zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice 'cudaErrorLaunchFailurecudaErrorIllegalAddress)bmreqrB   rC   r   rR   r   	benchmarkr)   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   rE   r|   r   )rb   choiceenv_varsvrH   r   process_exception	error_msgs           r9   targetTuningProcessPool.target@  s    ||'''-/AB/7Kx!

?%Q

1%x	K$$((*FLL**i@	,;;BB4 ""7+= L  	 MM1& :W W
 <" ""7+!  	 MM.vh 7W W -.I(I5,	9!<""7+!	   ""7+sG   CCC /F:F 	F AF+F,F FF F1c           	     v    [        [        XR                  R                  U R                  U5      5      5      nU$ )z.
Benchmark each choice in a separate process.
)dictzipr   mapr   )rb   choicesresultss      r9   r   TuningProcessPool.benchmarki  s-     s7MM$5$5dkk7$KLMr8   )r   r   r   Nr   )r   zSequence[int | None])r   r'   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])r3   r4   r5   r6   r   rc   r   r   r   r   r   r7   r1   r8   r9   r   r     sC    E& " "(	',R+ 
+r8   r   c                  |    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   SrS\S'   \    SS j5       rSS jrSr	g)
TensorMetai{  ztorch.devicer`   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNz
str | Nonenamec           
        SSK Jn  [        U[        5      (       a;  U Vs/ s H  o0R	                  U5      PM     nn[        S U 5       5      (       d   eU$ Un[        U[        R                  5      (       a  [        R                  " SUS9nUR                  5       nUc   eUR                  5       nUc   e[        UU[        R                  R                  R                  UR!                  5       5      [        R                  R                  R                  U" U5      5      [        R                  R                  R#                  UR%                  5       R&                  5      UR)                  5       S9$ s  snf )Nr   )#get_strides_with_layout_constraintsc              3  B   #    U  H  n[        U[        5      v   M     g 7fr@   )r   r   .0xs     r9   	<genexpr>*TensorMeta.from_irnodes.<locals>.<genexpr>  s     A&Qz!Z00&s   fake)r   layout)r`   r   r   r   r   r   ) torch._inductor.select_algorithmr   r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r+   graphsizevarsoptimization_hintsget_sizeoptimization_hint
get_layoutr   get_name)clsirnodesr   r   rI   noder   r`   s           r9   r  TensorMeta.from_irnodes  s+    	Ygx((>E Fg!1!1!!4gF FA&AAAAAMdBII&&99&6D    "!!!''""55dmmoFGG$$773D9 77##55doo6G6N6NO	
 		
 !Gs   E4c                    [        U R                  U R                  U R                  U R                  U R
                  S9$ )N)r`   r   
extra_size)r   r   r   r`   r   r   r   s    r9   	to_tensorTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r8   r1   )r  z)LayoutOrBuffer | Sequence[LayoutOrBuffer]r   TensorMeta | list[TensorMeta])r   torch.Tensor)
r3   r4   r5   r6   __annotations__r   classmethodr  r  r7   r1   r8   r9   r   r   {  sP    ((++KD*
?
	&
 
<
r8   r   c                      \ rS rSrSr          SS jr      SS jrSS jrSS.     SS jjrSS.     SS	 jjr	S
r
g)BenchmarkRequesti  a  
Only handle triton template benchmark for now. The extern kernel benchmark
can be done inside the same process since they usually don't cause crash.

Important: Instances of this class and subclasses have to be serializable
across process boundaries. Do not put CUDA Tensors in here!
c                2  ^ Xl         [        U[        5      (       a	  U/U l        OX l        T(       aQ  [        T[        [
        45      (       a6  [        T5      S:  a  [        U4S jT 5       5      (       d   eTS   U l        OTU l        X@l	        SU l
        g )Nr(   c              3  n   >#    U  H*  nS   H   n[        TS   U5      [        X5      :H  v   M"     M,     g7f))r`   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r9   r   ,BenchmarkRequest.__init__.<locals>.<genexpr>  s=      / Q .q148GA<LL Q M/s   25r   F)kernel_namer   r   input_tensor_metatupler   r   r  r#  
extra_argsbenchmark_with_cudagraphs)rb   r%  r&  r#  r(  s      ` r9   rc   BenchmarkRequest.__init__  s     ''448I7JD"7H"*-?%"O"O%&* /    
 '9&;D# 3ED#$).&r8   c                   [         er@   NotImplementedErrorrb   outinput_tensorss      r9   make_run_fnBenchmarkRequest.make_run_fn  s
     "!r8   c                    g r@   r1   r   s    r9   cleanup_run_fnBenchmarkRequest.cleanup_run_fn  s    r8   Nr/  c                   [         er@   r,  rb   fnr/  r0  s       r9   do_benchBenchmarkRequest.do_bench  s
     "!r8   c                  [         R                  [        R                  5      nU(       a  [        R                  " 5       nUcp  U R
                  (       a  U R                  (       d   S5       e[        U5      S:X  d   e[        S U R
                   5       5      nU R                  R                  5       nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       n U R                  " USU06nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       nU R                  (       a  [        R                   " U5      nOU R"                  " U/UQUP76 nU(       a1  [        R                  " 5       W-
  n	[         R%                  SU WWU	5        U R'                  5         U$ ! [         a#    [         R                  S5        [        S5      s $ f = f)NzJInput and output tensor meta must be populated when input_tensors is emptyr   c              3  @   #    U  H  oR                  5       v   M     g 7fr@   )r  r   s     r9   r   -BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !P9OA++--9Os   r/  z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rO   isEnabledForloggingDEBUGtimer&  r#  r   r'  r  r1  r/   infor   r)  r*   benchmark_gpu_with_cuda_graphr:  rP   r4  )
rb   r/  r0  rP   start_tscreate_tensor_elapser9  load_elapseresbench_elapses
             r9   r   BenchmarkRequest.benchmark  s   
 ++GMM:yy{H ;))d.E.E \E }%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!=:c:B ))+0Kyy{H));;B?C--8]8C8C99;1L  H$ 	
1 1 	  RS<	 s   (F5 5*G"!G")r)  r(  r&  r%  r#  )
r%  r|   r&  r  r#  r  r(  Iterable[Any]r   r   r0  r  r/  r  r   zCallable[[], None]r   r0  r  r/  torch.Tensor | Noner   r   )r3   r4   r5   r6   r   rc   r1  r4  r:  r   r7   r1   r8   r9   r  r    s    // 9/ :	/
 "/ 
/>"*"1="	"
 $(	" %" !	"
 
" $(/$/ !/ 
	/ /r8   r  c                  ^    \ rS rSrSr     S         S	S jjrSS.     S
S jjrSrg)_TestBenchmarkRequesti  z
Supports unit testing. Defined in this file instead of the test file so the
TuningProcess sub-process can unpickle these objects.
Nc                @    Xl         X l        X0l        X@l        XPl        g r@   )rI   r`   sleepexccrash)rb   rI   r`   rR  rS  rT  s         r9   rc   _TestBenchmarkRequest.__init__  s     

r8   r6  c                  U R                   b=  [        R                  R                  [        S 5      [        U R                   5      :X  d   eU R                  (       a   [        R                  " U R                  5        U R                  (       a  U R                  eU R                  (       a  [        R                  " S5        U R                  $ )Nr(   )r`   rB   rC   rR   r,   r|   rR  rB  rS  rT  rz   exitrI   r.  s      r9   r   _TestBenchmarkRequest.benchmark,  sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r8   )rT  r`   rS  rI   rR  )        NNNF)
rI   r   r`   r   rR  zfloat | NonerS  zException | NonerT  r   rM  )r3   r4   r5   r6   r   rc   r   r7   r1   r8   r9   rP  rP    sv     !" $  	
   HL*1D	 r8   rP  c                  0    \ rS rSrSS.     SS jjrSrg)GPUDeviceBenchmarkMixini:  Nr6  c                  [        S / UQUP 5       5      n[        U5      S::  d
   SU 35       e[        S U 5       S5      n[        U5      n[        U5      S:X  a  [        [	        U5      5      nOUR                  5       nUR                  U5         [        R                  " XS9nUR                  5         S S S 5        U$ ! , (       d  f       W$ = f)Nc              3    #    U  H{  n[        U[        R                  5      (       d  M$  [        UR                  R
                  5      (       d  MJ  UR                  R                  c  Mc  UR                  R                  v   M}     g 7fr@   )r   torchTensorr   r`   typeindexr   tensors     r9   r   3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>A  s^      $
/&%,,/   v}}))*   ##	  FMM/s   #B"BB(Br(   zCan not mix devices c              3     #    U  HA  n[        UR                  R                  5      (       d  M)  UR                  R                  v   MC     g 7fr@   )r   r`   r`  rb  s     r9   r   rd  J  s5      +F&--,,- #""+s
   (AAcudar   )
r!   r   nextr   itercurrent_devicer`   r*   r   synchronize)	rb   r9  r/  r0  device_idx_setdevice_typer   
device_idxrH  s	            r9   r:   GPUDeviceBenchmarkMixin.do_bench;  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0''?C((* 1 
	 10 
s   %C
Cr1   rM  r3   r4   r5   r6   r:  r7   r1   r8   r9   r[  r[  :  s/    
 $(	 % !	
 
 r8   r[  c                  0    \ rS rSrSS.     SS jjrSrg)CPUDeviceBenchmarkMixini]  Nr6  c               .    [         R                  " U5      $ r@   )r*   benchmark_cpur8  s       r9   r:   CPUDeviceBenchmarkMixin.do_bench^  s     ((,,r8   r1   rM  ro  r1   r8   r9   rq  rq  ]  s/    
 $(	- %- !	-
 
- -r8   rq  c                     ^  \ rS rSrSr       S                               S	U 4S jjjr      S
S jrS rSS jrSr	U =r
$ )TritonBenchmarkRequestig  z
Represents a standalone benchmark request for a Triton Template.

Important: Instances of this class have to be serializable
across process boundaries. Do not put CUDA Tensors in here!
c                   > [         TU ]  XX45        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl	        Xl
        Xl        Xl        g r@   )superrc   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpackworkspace_sizeworkspace_zero_fill)rb   r%  r&  r#  r(  ry  rz  r{  r|  r}  r~  r  r  r  r  r  	__class__s                   r9   rc   TritonBenchmarkRequest.__init__o  sT    $ 	9KX& 0$"#6 %:"$8!(
,#6 r8   c                  [         R                  " U R                  U R                  5      n[        R                  SU R                  U R                  5        [        X0R                  5      R                  n[        U R                  5      nU R                  bu  SSKJn  [        R                  " U R                  4[        R                   UR"                  S9nU R$                  (       a  UR'                  5         UR)                  U5      nXuU'   SUR*                  l        0 n	SS Kn
SU
R1                  U5      R2                  ;   a  SU	S'   UR"                  R4                  S:X  a  SnOPUR"                  R4                  n[7        U5      nUR9                  U R:                  R"                  R(                  5      n[=        [        X0R                  5      [        R>                  R@                  RB                  RD                  5      (       a"  [F        RH                  " U/UQUPUQ70 U	DSU0D6$ [F        RH                  " U/UQUPUQ70 U	DUS	S
.D6$ )Nz"benchmark module key: %s, path: %sr   )WORKSPACE_ARG_PLACEHOLDERr   r`   FwarmupcpustreamT)r  benchmark_run)%r   load_by_key_pathrz  ry  rO   rP   r!  r%  runr   r(  r  r  r  r^  emptyuint8r`   r  zero_ra  __self__with_bandwidth_infoinspect	signature
parametersr`  r   get_raw_streamr#  r   	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rb   r/  r0  mod
run_methodr(  r  workspace_tensorworkspace_index
warmup_argr  r  rl  r   s                 r9   r1  "TritonBenchmarkRequest.make_run_fn  s+    **4+@+@$BRBRS0!!	
 S"2"2377
$//*

 *R${{$$&kkzz 
 '' &&((../HIO*:'27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 
 $$  	
    $$  	
  " r8   c                    [         R                  " U R                  U R                  5      n[	        XR
                  5      nUR                  5         UR                  S   R                  U l        g Nr   )	r   r  rz  ry  r!  r%  
precompile	launchersn_regs)rb   r  kernels      r9   r  !TritonBenchmarkRequest.precompile  sT    **4+@+@$BRBRS../&&q)00r8   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r%  ry  rz  r   s    r9   __str__TritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr8   )r  r  rz  ry  r  r~  r}  r{  r|  r  r  r  )r   r   r   r   r   NF) r%  r|   r&  r  r#  r  r(  rK  ry  r|   rz  r|   r{  r   r|  r   r}  r   r~  r   r  r   r  r   r  r   r  r   r  r   r   r   rL  r   r|   r3   r4   r5   r6   r   rc   r1  r  r  r7   __classcell__r  s   @r9   rv  rv  g  s    " $%%&$%%)$)!77 97 :	7
 "7 7 7 7 7 !7  #7 "7 7 7 #7  "!7" 
#7 7>E*E1=E	EN1U Ur8   rv  c                      \ rS rSrSrg)TritonGPUBenchmarkRequesti  r1   Nr2   r1   r8   r9   r  r    r:   r8   r  c                      \ rS rSrSrg)TritonCPUBenchmarkRequesti  r1   Nr2   r1   r8   r9   r  r    r:   r8   r  c                     ^  \ rS rSrSr  S               SU 4S jjjr      SS jrSS.SU 4S jjjrSS jrS	 r	SS
 jr
SrU =r$ )ExternKernelBenchmarkRequesti  a  
A class to handle extern kernel benchmark requests. This allows extern kernels
(like aten::mm) to be benchmarked in a subprocess, similar to Triton kernels.

Important: Instances of this class have to be serializable across
process boundaries. Do not put CUDA Tensors in here!
Nc                ^   > [         TU ]  XX45        XPl        U=(       d    0 U l        Xpl        g r@   )rx  rc   callable_pathkwargshas_out_variant)	rb   r%  r&  r#  r(  r  r  r  r  s	           r9   rc   %ExternKernelBenchmarkRequest.__init__  s,     	9KX*l.r8   c                   U R                  5       nU R                  (       a  [        R                  " U/UQ7SU06$ [        R                  " U/UQ76 $ )Nr/  )to_callabler  r  r  )rb   r/  r0  r9  s       r9   r1  (ExternKernelBenchmarkRequest.make_run_fn   sN     $$RA-ASAA $$R8-88r8   r6  c                 >^^ Ub  UR                  5       S:X  a  gU R                  (       d  [        T5      S:X  a  [        TU ]  " TSU06$ U R                  5       mT" T6 nUbt  [        R                  R                  R                  R                  U[        UR                  5       5      [        UR                  5       5      5        UR                  U5        U R                  (       a  [         R"                  " UU4S j5      $ [$        R&                  (       a  [)        UU4S j5      $ [         R                  " TT0 5      $ )Nr   rY  r/  c                    > T " T6 $ r@   r1   algor0  s   r9   <lambda>8ExternKernelBenchmarkRequest.benchmark.<locals>.<lambda>  s
    D-0r8   c                    > T " T6 $ r@   r1   r  s   r9   r  r    s
    m8Lr8   )numelr  r   rx  r   r  r^  _C_dynamoguardsassert_size_strider'  sizestridecopy_r)  r*   rD  r)   r}   r   )rb   r/  r0  out_newr  r  s     ` @r9   r   &ExternKernelBenchmarkRequest.benchmark  s    ?syy{a/3}#5#:7$m===##%DM*G  ''::U388:.cjjl0C 		'"--"@@0  EE/0LMM((}bAAr8   c                    g r@   r1   r   s    r9   r  'ExternKernelBenchmarkRequest.precompile!      r8   c                    SSK Jn  [        XR                  5      nU R                  (       a!  [
        R                  " U40 U R                  D6$ U$ )Nr   )extern_kernels)r  r  r!  r%  r  r  r  )rb   r  r9  s      r9   r  (ExternKernelBenchmarkRequest.to_callable%  s>     	D^%5%56;;$$R74;;77	r8   c                "    SU R                    S3$ )NzExternKernelBenchmarkRequest())r  r   s    r9   r  $ExternKernelBenchmarkRequest.__str__1  s    .t/A/A.B!DDr8   )r  r  r  )NT)r%  r|   r&  r  r#  r  r(  rK  r  r|   r  zdict[str, Any] | Noner  r   r   r   rL  )r0  r  r/  rN  r   r  )r3   r4   r5   r6   r   rc   r1  r   r  r  r  r7   r  r  s   @r9   r  r    s     )- $// 9/ :	/
 "/ / &/ / 
/ /	9*	91=	9		9 RV B B,
E Er8   r  c                      \ rS rSrSrg)ExternKernelGPUBenchmarkRequesti5  r1   Nr2   r1   r8   r9   r  r  5       	r8   r  c                      \ rS rSrSrg)ExternKernelCPUBenchmarkRequesti;  r1   Nr2   r1   r8   r9   r  r  ;  r  r8   r  c                  z   ^  \ rS rSrSr                SU 4S jjr      S	S jrS
S jrSS jrSr	U =r
$ )SubgraphBenchmarkRequestiA  z
Benchmark request for subgraph choices.

Pre-compiles the subgraph in the main process and stores
the module path/cache key for loading in subprocess.
c                J   > [         TU ]  XX45        XPl        X`l        Xpl        g r@   )rx  rc   ry  rz  sym_input_values)	rb   r%  r&  r#  r(  ry  rz  r  r  s	           r9   rc   !SubgraphBenchmarkRequest.__init__I  s'     	9KX& 0 0r8   c                  ^^^ [         R                  " U R                  U R                  5      mU R                  mUUU4S j$ )Nc                 .   > TR                  / TQT Q5      $ r@   )call)r0  r  r  s   r9   r  6SubgraphBenchmarkRequest.make_run_fn.<locals>.<lambda>^  s    sxx C"2 C] CDr8   )r   r  rz  ry  r  )rb   r/  r0  r  r  s     `@@r9   r1  $SubgraphBenchmarkRequest.make_run_fnX  s5     **4+@+@$BRBRS00DDr8   c                    g r@   r1   r   s    r9   r  #SubgraphBenchmarkRequest.precompile`  r  r8   c                <    SU R                    SU R                   S3$ )NzSubgraphBenchmarkRequest(z, r  )r%  ry  r   s    r9   r   SubgraphBenchmarkRequest.__str__d  s&    *4+;+;*<Bt?O?O>PPQRRr8   )rz  ry  r  )r%  r|   r&  r  r#  r  r(  rK  ry  r|   rz  r|   r  z	list[int]r   r   rL  r   r  r  r  s   @r9   r  r  A  s    11 91 :	1
 "1 1 1 $1 
1E*E1=E	ES Sr8   r  c                      \ rS rSrSrg)SubgraphGPUBenchmarkRequestih  r1   Nr2   r1   r8   r9   r  r  h  r:   r8   r  c                      \ rS rSrSrg)SubgraphCPUBenchmarkRequestil  r1   Nr2   r1   r8   r9   r  r  l  r:   r8   r  c                     ^  \ rS rSrSr S             SU 4S jjjrS r      SS jrSS jrS r	SS jr
SS	 jrSS
 jrSS jrSrU =r$ )CUTLASSBenchmarkRequestip  aM  
A class to handle CUDA (CUTLASS) benchmark requests. This class is for
managing the lifecycle of a CUDA kernel benchmark, including compiling
the source code, managing workspace memory, and executing the kernel.

Important: Instances of this class have to be serializable across
process boundaries. Do not put CUDA Tensors in here!
c                F  > [         TU ]  XX45        XPl        SU l        S U l        S U l        SU l        SU l        SU l        X`l	        US:X  a  [        O[        U l        [        U5      U l        U R                  R                  U R                  S5      u  U l        U l        g )Nr   F xpuso)rx  rc   source_coder  	workspaceDLL_workspace_size_updatedhash_keysource_filerl  r   r   codecache_clsr   r   write)rb   r%  r&  r#  r(  r  rl  r  s          r9   rc    CUTLASSBenchmarkRequest.__init__z  s     	9KX&#$.2&*',$ "&-8E-A\} 8 E*.*<*<*B*Bd+
't'r8   c                    [         R                  SU 5        U R                  R                  U R                  S5        [         R                  SU 5        g)zk
Precompile the CUDA source code to populate the CUDACodeCache.
This may happen in a separate thread pool.
Precompiling %sr  Done precompiling %sN)rO   rP   r  compiler  r   s    r9   r  "CUTLASSBenchmarkRequest.precompile  sB    
 	.5""4#3#3T:3T:r8   c          	       ^ U R                  5         U R                  5         [        U5      U/-    Vs/ s H  n[        UR	                  5       5      PM     nn[
        R                  SU R                  U R                  U R                  U R                  UU R                  5        [        U R                  R                  U R                  R                  5       5      5      n[        U R                  U R                  5      n[        S5      nU R                   S:  af  ["        R$                  " U R                   S-   S-  ["        R&                  UR(                  S9U l        [        U R*                  R	                  5       5      n[,        R.                  " U/UQU R                  QSPUPUP76 n U" 5         U$ s  snf ! [0         a-  n	[3        U	5      mU4S jn
U R5                  5         U
s Sn	A	$ Sn	A	ff = f)zW
Create a function to run the CUDA/XPU kernel with the given input and output tensors.
zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         r  Nc                    > [        T 5      er@   )RuntimeError)err_msgs   r9   raise_runtime_error@CUTLASSBenchmarkRequest.make_run_fn.<locals>.raise_runtime_error  s    "7++r8   )ensure_dll_loadedupdate_workspace_sizer   r   data_ptrrO   rP   r%  r  r  r  r(  r   r  ri  r!  r  r^  zerosfloat64r`   r  r  r  r
  r|   r4  )rb   r/  r0  rc  args
stream_ptrr  workspace_ptrretrJ   r  r  s              @r9   r1  #CUTLASSBenchmarkRequest.make_run_fn  s    	 ""$:>}:MQTPU:UV:U*+:UVMMHHOO	
 !!001F1F1U1U1WX

 TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
[ WH  	'!fG, !&&	's#   #F5,F: :
G1"G,&G1,G1c           
     $   U R                   (       a  g U R                  5         [        [        R	                  S U R
                   5       5      5      n[        US-   5       Vs/ s H  n[        S 5      PM     nn[        U R                  R                  U R                  R                  5       5      5      n[        U R                  U R                  5      n[        5       nU" / UQU R                  Q[!        U5      PS PUP76   U R                  R#                  5         UR$                  U l        [(        R+                  SU R&                  U R                  U R,                  U R.                  U R                  UU R                  5        SU l         g s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fr@   )r   )r   metas     r9   r   @CUTLASSBenchmarkRequest.update_workspace_size.<locals>.<genexpr>  s     G0F))0Fs   r(   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  r   r   fromkeysr&  r   r   r   r  ri  r!  r  r%  r   r(  r
   rj  valuer  rO   rP   r  r  )rb   unique_input_countr   r  r  r  c_workspace_sizes          r9   r  -CUTLASSBenchmarkRequest.update_workspace_size  sl   ''  MMG0F0FGG
 )..@1.D(EF(E1(EF!!001F1F1U1U1WX

 TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	))+.44 hMMHHOO		
 (,$; Gs   "Fc                    U R                   c9  U R                  R                  U R                  S5      u  U l         U l        U l        U R                   R                  5         g )Nr  )r  r  r]   r  r  r  openr   s    r9   r  )CUTLASSBenchmarkRequest.ensure_dll_loaded  sJ    888<8J8J8O8O  $95DHdmT%5 	r8   c                n    U R                   b!  U R                   R                  5         S U l         S U l        g r@   )r  r   r  r   s    r9   r4  &CUTLASSBenchmarkRequest.cleanup_run_fn   s(    88HHNNDHr8   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nr  z, self.source_file=z, self.hash_key=)r%  r  r  r   s    r9   r  CUTLASSBenchmarkRequest.__str__  s0    #$""$$8t'7'7&99JDMM;KLLr8   c                X    U R                   R                  5       nS US'   S US'   SUS'   U$ )Nr  r  Fr  )__dict__copyrb   states     r9   __getstate__$CUTLASSBenchmarkRequest.__getstate__	  s7    ""$e!k+0'(r8   c                :    U R                   R                  U5        g r@   )r)  rD   r+  s     r9   __setstate__$CUTLASSBenchmarkRequest.__setstate__  s    U#r8   )
r  r  r  r   rl  r  r  r  r  r  )rf  )r%  r|   r&  r  r#  r  r(  rK  r  r|   rl  r|   r   r   rL  r   r  )r   dict[str, Any])r,  r2  r   r   )r3   r4   r5   r6   r   rc   r  r1  r  r  r4  r  r-  r0  r7   r  r  s   @r9   r  r  p  s      "

 9
 :	

 "
 
 
 

 
0;6*61=6	6p$,LM$ $r8   r  c                  j   ^  \ rS rSr            SU 4S jjrS r      SS jrS	S jrSrU =r	$ )
CppBenchmarkRequesti  c                `   > [         TU ]  XX45        XPl        [        U5      U l        S U l        g r@   )rx  rc   r  r   r  r  )rb   r%  r&  r#  r(  r  r  s         r9   rc   CppBenchmarkRequest.__init__  s.     	9KX& --1r8   c                    [         R                  SU 5        [        R                  " U R                  SS9  [         R                  SU 5        g )Nr  r  rl  r  )rO   rP   r   r]   r  r   s    r9   r  CppBenchmarkRequest.precompile%  s<     	.5$**>3T:r8   c               h   [         R                  " U R                  SS9U l        [	        U5      U/-    Vs/ s H  o3R                  5       PM     nn[        R                  SU R                  U R                  UU R                  5        [        U R                  U R                  5      n[        S U R                   5       5      (       d   e[        R                  /[        U5      [        [	        U R                  5      5      -   -  Ul        [         R"                  " U/UQU R                  Q76 $ s  snf )Nr  r8  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr@   )r   ctypesc_ulonglong)r   args     r9   r   2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>;  s      R/3:c6#5#566/s   '))r   r]   r  r  r   r  rO   rP   r%  r(  r!  r  r<  r=  r   argtypesr  r  )rb   r/  r0  rc  r  r  s         r9   r1  CppBenchmarkRequest.make_run_fn,  s     $$T%5%55I04]0Cse0KL0Kf!0KLXHHOO	
 TXXt'7'78
R$//RRRRR%112ID122


   

 __
 	
! Ms   D/c                "    SU R                   < 3$ )Nr  )r%  r   s    r9   r  CppBenchmarkRequest.__str__G  s    #$""$%%r8   )r  r  r  )r%  r|   r&  r  r#  r  r(  rK  r  r|   r   r   rL  r  )
r3   r4   r5   r6   rc   r  r1  r  r7   r  r  s   @r9   r4  r4    sn    22 92 :	2
 "2 2 
2;
*
1=
	
6& &r8   r4  c                  ^   ^  \ rS rSrSr            SU 4S jjr      SS jrSrU =r$ )CuteDSLBenchmarkRequestiK  z;Benchmark request for CuteDSL (CUTLASS Python DSL) kernels.c                   > [         TU ]  XX45        UR                  5       n[        R                  " U5      u  U l        U l        g r@   )rx  rc   finalize_allr   r  rz  ry  )rb   r%  r&  r#  r(  r  finalized_coder  s          r9   rc    CuteDSLBenchmarkRequest.__init__N  s>     	9KX$1132=2C2CN2S/t/r8   c          	     n  ^^^	 [         R                  " U R                  U R                  5      nSSKJn  U R                   SU 3n[        X55      (       dG  [        U5       Vs/ s H   n[        [        X65      5      (       d  M  UPM"     nn[        SU SU 35      e[        X55      m	UU	U4S jnU$ s  snf )z
Create a function to run the CuteDSL kernel with the given input and output tensors.
Similar to TritonBenchmarkRequest.make_run_fn but for CuteDSL kernels.
r(   )MAIN_SUFFIXr   z-Could not find CuteDSL main kernel function 'z'. Available callables: c                 ~   > [        S5      n U R                  TR                  R                  5      nT" / TQTP7SU06$ )Nrf  r  )r   r  r`   ra  )r   r  r0  kernel_funcr/  s     r9   
run_kernel7CuteDSLBenchmarkRequest.make_run_fn.<locals>.run_kernelq  s@    7?%44SZZ5E5EFFBBsB6BBr8   )r   r  rz  ry  codegen.cutedsl.cutedsl_kernelrK  r%  hasattrdircallabler!  r
  )
rb   r/  r0  r  rK  main_func_namer   	availablerN  rM  s
    ``      @r9   r1  #CuteDSLBenchmarkRequest.make_run_fn[  s     **4+@+@$BRBRS 	@ ,,-Q{m<s++*-c(S($hws?Q6R(IS??OOghqgrs  c2	C
  Ts   #B2B2)rz  ry  )r%  r|   r&  r  r#  r  r(  ztuple[Any, ...]r  r&   r   r   rL  )	r3   r4   r5   r6   r   rc   r1  r7   r  r  s   @r9   rE  rE  K  sn    ETT 9T :	T
 $T #T 
T*1=	 r8   rE  c                 Z    [        5       n [        R                  " U R                  5        U $ r@   )r   atexitrx   r   )pools    r9   get_tuning_process_poolrZ  y  s    D
OODMM"Kr8   c                4    [        5       R                  U 5      $ )zG
Do benchmarking in a subprocess and return the perf number (latency).
)rZ  r   )r   s    r9   benchmark_in_sub_processr\    s     #$..w77r8   c                      \ rS rSr% SrSrS\S'   \R                  " 5       r	S\S'   Sr
S	\S
'   S r\S 5       r\S 5       rSS jrSS jrSS jrS rSS jrSS jrSS jrS r\S 5       rSrg)AutotuneProcessPooli  zf
Singleton pool manager for running autotuning (precompilation + benchmarking)
in a separate process.
NzAutotuneProcessPool | None	_instancezthreading.Lock_lockFr   _shutdown_for_inactivityc                t    U R                  5       U l        S U l        S U l        U R	                  5       U l        g r@   )
_init_pool_pool_warmup_future_warmup_start_time_init_timer_timerr   s    r9   rc   AutotuneProcessPool.__init__  s0    151B
2604$($4$4$6r8   c                    U R                   c:  U R                     U R                   c  U " 5       U l         SSS5        U R                   $ U R                   $ ! , (       d  f       U R                   $ = f)z*Get or create the singleton pool instance.N)r_  r`  r  s    r9   get_instance AutotuneProcessPool.get_instance  sV     == ==($'ECM  }}s}}	  }}s   A
A-c                    [         R                  (       d   S5       eU R                  c*  U R                  5       U l        U R	                  5       U l        U R                  $ )zGet the process pool.zFTo use AutotuneProcessPool, pipeline_max_autotune_gemm must be enabled)r)   pipeline_max_autotune_gemmrd  rc  rg  rh  r   s    r9   rY  AutotuneProcessPool.pool  sQ     00 	
T	
0 ::*DJ**,DKzzr8   c                L    [         S:  a  [        [         U R                  5      $ g r  ) AUTOTUNE_POOL_INACTIVITY_TIMEOUTr   _on_inactivity_timeoutr   s    r9   rg  AutotuneProcessPool._init_timer  s!    +a/94;V;VWWr8   c                T    U R                   b  U R                   R                  5         g g r@   )rh  record_callr   s    r9   _record_activity$AutotuneProcessPool._record_activity  s!    ;;"KK##% #r8   c                   [         R                  S[        5        U R                     U R                  b   U R                  R                  SS9  S U l        S U l        S[        l        S S S 5        g ! , (       d  f       g = f)NzAAutotuneProcessPool shutting down due to inactivity (timeout=%ds)Fr   T)	rO   rC  rr  r`  rd  r   rh  r^  ra  r   s    r9   rs  *AutotuneProcessPool._on_inactivity_timeout  sb    O,	

 ZZzz%

###/!
DK
 <@8 ZZs   A A00
A>c                    [         R                  " S5      n[        SUS9n[        R                  " U R
                  5        [        R                  S5        U$ )z
Get or create the process pool.

Uses ProcessPoolExecutor with 'spawn' context for CUDA safety.
ProcessPoolExecutor is lazily initialized - workers are not spawned
until the first submit() call, making this property non-blocking.
spawnr(   )r   
mp_contextz2AutotuneProcessPool created (workers spawn lazily))mpget_contextr   rX  rx   	_shutdownrO   rC  )rb   ctxrY  s      r9   rc  AutotuneProcessPool._init_pool  sH     nnW%"
 	'PQr8   c                   U R                   c  U R                     U R                   c  [        R                  " 5       U l        U R
                  R                  [        [        R                  R                  R                  R                  S9U l         U R                   R                  U R                  5        [        R!                  S5        SSS5        U R                   $ U R                   $ ! , (       d  f       U R                   $ = f)z
Submit a warmup job to eagerly spawn workers and initialize CUDA.

This is optional - call it early to hide spawn latency.
Returns the warmup future which can be ignored or awaited.
N)fp32_precisionzWarmup job submitted)re  r`  rB  perf_counterrf  rY  submit_init_autotune_subprocessr^  backendsrf  matmulr  add_done_callback_on_warmup_completerO   rC  r   s    r9   warm_upAutotuneProcessPool.warm_up  s     &&&..2.?.?.AD+*.))*:*:1',~~':':'A'A'P'P +; +D' ''99$:R:RS"''(>?  """t"""  """s   B1C++
Dc                *   SnU R                   b"  [        R                  " 5       U R                   -
  n UR                  5       n[        R                  SUU5        U R                  5         g! [         a  n[        R                  SU5        UeSnAff = f)z/Callback invoked when the warmup job completes.NzEAutotuneProcessPool warmup completed successfully in %.4f seconds: %sz4AutotuneProcessPool warmup failed after %.4f seconds)	rf  rB  r  rI   rO   rC  rw  rE   r   )rb   futurewarmup_elapsed_timerI   rJ   s        r9   r  'AutotuneProcessPool._on_warmup_complete  s    """."&"3"3"58O8O"O	]]_FW#
 !!# 	  F# G	s   7A+ +
B5BBc                   ^  T R                   R                  " U/UQ70 UD6nT R                  b  UR                  U 4S j5        U$ )z-Submit a job to the pool and return a Future.c                $   > TR                  5       $ r@   )rw  )r   rb   s    r9   r  ,AutotuneProcessPool.submit.<locals>.<lambda>	  s    t/D/D/Fr8   )rY  r  rh  r  )rb   r9  r  r  r  s   `    r9   r  AutotuneProcessPool.submit  s?    !!"6t6v6;;"$$%FGr8   c                    U R                   b!  U R                   R                  5         SU l         U R                  b!  U R                  R                  SS9  SU l        gg)zShutdown the pool on exit.NFr   )rh  quitrd  r   r   s    r9   r  AutotuneProcessPool._shutdown  sN    ;;"KKDK::!JJU+DJ "r8   c                    U R                   bD  U R                     U R                   b!  U R                   R                  5         SU l         SSS5        gg! , (       d  f       g= f)z+Explicitly shutdown the singleton instance.Nr_  r`  r  rk  s    r9   shutdown_instance%AutotuneProcessPool.shutdown_instance  sH     ==$==,MM++-$(CM  %s   /A
A!)rd  rh  re  rf  )r   zTimer | Noner   )r   Future[Any])r  r  r   r   )r3   r4   r5   r6   r   r_  r  	threadingLockr`  ra  rc   r  rl  propertyrY  rg  rw  rs  rc  r  r  r  r  r  r7   r1   r8   r9   r^  r^    s    
 -1I)0%NN,E>,%*d*7    
&@"(#(* ) )r8   r^  c                 Z    [         R                  =(       a    [        R                  (       + $ r@   )r)   ro  r^  ra  r1   r8   r9   use_pipelined_autotuningr    s!    )) 	=#<<<r8   c                    SSK nUR                  R                  5       (       a  UR                  " SSS9  XR                  R                  R
                  l        g)z1
Warmup function run in the autotune subprocess.
r   Nr(   rf  r   T)r^  rf  is_availabler  r  r  r  )r  r^  s     r9   r  r  &  sC      zz  Af%0>NN-r8   c                     U R                  5       nU$ ! [         a#    [        R                  SU SS9  [	        S5      s $ f = f)a
  
Run autotuning benchmarks in a subprocess.

This function is submitted to AutotuneProcessPool and runs in isolation
to prevent GPU contention with the main compilation process.

Args:
    picklable_choices: List of picklable choice information

Returns:
    timing
zFailed to benchmark choice %sT)exc_infor   )r   rE   rO   warningr   )benchmark_requesttimings     r9   run_autotune_in_subprocessr  5  sQ     ",,. + 	 	
 U|s    *A Ac                      \ rS rSr% SrSrS\S'   \R                  " 5       r	SSS jjr
\SS j5       rS rSSS	 jjr\SS
 j5       rSrg)PrecompileThreadPooliU  z
Thread pool for running precompilation asynchronously.

This allows the main compilation process to continue while
precompilation happens in background threads.
NzPrecompileThreadPool | Noner_  c                     [        US9U l        g )Nr   )r	   	_executor)rb   r   s     r9   rc   PrecompileThreadPool.__init__`  s    +Dr8   c                    SSK Jn  U R                  c@  U R                     U R                  c  U " U" 5       5      U l        S S S 5        U R                  $ U R                  $ ! , (       d  f       U R                  $ = f)Nr   )get_num_workers)r  r  r_  r`  )r  r  s     r9   rl  !PrecompileThreadPool.get_instancec  s]    D== ==($'(9$:CM  }}s}}  }}s    A  
A9c                    [         R                  " 5       n[        R                  " UR                  U5      nU R
                  R                  " U/UQ70 UD6$ r@   )contextvarscopy_contextr  r  r  r  r  )rb   r9  r  r  r  s        r9   r  PrecompileThreadPool.submitm  sE    &&(sww+~~$$R9$9&99r8   c                4    U R                   R                  US9$ )Nr   )r  r   r   s     r9   r  PrecompileThreadPool._shutdowns  s    ~~&&D&11r8   c                    U R                   bC  U R                     U R                   b   U R                   R                  SS9  S U l         S S S 5        g g ! , (       d  f       g = f)NFr   r  rk  s    r9   r  &PrecompileThreadPool.shutdown_instancev  sK    ==$==,MM+++7$(CM  %s   .A
A )r  )   )r   r   )r   r  )F)r   r   r   )r3   r4   r5   r6   r   r_  r  r  r  r`  rc   r  rl  r  r  r  r7   r1   r8   r9   r  r  U  sX     .2I*1NNEE  :2 ) )r8   r  c                  d    \ rS rSrSr0 r\SS j5       r\S	S j5       r	\      S
S j5       r
Srg)AsyncAutotuneri  a  
Handles asynchronous autotuning of kernel choices in a separate process.

This class manages the lifecycle of autotuning:
1. Accepts precompiled choices from the main process
2. Submits benchmarking work to AutotuneProcessPool
3. Returns results via a Future

Usage:
    autotuner = AsyncAutotuner(choices)
    autotuner.start()  # Kicks off async benchmarking
    timings = autotuner.get_results()  # Blocks until complete
c                (    U R                  5       U-   $ r@   )r  )r   
inputs_keys     r9   get_choice_hashAsyncAutotuner.get_choice_hash  s     :--r8   c                    U H  n[         R                  X25      nU[         R                  ;   a  M.  [        USS5      c   S5       e[        R                  5       R                  [        UR                  5      nU[         R                  U'   M     g)z
Start asynchronous autotuning in a subprocess.

This method:
1. Extracts picklable benchmark requests from choices
2. Submits benchmarking work to AutotuneProcessPool
3. Returns immediately (non-blocking)
r   Nzbmreq is None for choice)	r  r  choice_hash_to_futurer!  r^  rl  r  r  r   )r  r   r  r   choice_hashautotune_futures         r9   ra   AsyncAutotuner.start  s     F(88LKnBBB67D1= *= 2>>@GG*O
 APN00= r8   c                    0 nU H;  n[         R                  XB5      n[         R                  U   R                  5       X4'   M=     U$ )z
Get autotuning results, blocking until complete.

Args:
    timeout: Maximum time to wait in seconds. None means wait forever.

Returns:
    Dict mapping ChoiceCaller to benchmark timing
)r  r  r  rI   )r  r   r  timingsr   r  s         r9   get_resultsAsyncAutotuner.get_results  sE     F(88LK,BB;OVVXGO  r8   r1   N)r   r%   r  r|   r   r|   )r   list[ChoiceCaller]r  r|   )r   r  r  r|   r   zdict[ChoiceCaller, float])r3   r4   r5   r6   r   r  r   r  r  ra   r  r7   r1   r8   r9   r  r    sc     . . P P6 (69	" r8   r  )r   r   r   r   )r  r|   r   r   )r  r  r   r   )s
__future__r   rX  r  r<  dataclassesr  r@  multiprocessingr~  rB   rW   r   ru   r~   rz   r  rB  r   collections.abcr   r   r   concurrent.futuresr   r   r	   r
   r   r   r   typingr   r   r   r^  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   r   $torch._inductor.compile_worker.timerr   torch._inductor.utilsr   r   r   r   r   torch._loggingr    torch.utils._ordered_setr!   r   rC   rR   rr  typesr$   r  r%   r&   r'   r  r)   runtime.benchmarkingr*   virtualizedr+   r,   r3   rO   rE   r/   r<   r   r  r  LayoutOrBuffer	dataclassr   r  rP  r[  rq  rv  r  r  r  r  r  r  r  r  r  r4  rE  cacherZ  r\  r^  r  r  r  r  r  r1   r8   r9   <module>r     s   "        	     
    8 8 N N 2 2 ) )  $ C .   7  - /
 $'JJNNCUK$       -  . "8\:		 	t tno od RYY& .
 .
 .
b g g gT, D   F- -vU- vUr	 79O 		 79O 	JE#3 JEZ	9		9	$S/ $SN	"9;S 		"9;S 	a$57G a$H4&13C 4&n+57G +\  8'8&8S) S)l'
@') ')TC Cr8   