
    3j_                   f   % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SKJrJr  S SKJrJrJrJrJr  S SKJr  S SKJr  SS	KJrJ r   \(       a  S S
K!J"r"J#r#J$r$  S SK%J&r&  S SK'J(r(  SSK)J*r*  S SK+r+S SK,r,S SK-r,S SK.J/s  J0r1  S SK2J3r3J4r4  S SK5J6r6  S SK7J8r8J9r9  S SK:J;r;  S SK<J=r=J>r>  S SK?J@r@  S SKAJBrB  S SKCJDrDJErEJFrF  S SKGJHrH  SSKIJJrJJKrKJLrLJMrMJrJNrN  SSKOJPrP  SSKQJRrRJSrSJTrT  SSKUJVrVJWrW  SSKMJXrXJYrYJZrZJ[r[  SSK\J]r]J^r^  SSK_J`r`  SSKJaraJbrbJcrcJdrdJereJfrf  SSKgJhrh  SS KiJjrjJkrk  SS!KlJmrmJnrn  SS"KoJprpJqrq  SS#KrJsrs  SS$K/JtrtJuruJvrvJwrwJxrxJyryJzrzJ{r{J|r|J}r}J~r~JrJrJrJrJrJrJr  SS%KJr  \GR                  " \5      r\,GR                  GR                  \S&5      r\,GR                  GR                  \S'5      r\,GR                  GR                  \S(5      r\,GR                  GR                  \S)5      r\S*   rS+\S,'   \" S-5      r\" S.5      r\GR,                   " S/ S05      5       r\GR,                   " S1 S25      5       r " S3 S45      r\GR,                   " S5 S65      5       r\GR,                   " S7 S8\5      5       r " S9 S*5      r\GR:                  SlS: j5       rSmS; jrSnS< jrSoS= jr\GR,                  " S>S?9 " S@ SA5      5       rSpSB jr " SC SD5      r        SqSE jr " SF SG\5      r " SH SI\5      r " SJ SK\5      r    SrSL jr        SsSN jr " SO SP\5      r " SQ SR\5      r " SS ST\5      r " SU SV\5      r " SW SX\5      r St       SuSY jjr      SvSZ jrSwS[ jr            SxS\ jr                SyS] jr\GR,                   " S^ S_5      5       r\GRl                  " 5       rSzS` jrS{Sa jr    S|Sb jrS}Sc jrS}Sd jrS~Se jrS~Sf jrS~Sg jrS~Sh jr " Si SM5      r " Sj Sk5      rg)    )annotationsN)Counterdefaultdict)as_completedFuture)AnyGenericTYPE_CHECKING	TypeAliasTypeVar)	ParamSpec
OrderedSet   )ComputedBuffer	Pointwise)CallableIteratorSequence)
ModuleType)EnterCudaStreamContextLine)PythonWrapperCodegen)countersdynamo_timed)use_pipelined_autotuning)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)get_stream_name)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)DevicePropertiesReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsget_op_namesGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder   PartitionType_T_Pc                  z    \ rS rSr% SrS\S'   SrS\S'   SrS\S'   S	 r\	SS
 j5       r
\	 S   SS jj5       rSrg)FusionResultm   Nzbool | Noneshould_fusezCallable[[], bool] | Nonecallable_fnLambdaFuture | Nonefuturec                V    U R                   S LU R                  S L-  (       d   S5       eg )NzLFusion result should contain either fusion decision or callable_fn, not both)rf   rg   selfs    S/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/_inductor/scheduler.py__post_init__FusionResult.__post_init__s   s0      ,1A1A1MN 	
Z	
N    c                    [        US9$ )N)rf   rd   )clsrf   s     rm   fuseFusionResult.fusex   s    44rp   c                    [        XS9$ )Nrg   ri   rr   )rs   rg   ri   s      rm   from_callableFusionResult.from_callable|   s     CCrp    )rf   boolN)rg   Callable[[], bool]ri   rh   )__name__
__module____qualname____firstlineno__rf   __annotations__rg   ri   rn   classmethodrt   rx   __static_attributes__rz   rp   rm   rd   rd   m   sf    #K#-1K*1"&F&

 5 5 LPD,D6ID Drp   rd   c                  L    \ rS rSr% S\S'   S\S'   S\S'   SrS\S	'   SS
 jrSrg)PendingFusion   r}   rg   r_   node1node2Nrh   ri   c                2    U R                   U R                  4$ r|   r   r   rk   s    rm   get_fusion_nodesPendingFusion.get_fusion_nodes   s    

DJJ''rp   rz   )return+tuple[BaseSchedulerNode, BaseSchedulerNode])r~   r   r   r   r   ri   r   r   rz   rp   rm   r   r      s$    ##"&F&(rp   r   c                  D   \ rS rSrSr\SS j5       r\SS j5       r\      SS j5       r	\SS j5       r
\      SS j5       r\      SS j5       r\SS	 j5       r\      SS
 j5       r\SS j5       r\      SS j5       r\SS j5       r\SS j5       rSrg)MixOrderReduction   z
This class contains utility functions to decide if we should fuse reductions
reducing across different dimensions of the same input tensor.
c                p    U R                  5       =(       a     [        S U R                  5        5       5      $ )Nc              3     #    U  Hl  n[        U[        5      (       d  M  UR                  5       (       d  M1  [        UR                  [        5      (       d  MR  UR                  R
                  S Lv   Mn     g 7fr|   )
isinstanceSchedulerNodeis_reductionnoder   _split_size.0subnodes     rm   	<genexpr>7MixOrderReduction.is_split_reduction.<locals>.<genexpr>   sZ      +
+'=1 1 $$& 1 7<<8	 1GLL$$D0+s   A6A6A6A6)r   all	get_nodesr   s    rm   is_split_reduction$MixOrderReduction.is_split_reduction   s3      " 
s +
>>++
 (
 	
rp   c                   U R                  U5      (       Ga  S nS nUR                  5        GH  n[        U[        5      (       a4  UR	                  5       (       a  [        UR
                  [        5      (       d  MO  UR
                  R                  c   e[        R                  R                  R                  [        UR
                  R                  5      5      nUR
                  R                  c   e[        R                  R                  R                  [        UR
                  R                  5      5      nUc  UnUnGM  [        R                  R                  R                  X%5      (       d   U SU 35       e[        R                  R                  R                  X65      (       a  GM   U SU 35       e   Uc   eX#4$ UR                  S   $ )N v.s. r   )r   r   r   r   r   r   r   _original_rangesrZ   graphsizevarssimplifyrY   _original_reduction_rangesstatically_known_equalsgroup)rs   r   xnumelrnumelr   	curxnumel	currnumels          rm   get_numel_rnumel"MixOrderReduction.get_numel_rnumel   s   !!$''FF>>+w66,,.."7<<@@||44@@@GG,,55!',,"?"?@	 ||>>JJJGG,,55!',,"I"IJ	 >&F&F77++CC  4 	{34  77++CC  4 	{34 1 ,8 %%%##::a= rp   c                    U R                  U5      nU R                  U5      n[        U5      S:w  d  [        U5      S:w  d  X4:X  a  g[        U5      [        [        U5      5      :H  $ )N   F)r   lentuplereversed)rs   r   r   g1g2s        rm   has_mix_reduction_orders*MixOrderReduction.has_mix_reduction_orders   sX     !!%(!!%(r7a<3r7a<28RyE(2,///rp   c                   SnUR                   R                   H.  n[        U[        5      (       d  M  UR                  U:X  d  M,  Un  O   U(       d  gUR
                  nUR                   R                  nU(       dI  [        U[        5      (       d   [        U5       5       eUR                  S   R                   R                  nU(       d   e[        U5      [        UR                  5      -
  (       d  g[        R                  R                  R                  [!        UR"                  5      [!        UR%                  5       5      5      (       a  gg)z0
The access to 'buf' is not a broadcast access.
NFr   T)read_writesreadsr   r4   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r"   rZ   r   r   r   rY   sizevalues)rs   bufr   	found_depdepr   r   s          rm   _is_full_access!MixOrderReduction._is_full_access   s   
 	##))C#y))chh#o	 *
 %%00
d$677HDJ<H7Q33>>Jz:&E4F4F)GG
 7733)..)=9J9J9L+M
 
 rp   c                    / nUR                  5       UR                  5       -  nU HD  nU R                  XQ5      (       d  M  U R                  XR5      (       d  M3  UR                  U5        MF     U$ r|   )used_buffer_namesr   append)rs   r   r   outcommon_readsr   s         rm   get_common_read!MixOrderReduction.get_common_read   sb     ..053J3J3LLC""3..33F3Fs3R3R

3   
rp   c                <    [        U R                  X5      5      S:  $ Nr   )r   r   rs   r   r   s      rm   has_common_read!MixOrderReduction.has_common_read   s     3&&u4599rp   c                    U R                  U5      n[        R                  R                  R	                  US   US   -  SS9$ )Nr   r   fallback)r   rZ   r   r   optimization_hint)rs   r   r   s      rm   	get_numelMixOrderReduction.get_numel  s>    !!$'ww11"Q%"Q%-!1LLrp   c                $    U R                  U5      $ r|   )r   r   s      rm   get_fusion_score"MixOrderReduction.get_fusion_score	  s    
 }}U##rp   c                (   [         R                  R                  (       d  g[        R                  R
                  (       a  gUR                  5       (       a  UR                  5       (       d  gUR                  5       R                  nUS;  d  [        U5      S:w  a  gUR                  5       (       a  UR                  5       (       d  gUR                  UR                  5       -  (       d"  UR                  UR                  5       -  (       a  gU R                  X5      (       d  g[        R                  X5      n[!        U5      S:X  a  gU R#                  U5      (       a  XpeOU R#                  U5      (       a  X!peOgU R%                  U5      nUu  p[         R                  R&                  (       d  Sn
[        R                  R(                  R+                  [,        R.                  " X-  U
5      5      (       d  g[        R                  R(                  R+                  [,        R.                  " XS-  5      5      (       d  g[        R                  R(                  R+                  [,        R.                  " US5      5      (       d  g[1        S UR3                  5        5       5      (       a  g[        R                  R(                  R5                  U	S	5      (       d  g[        R7                  U5      (       a  g[9        S
 UR3                  5        5       5      nU$ )z@
Check whether we can fuse two reductions with mix loop orders.
F)cudaxputritonr   i  P r   i   c              3     #    U  H]  nUR                  5       (       d  M  UR                  R                  R                  [        R
                  [        R                  4;  v   M_     g 7fr|   )r   r   datareduction_hintrD   INNERDEFAULTr   s     rm   r   -MixOrderReduction.can_fuse.<locals>.<genexpr>[  sU      
 7##%GLL,,##%%
 7s   A'AA'i @  c              3     #    U  H9  nUR                  5       (       d  M  UR                  R                  5       S ;   v   M;     g7f)>   sumprodN)r   r   get_reduction_typer   s     rm   r   r   q  s@      
 2##%GLL++-
 2s
   A$A)r(   r   mix_order_reductionrZ   r   cpp_wrapperrU   
get_devicer   rL   r   	ancestorsget_operation_namesr   r   r   r   is_contiguous_noder   #mix_order_reduction_non_strict_moder   guard_or_truesympyGeanyr   statically_known_leqr   r   )rs   r   r   device_typer   contiguous_node
other_noder   nrowncol
size_thresr   s               rm   can_fuseMixOrderReduction.can_fuse  s_   
 }}00 77||~~U\\^^&&(--.";/8;!!##5+=+=+?+?OOe7799OOe7799  ++E99 )88F|!!!%((*/Z##E***/Z!!/2
 }}@@ #J
 77##11%((4;
2STT
 77##11%((42JKK
 77##11%((42FGG  
 +446
 
 
 
 ww44T9EE//@@  
 &//1
 
 
rp   c                $    U R                  X5      $ r|   )r  r   s      rm   are_mix_order_reductions*MixOrderReduction.are_mix_order_reductions|  s     ||E))rp   c                h   ^ ^ [        U U4S jTR                  R                   5       5      (       d  gg)Nc              3  \   >#    U  H!  nTR                  UR                  T5      v   M#     g 7fr|   )is_contiguous_loadr   )r   r   rs   r   s     rm   r   7MixOrderReduction.is_contiguous_node.<locals>.<genexpr>  s)      
>TsC""388T22>Ts   ),FT)r   r   r   )rs   r   s   ``rm   r   $MixOrderReduction.is_contiguous_node  s1     
>B>N>N>T>T
 
 
 rp   c                :   SSK Jn  UR                  5        H  n[        U[        5      (       d   eUR
                  nUR                  UR                     nU Vs/ s H   owR                  U:X  d  M  UR                  PM"     nn[        U5      S:X  a  M}  U Hy  n	UR                  U	   n
UR                  n[        UR                  5       5      n[        R                   R"                  R%                  U
UU5      nUS   S:X  a  Mm  US   S:X  a  Mx      g   M     gs  snf )Nr   )MemoryUsageTyper   FT)torch._inductor.loop_bodyr  r   r   r   _bodymemory_usageLOADbuffer_name
index_namer   indexing_exprsr   listkeysrZ   r   r   stride_vars)rs   r   parent_noder  r   	loop_bodyentrieseindex_namesr  
index_exprr   var_symbolsr  s                 rm   r  $MixOrderReduction.is_contiguous_load  s   =))+DdM2222

I,,_-A-ABG18QAMMS<P<1<<KQ;1$ *
&55jA
&11
 #:??#45gg..:: $B1,B10D  * ,2 + Rs   D1Drz   Nr   r_   r   r{   )r   r_   r   ztuple[sympy.Expr, sympy.Expr]r   r_   r   r_   r   r{   )r   strr   r_   r   r{   )r   r_   r   r_   r   	list[str])r   r_   r   intr   r_   r   r_   r   r$  )r   r"  r  r_   r   r{   )r~   r   r   r   __doc__staticmethodr   r   r   r   r   r   r   r   r   r  r  r   r  r   rz   rp   rm   r   r      sv   
 
 
 #! #!J 	0%	0.?	0		0 	0  B 	%	.?			 	 :%:.?:	: :
 M M $%$.?$	$ $ h hT *%*.?*	* *
    rp   r   c                      \ rS rSr% S\S'   S\S'   S\S'   \R                  " \S9rS	\S
'   \R                  " \	S9r
S\S'   SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSrg) SchedulerBufferi  	Scheduler	schedulerz	ir.Bufferr   BaseSchedulerNode | Nonedefining_op)default_factorylist[NodeUser]usersrA   
mpi_bufferc                D    U R                   nUc   eUR                  5       $ r|   )r-  get_name)rl   ops     rm   defining_op_name SchedulerBuffer.defining_op_name  s#    ~~{{}rp   c                @    [        U R                  R                  5      $ r|   )hashr   r   rk   s    rm   __hash__SchedulerBuffer.__hash__  s    DIINN##rp   c                   [        5       nU R                  5       nUR                  U S[        U R                  5      R
                   35        UR                  U SU R                  R                   35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        [        U R                  5      S::  a0  UR                  U SU R                   35        UR                  5       $ UR                  U S35        UR                  S5         U R                   H  nUR                  U S35        M     S S S 5        UR                  S	5        UR                  5       $ ! , (       d  f       N/= f)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rR   r3  	writeliner   r   r~   layoutget_aliasespformatget_mutationsr   r0  indentgetrawvalue)rl   resultr   users       rm   	debug_strSchedulerBuffer.debug_str  s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! JJD$$vQZ0 ' " S!!!##	 "!s   *(F;;
G	c                6    U R                   R                  5       $ r|   r   r3  rk   s    rm   r3  SchedulerBuffer.get_name      yy!!##rp   c                0   U R                   c   eU R                   R                  5       (       d  g U R                   R                  5       (       dV  U R                   R                  5       (       d7  [	        U R                   R                  5       [        R                  5      (       a4  [        R                  R                  R                  U R                   5        g [        [        R                  S5      (       a  U R                  5       [        R                  R                  ;   a  [        R                  R                  U R                  5          nXR                   R"                  ;   a$  U R                   R"                  U   R                   nO#U R                   R$                  U   R                   n[        R                  R                  R'                  UU R                   5        g [        R                  R                  R                  U R                   5        g )Nargs)r   should_allocateget_inputs_that_alias_outputget_mutation_namesr   get_output_specr+   CommBufferLayoutrZ   r   wrapper_codecodegen_allocationhasattrkernelr3  inplace_update_buffersr+  name_to_donated_buffername_to_bufcodegen_inplace_reuse)rl   input_buffer_nameinput_buffers      rm   allocateSchedulerBuffer.allocate  sc   yy$$$yy((** II2244yy++--$))335r7J7JKKGG  33DII> AHHf%%188#B#BB !" ? ? P NN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rp   c                &   U R                   c   e[        U R                   R                  [        R                  5      (       d  [        U R                   5      (       a  gU R                   H$  n[        UR                   [        5      (       d  M$    g   gNFT)r   r   r@  r+   r?   rV   r0  
OutputNode)rl   uses     rm   can_freeSchedulerBuffer.can_free  sm    yy$$$dii&&66:SII;
 ;
 ::C#((J//  rp   c                4   0 nU Hr  n[        UR                  5      U;   a?  UR                  U[        UR                  5         5      U[        UR                  5      '   M[  X2[        UR                  5      '   Mt     [        UR	                  5       5      U l        g r|   )idr   merger  r   r0  )rl   r0  rF  rd  s       rm   	set_usersSchedulerBuffer.set_users   sm    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
rp   c                T    U R                   c   eU R                   R                  5       $ r|   )r   rQ  rk   s    rm   rA  SchedulerBuffer.get_aliases
  s%    yy$$$yy5577rp   c                T    U R                   c   eU R                   R                  5       $ r|   )r   rR  rk   s    rm   rC  SchedulerBuffer.get_mutations  %    yy$$$yy++--rp   c                R    U R                   R                  5       R                  5       $ r|   )r   rS  r   rk   s    rm   r   SchedulerBuffer.get_device  s    yy((*5577rp   )r0  Nr   r"  r   r$  r   Noner   r{   )r0  r/  r   rv  r   zSequence[str]r   torch.device | None)r~   r   r   r   r   dataclassesfieldr  r0  rA   r1  r5  r9  rH  r3  r_  re  rj  rA  rC  r   r   rz   rp   rm   r)  r)    sv    
O))'--dCE>C.9.?.?3/J+ 
$$($?B
+8.8rp   r)  c                  $    \ rS rSr% SrS\S'   Srg)SchedulerDonatedBufferi  Nr,  r-  rz   )r~   r   r   r   r-  r   r   rz   rp   rm   r~  r~    s    ,0K)0rp   r~  c                     \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S\S'   S\S'   SrS\S'   S\S'   S\S'   SrS\S'   S\S'   S\S'   SrS\S'   SVS  jrSWS! jr	SXS" jr
SXS# jrSXS$ jrSYS% jrSXS& jrSZS' jr      S[S( jrS\S) jrS]S* jrS^S+ jrS_S, jr      S`S- jrSZS. jrSaS/ jrSaS0 jrSZS1 jrSZS2 jr    SbS3 jrSXS4 jrSXS5 jr\SaS6 j5       r\SaS7 j5       r \S^S8 j5       r!\S^S9 j5       r"ScS: jr#SdS; jr$SeS< jr%SfS= jr&S^S> jr'S^S? jr(S^S@ jr)S^SA jr*S^SB jr+S^SC jr,S^SD jr-S^SE jr.SgSF jr/S^SG jr0SZSH jr1 Sh     SiSI jjr2\SjSJ j5       r3\SjSK j5       r4\SjSL j5       r5      SkSM jr6      SlSN jr7\SmSO j5       r8SnSP jr9\SnSQ j5       r:SoSR jr;SpSS jr<\=    SqST j5       r>SUr?g)rr_   i  OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r   
last_usager$  min_input_distancemax_input_distance	min_order	max_orderrB   mpi_nodedict[str, str]mutation_renamesNir.Operation | Noner   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_namefloat | Noneoverride_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFr{   writtenc                     Xl         S U l        g )Nc                     / $ r|   rz   )rO  kwargss     rm   <lambda>,BaseSchedulerNode.__init__.<locals>.<lambda>5  s    Brp   )r+  debug_device_str)rl   r+  s     rm   __init__BaseSchedulerNode.__init__2  s    $-& 	rp   c           	     z   Xl         [        5       U l        SU l        SU l        [        [
           " 5       U l        SU l        UR                  5        Vs/ s H  n[        U R                  UU S9PM     snU l        U R                   Vs0 s H  o3R                  5       U_M     snU l        0 U l        g s  snf s  snf )Nr   F)r+  r   r-  )r   r   r   r  r  r"  r  r  get_outputsr)  r+  r  r3  r  r  )rl   r   outputr   s       rm   _init_from_node!BaseSchedulerNode._init_from_node8  s    	#"#"#$
   **,
 - .. 
 -
 @D||L| 3|L !#
  Ms   B3	B8c                V    [        U 5      R                   SU R                  5       < S3$ )Nz(name=)r   r~   r3  rk   s    rm   __repr__BaseSchedulerNode.__repr__R  s'    t*%%&fT]]_,?qAArp   c                   U R                  5       n[        5       nUR                  U S[        U 5      R                   S[        [        U SS5      5      R                   SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
U R                   SU SU R                   SU S35        UR                  5          U R                  5        H"  nUR                  UR                  5       5        M$     SSS5        UR!                  S5         UR                  U R#                  5       5        UR+                  5       R-                  5       $ ! , (       d  f       N]= f! [$         a    [&        R)                  SSS9   NOf = f)#Longer form printout for trace logsr<  (r   N)

.writes = 
.unmet_dependencies = .met_dependencies = .min_input_distance = .max_input_distance = z.outputs = [
        r>  Ignoring error in debug_str()Texc_info)r3  rR   splicer   r~   getattrrB  r   writesr  r   r  r  rD  r  rH  r?  debug_str_extra	ExceptionlogwarningrE  rstrip)rl   r   r   r   s       rm   rH  BaseSchedulerNode.debug_strU  s   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU VT445 6T445 6 	
	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   7FF$ 
F!$GGc                    g)N rz   rk   s    rm   r  !BaseSchedulerNode.debug_str_extrap      rp   c                $    U R                  U 5      $ r|   )r  rk   s    rm   _debug_str_for_device'BaseSchedulerNode._debug_str_for_devices  s    $$T**rp   c                   [        U R                  SS 5      nSn[        U[        R                  R
                  R                  5      (       a$  SUR                  UR                  5       /SSS9-   nOe[        U[        R                  R
                  R                  5      (       a2  SUR                  UR                  5       UR                  5       /SSS9-   nU  U 3$ )Nr   r  , F)shorten	multiline)r  r   r   torch	_inductorr+   r   
str_helperget_size	Reductionget_reduction_sizer   )rl   
maybe_datadata_strs      rm   debug_str_short!BaseSchedulerNode.debug_str_shortv  s    TYY5
j%//"4"4">">??j33$$&'% 4  H 
EOO$6$6$@$@AAj33..0*2O2O2QR 4  H
 z""rp   c                p    [         R                  SU U R                  U R                  R                  5        g )Nz(%s: unmet_dependencies = %s, writes = %s)r  infor  r   r  rk   s    rm   log_detailsBaseSchedulerNode.log_details  s,    6####		
rp   c                    gNFrz   )rl   self_dep	other_deps      rm   reorder_loops_by_dep_pair+BaseSchedulerNode.reorder_loops_by_dep_pair       rp   c                    S U R                   R                  5        5        Vs0 s H  nX!;   d  M
  X!U   _M     snU l        U R                  U R                   R	                  U R                  5      5        g s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fr|   r   r   r   s     rm   r   9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>  s     Q-Pc-P   )r   reads_and_writesr  set_read_writesrename)rl   renamesr   s      rm   update_mutated_names&BaseSchedulerNode.update_mutated_names  ss     RT-=-=-N-N-PQ!
Q  D$-Q!

 	T--44T5J5JKL!
s
   	A7	A7c                X    U R                  U R                  R                  U5      5        g r|   )r  r   	with_readrl   r   s     rm   add_fake_depBaseSchedulerNode.add_fake_dep  s!    T--77<=rp   c                B    [        S U R                  5        5       5      $ )Nc              3  n   #    U  H+  oR                  5       =(       d    UR                  5       v   M-     g 7fr|   )rA  rC  r   r   s     rm   r   =BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s*      
@ROO4!2!2!44@Rs   35)r   r  rk   s    rm   has_aliasing_or_mutation*BaseSchedulerNode.has_aliasing_or_mutation  s%     
@D@P@P@R
 
 	
rp   c                f    Xl         U R                   R                  U l        U R                  5         g r|   )r   r   r  
prune_deps)rl   rws     rm   r  !BaseSchedulerNode.set_read_writes  s&    "&"2"2"8"8rp   c                b   ^ U R                  5       n[        U4S jU 5       5      nX1-
  U l        g )Nc              3  F   >#    U  H  nTR                  X5      v   M     g 7fr|   )get)r   kmutation_real_names     rm   r   3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>  s      !U1"4"8"8">">   !)used_or_aliased_buffer_namesr   r  )rl   future_used_buffersr  used_bufferss     ` rm   set_last_usage BaseSchedulerNode.set_last_usage  s-     88:!!U!UU&<rp   c                J    U R                    H  nUR                  5         M     g r|   )r  r_  )rl   r   s     rm   mark_runBaseSchedulerNode.mark_run  s    <<CLLN  rp   c                    [        S [        R                  " U R                  R                  U R                  R
                  5       5       5      $ )Nc              3  :   #    U  H  nUR                   v   M     g 7fr|   r  r  s     rm   r   6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s      
W HHW   )r   	itertoolschainr   r   r  rk   s    rm   r   #BaseSchedulerNode.used_buffer_names  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rp   c                  ^ [        5       m[        R                  " U R                  R                  U R                  R
                  5       Vs/ s H7  n[        U[        5      (       a  UR                  (       a  M+  UR                  PM9     nn[        U5      S:  a  UR                  5       nTR                  U5        [        R                  R                  R!                  U5      (       aD  UR#                  U4S j[        R                  R                  U   R%                  5        5       5        [        U5      S:  a  M  T$ s  snf )z
Returns buffer names used by this node, including aliases.

Note: is_fake WeakDeps are excluded since they are purely for ordering
and should not affect buffer lifetime.
r   c              3  8   >#    U  H  nUT;  d  M  Uv   M     g 7fr|   rz   )r   alias
used_namess     rm   r   ABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s(      "5 J.	 E"5s   
	)r   r
  r  r   r   r  r   r6   is_faker   r   popaddrZ   r   name_to_bufferr  extendrQ  )rl   r   depsr  s      @rm   r  .BaseSchedulerNode.used_or_aliased_buffer_names  s     '1l
 !t'7'7'='=t?O?O?V?VW
WsG,, CHHW 	 

 $i!m((*CNN3ww%%))#.. !"!7!7"224"5 	 $i!m !
s   *E;Ec                N   ^  [        U 4S jT R                   5       5      T l        g )Nc              3  t   >#    U  H-  nUR                   TR                  R                  ;  d  M)  Uv   M/     g 7fr|   )r   r+  available_buffer_namesr   r   rl   s     rm   r   /BaseSchedulerNode.prune_deps.<locals>.<genexpr>  s0      -
.xxt~~DDD C.s   (8	8r   r  rk   s   `rm   r  BaseSchedulerNode.prune_deps  s#    ", -
..-
 #
rp   c                   ^ ^ SU 4S jjm[        U4S jT R                  R                   5       5      nT R                  T R                  R	                  U5      5        g )Nc                  > [        U [        5      (       d  gU R                  TR                  R                  ;  a  gTR                  R                  U R                     R                  5       nU[        R                  R                  ;   $ r  )	r   r6   r   r+  r[  r5  rZ   r   removed_operations)r   op_namerl   s     rm   should_prune7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  sb    c7++xxt~~999nn00:KKMGagg8888rp   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fr|   rz   r   r   r$  s     rm   r   4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  s      
1C\#5FCC1   !	!r   r3   r   r{   )r   r   r   r  remove_reads)rl   	to_remover$  s   ` @rm   prune_weak_deps!BaseSchedulerNode.prune_weak_deps  sN    	9  
++11
 
	 	T--::9EFrp   c                D    [        XU R                  R                  5        g r|   )_prune_redundant_depsr+  r[  )rl   name_to_fused_nodes     rm   prune_redundant_deps&BaseSchedulerNode.prune_redundant_deps  s     	d8R8RSrp   c                T    U R                   c   eU R                   R                  5       $ r|   )r   get_operation_namerk   s    rm   r3  BaseSchedulerNode.get_name  rp  rp   c                "    U R                  5       $ r|   r3  rk   s    rm   get_first_name BaseSchedulerNode.get_first_name  s    }}rp   c                B    [        S U R                  5        5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   r8  r   r   s     rm   r   8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s     G6Fd--//6F   )r   r   rk   s    rm   r   %BaseSchedulerNode.get_operation_names  s    Gdnn6FGGGrp   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   r8  r   r   s     rm   r   5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     ALS,,..Lr?  )r   r  rk   s    rm   get_buffer_names"BaseSchedulerNode.get_buffer_names  s    ADLLAAArp   c                B    [        S U R                  5        5       5      $ )Nc              3  d   #    U  H&  n[        U[        5      =(       a
    [        US S9v   M(     g7f)T)disallow_fp32_opsNr   r   r-   r   ns     rm   r   ABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>   s6      
 & q-( G+AFG%s   .0r   r   rk   s    rm   can_codegen_in_low_precision.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
rp   c                B    [        S U R                  5        5       5      $ )Nc              3  f   #    U  H'  n[        U[        5      =(       a    [        U5      v   M)     g 7fr|   rJ  rK  s     rm   r   @BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s,      
% q-(K-H-KK%s   /1rN  rk   s    rm   r-   -BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
rp   c                    U /$ r|   rz   rk   s    rm   r   BaseSchedulerNode.get_nodes  s	    vrp   c                    U R                   $ r|   )r  rk   s    rm   r  BaseSchedulerNode.get_outputs  s    ||rp   c                     U R                   U   $ r|   )r  )rl   buf_names     rm   
get_outputBaseSchedulerNode.get_output  s    ##H--rp   c                T    U R                   c   eU R                   R                  5       $ r|   )r   r   rk   s    rm   r   BaseSchedulerNode.get_device  s%    yy$$$yy##%%rp   c                V    U R                  5       nUS L=(       a    UR                  S:H  $ Ncpu)r   r   rl   devices     rm   is_cpuBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rp   c                b    U R                  5       nUS L=(       a    [        UR                  5      $ r|   )r   rU   r   rb  s     rm   rU   BaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rp   c                    gr  rz   rk   s    rm   r   BaseSchedulerNode.is_reduction"      rp   c                    gr  rz   rk   s    rm   is_native_matmul"BaseSchedulerNode.is_native_matmul%  rj  rp   c                    gr  rz   rk   s    rm   is_split_scanBaseSchedulerNode.is_split_scan(  rj  rp   c                    gr  rz   rk   s    rm   is_templateBaseSchedulerNode.is_template+  rj  rp   c                    gr  rz   rk   s    rm   	is_externBaseSchedulerNode.is_extern.  rj  rp   c                    gr  rz   rk   s    rm   
is_foreachBaseSchedulerNode.is_foreach1  rj  rp   c                    gr  rz   rl   read_deps     rm   can_inplaceBaseSchedulerNode.can_inplace4  rj  rp   c                    gr  rz   rk   s    rm   has_side_effects"BaseSchedulerNode.has_side_effects7  rj  rp   c           	     0  ^  SSK Jn  [        T [        5      (       a  [        R
                  (       a  [        R                  R                  T R                  5       [        R                  5      (       a  [        [        R                  [        R                  R                  R                   R"                  5      (       a  [%        [        R                  SS5      b  ['        [        R                  S5      (       d  gT R(                  [        R                  R*                  -  T R,                  R.                  -  nSU 4S jjnT R1                  5        GH  nUR2                  nUc   eUR5                  5       (       a  UR7                  5       (       dn  UR9                  5       (       dY  UR;                  5       [        R                  R<                  ;   d-  [        UR?                  5       [@        RB                  5      (       a  M  T RD                  RF                   GH  nURH                  T R,                  RJ                  ;   a$  T R,                  RJ                  URH                     nO/T R,                  RL                  RO                  URH                  5      nU(       d  M  [        R                  RP                  RS                  UT 5      (       d  M  [        URT                  [V        5      (       a  M  URX                  c   eURX                   Vs/ s H%  nUR2                  R;                  5       U;  d  M#  UPM'     n	nT R,                  R[                  URH                  T 5      n
U
(       a  GMQ  []        U	5      S:X  d  GMc  U	S   R^                  (       d  GMz  U	S   R2                  T L d  GM  UR2                  c  GM  [        UR2                  R?                  5       [@        R`                  [@        Rb                  [@        Rd                  [@        RB                  45      (       a  GM  URT                  (       am  [        URT                  R2                  [@        Rf                  [@        Rh                  45      (       a*  []        UR2                  R7                  5       5      S:  a  GM  U" UR2                  UR2                  5      (       d  GM  U" U5      (       d  GM  [        R                  Rj                  Rm                  UR;                  5       UR;                  5       5        [        [        R                  [        R                  R                  R                   R"                  5      (       an  [        R                  Rn                  Rq                  UR;                  5       5        [        R                  Rn                  Rq                  UR;                  5       5        UR;                  5       [        R                  Rr                  UR;                  5       '     GM     GM     gs  snf )	zf
Decide if there should be inplace updates for the node
and record the decision in the active kernel.
r   )can_match_buffer_size	mutationsNrO  c                  >^ U R                   R                  T5      nU R                  5       m[        5       nU R                   H  nUR
                  n[        U[        5      (       d  M&  UR                  5       U R                   R                  ;  d  U R                   R                  U5      ULa  Mn  UU4S jUR                  R                  5        5       -  n[        U5      S:  d  M    g   g)Nc              3  L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7fr|   r  )r   orZ  s     rm   r   ^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>m  s&      Evv) AEs   $	$r   FT)r+  get_fused_noder3  r   r0  r   r   r_   r9  r1  r   r  r   )buf_to_be_inplaced
fused_noder  rG  	user_noderZ  rl   s        @rm   single_index_in_fused_nodeKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodeU  s    
 ,55DDTJJ)224H %/LD*00 II	!)->?? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= ' 1* rp   r   )r  r)  r   r{   ):codegen.wrapperr  r   r   r(   inplace_buffersrZ   r   has_featurer   r.   INPLACE_BUFFERSrX  r  r  codegensimd
SIMDKernelr  rW  r   r"  r+  completed_operationsr  r   rP  rQ  rR  r3  removed_buffersrS  r+   rT  r   r   r   rZ  r[  r  rU  	can_reuser-  NopKernelSchedulerNoder0  has_cross_stream_hazardr   r}  r?   r>   MutationLayoutSHOULDREMOVEFallbackKernelr=   rO  make_inplacer  r  rY  )rl   r  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usesr  s   `          rm   decide_inplace_update'BaseSchedulerNode.decide_inplace_update:  s&   
 	; t]++&&##DOO$5~7U7UVVqxx)@)@)E)E)P)PQQ188[$7C &)) NNgg(()nn112 	 	D ##%CxxH''',,..88::..00<<>QWW%<%<< h668":M:MNN((..99 E EE $ E Edii PI $ : : > >tyy II I,,66y$GG&y'<'<>TUU$??666 "+&!0A66??,4II !0 # &
 /3nn.T.T		4/+
 43/14*1-999*1-22d:%NN6 *%NN::< " " 4 4 " = = " 3 3	! ! &11 * ) 5 5 : :!#!2!2BNN C! ! !$INN$O$O$Q RUV V1)..#((KK6yAA
 2293E3E3GX%HHeoo&=&=&B&B&M&M  HH..2293E3E3GHHH..223<<>B &..0 77G } / &6&s   "X.Xc                   [         R                  (       d  g U(       a  U R                  (       a  g U R                  c   eU R                  R	                  5       n/ nU GH5  nUR
                  S:X  a  M  UR                  S5        UR                  S5        SUR
                   SUR                   3nSUR                  ;   a  USUR                  S    3-   nUR                  U5        SUR                  ;   d  M  UR                  S    nUR                  S	S
S9S   nUR                  SUR                  SS5      R                  SS5      R                  SS5      R                  SS5      -   5        UR                  S5        UR                  S5        GM8     [        U5      S:X  a  g UR                  U5        SU l        g )Nr  r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr  {z{{}z}}r  \z\\z#pragma CMT END ORIGINr   T)r(   comment_originr  r   get_originsr4  r   targetmetarsplitreplacer   
writelines)	rl   buffer	only_onceorigins	out_linesr  op_info_strr  stack_trace_last_lines	            rm   codegen_originating_info*BaseSchedulerNode.codegen_originating_info  s    $$yy$$$))'')	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(:(:3(:(KB(O%  "+33C>WS$'WT4(Wf	   !9:  $3 6 y>Q 	)$rp   c                "    U R                  SSS9$ )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrk   s    rm   get_read_write_buffers_sizes.BaseSchedulerNode.get_read_write_buffers_sizes  s    55t 6 
 	
rp   c                "    U R                  SSS9$ )NTFr  r  rk   s    rm   get_read_buffer_sizes'BaseSchedulerNode.get_read_buffer_sizes  s    55u 6 
 	
rp   c                "    U R                  SSS9$ )NFTr  r  rk   s    rm   get_write_buffer_sizes(BaseSchedulerNode.get_write_buffer_sizes   s    55 6 
 	
rp   c                L    [        U R                  XS9R                  5       SS9$ )Nr  r   )start)r   get_read_write_buffer_accessesr   )rl   r  r  s      rm   r  3BaseSchedulerNode.get_read_write_buffers_sizes_impl  s1     //+ 0 fh	
 	
rp   c                  ^ ^^^^^ [        T [        5      (       a  0 $ [        T [        5      (       a!  [        T R                  [        5      (       a  0 $ [        T [        5      (       af  [        T R                  [
        R                  5      (       a=  T R                  R                  [        R                  R                  R                  L a  0 $ SS jm[        T [        5      (       a@  T" [        T R                  5       S   5      [        T R                  5       S   5      -  5      mO[        S5      m[         R"                  " [$        5      nU(       a:  T R&                  R(                   H   nX4R*                     R-                  U5        M"     U(       a:  T R&                  R.                   H   nX4R*                     R-                  U5        M"     U(       a&  [1        S T R&                  R(                   5       5      O	[1        5       nU(       a&  [1        S T R&                  R.                   5       5      O	[1        5       nSU 4S jjm[        T [2        5      (       a  [1        UU 4S jU 5       5      nXg-
  nXW-
  n0 nXV-   H  n	[5        U4S	 jX9    5       5      mU	[6        R8                  R:                  ;   a  [6        R8                  R:                  U	   n
O>U	[6        R8                  R<                  ;   a  [6        R8                  R<                  U	   n
OM      SUUU U4S
 jjmT" U
5      nX;  a  XU	'   M  X==   U-  ss'   M     U$ )a  
Counting the number of bytes accessed for a kernel is
surprisingly tricky. In particular, there is a differentiation
between 'theoretical' memory accesses and practical memory
accesses. For example, a layernorm kernel may actually access an
input 3 times, but in theory, it only needs to access its input
once (and may be optimized to do so through say, persistent
reductions)

Another example is that even though a buffer is passed in, we may
not access the entire buffer. This may occur if we are accessing
a slice of the buffer. Another tricky case is for indirect
indexing, where the amount of bytes accessed depends on the
values of the input.

What this function aims to compute is the memory accesses for
worst-case inputs, best-case optimization. What this means is
that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

1. Numel in ranges multiplied by number of deps the buffer has
2. The buffer size

Returns memory accesses per buffer.
c                R    [         R                  R                  R                  U SS9$ )Nr   r   )rZ   r   r   r   )ss    rm   try_size_hintGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint:  s"    77##55a!5DDrp   r   r       eAc              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   CBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>O  s     B+ACxx+Ar  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r  T       C+BCxx+Br  c                   > TR                   R                  U    R                  n[        S U 5       5      n[	        U[        U5      -
  5      S:  $ )Nc              3  8   #    U  H  oR                   v   M     g 7fr|   r   )r   rG  s     rm   r   \BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>[  s     !>))r  r   )r+  r[  r0  r   r   )r   r   r0  buf_usesrl   s       rm   is_materializedIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedY  sG    NN..s399E!!>!>>Hx*V"44599rp   c              3  \   >#    U  H!  nT" UTR                   5      (       a  M  Uv   M#     g 7fr|   r   )r   r   r  rl   s     rm   r   r  _  s#      )%_S$++-Nvs   ,	,c              3  (   >#    U  H  nTv   M	     g 7fr|   rz   )r   r   
node_numels     rm   r   r  h  s     $R;QCZ;Qs   c                  > U (       d  g[        U [        R                  5      (       a  U R                  5       $ [        U R                  [
        5      (       a  TR                  R                  U R                  5          R                  nSnU H  n[        UR                  [        5      (       a  M$  [        UR                  [        5      (       d   e[        UR                  R                  [        5      (       a8  UR                  R                  5        H  nUT" UR                  5      -  nM     M    g   U$ [        U R                  [        R                  5      (       a#  [!        U4S jU R#                  5        5       5      $ T	" [%        U R'                  5       5      5      n[)        U R+                  5       5      [-        TU5      -  $ )Nr   c              3  n   >#    U  H*  nT" [         R                  R                  U5      5      v   M,     g 7fr|   )rZ   r   
get_buffer)r   mut_nameget_buf_bytess     rm   r   ZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>  s/      (@H &agg&8&8&BCC(@   25)r   r+   TorchBindObjectr  r@  r>   r+  r[  r3  r0  r   rc  r_   r=   r  r?   r   rR  rY   r  rN   	get_dtypemin)
r   r0  totrG  	sched_buf	buf_elemsbuf_accessed_elemsr  rl   r  s
         rm   r  GBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesq  s[    c2#5#566,,..

,=>> !NN66s||~FLLEC %%dii<<$)$))5FGGGG%diinnkBB-1YY-B-B-D	 #}Y^^'D D .E $% !& J

BMM:: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rp   )r  z
sympy.Exprr   r$  )r   r"  r   Sequence[BaseSchedulerNode]r   r{   )r   z4ir.Buffer | ir.TensorBox | ir.TorchBindObject | Noner   r$  )r   r  ExternKernelSchedulerNoder   r=   r+   r  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater   rY   
get_rangesr$  collectionsr   r  r   r   r   r   r  r   r   r   rZ   r   r  graph_inputs)rl   r  r  buf_accessesr   r   r  r  buf_byte_accessesrZ  r   	buf_bytesr  r  r  r  r  s   `           @@@@@rm   r  0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233Id566:II{<
 <
 It677499b&7&788		%%||%%BBC I	E dM**&doo/23 1! 456J
 SJ"..t4''--XX&--c2 . ''..XX&--c2 /
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d.//( )%) O -F+E,.H!$$R<;Q$R!R177111gg,,X6QWW111gg**84#I## #J &c*I0.7(+!+y8+g 'j ! rp   c                ^   U R                   c  g U R                   R                  5       nUc  g [        U5      nUc  g [        U[        R
                  5      (       a  UR                   R                  n[        R                  R                  R                  USS9n[        S   S==   U-  ss'   U$ )Nr   r   inductor
flop_count)r   get_origin_noder9   r   r  SymIntexprrZ   r   r   r   r   )rl   fx_nodeflopsresolved_flopss       rm   estimate_flops BaseSchedulerNode.estimate_flops  s    99))++-?w'=eU\\**JJOOE));;EA;N\*n<*rp   c                T    U R                   b  U R                   $ U R                  5       $ r|   )r  _get_estimated_runtimerk   s    rm   get_estimated_runtime'BaseSchedulerNode.get_estimated_runtime  s)    **6222**,,rp   c                ,   U R                  5       S   R                  5       S   nUR                  R                  5       n[	        [        U5      5      (       d  g[        U R                  5      (       a  [        U R                  [        R                  5      (       d   e [        R                  (       av  [        U 5      n[        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ [!        U 5      nUc  [#        U R                  5      nUR%                  X6S9  U$ [#        U R                  5      $ [/        U R                  5      (       a  g[1        U 5      nUb  U$ UR                  R3                  5       n	 [5        5       n
[7        U	5      S-  nU
S::  a  [9        SU
 35      eUS::  a  [9        SU 35      e U R=                  5       nUS:X  d  Uc  U R?                  5       U
-  nUS-  nU$ SnU R?                  5       nUc  SOUnX-  U-  S	-  nX-  n[A        UU5      nUS-  nU$ ! [&         a  n[(        R+                  U5         SnAgSnAf[,         a  n[(        R+                  U5         SnAgSnAff = f! [:         a     gf = f)
z3
Returns estimated op runtime in milliseconds (ms)
r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r  )!r   r  r   rS  rU   r;   rS   r   r+   IRNoder)   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupfloatr2   r1   	set_value
ValueErrorr  r  	TypeErrorrX    maybe_estimate_runtime_benchmarkmaybe_get_dtyperO   rM   AssertionErrorr  r	  r  max)rl   r   r@  	cache_keycache	cache_valmsr  retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     rm   r  (BaseSchedulerNode._get_estimated_runtime  s   
 nnq!--/2))+of-.. ##dii3333LL I$ OI68E %Y 7I ,))U;;;;((HNBz=diiHOOIO8I7		BB TYY
 .t4?J((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.2247KKBcBI 99;*2*Y6#=%< }-#X	o    :  		sD   AH3 63H3 *H3 A J 3
J=IJ$I>>J
JJc                    g r|   rz   rk   s    rm   get_template_node#BaseSchedulerNode.get_template_node      rp   c                0    U R                  5       nUc   eU$ r|   r/  )rl   templates     rm   get_template_node_or_throw,BaseSchedulerNode.get_template_node_or_throw  s!    ))+###rp   c                `    [        S [        U 5       5       5      nU SU nX   nXS-   S nX#U4$ )zA
For the list of nodes, get the prologue, template, and epilogue
c              3  X   #    U  H   u  pUR                  5       (       d  M  Uv   M"     g 7fr|   rr  )r   irL  s      rm   r   CBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s     P,<DAaa,<s   *	*Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        rm   get_prologue_template_epilogue0BaseSchedulerNode.get_prologue_template_epilogue  sH     PIe,<PP.)-!+-.00rp   )r   r  r  r  r  r  r   r  r  r   r+  r  r  )r+  r*  r   rv  )r   ir.Operationr   rv  rs  )r   r#  ru  r  r4   r  r4   r   r{   r  r  r   rv  )r   r3   r   rv  rw  )r  r  r   rv  r  r  r  r  r   rv  r   r  r1  dict[str, BaseSchedulerNode]r   rv  r   r  )r   zSequence[SchedulerBuffer])rZ  r"  r   r)  ry  r|  zdependencies.Depr   r{   T)r  rR   r  r{   r   rv  rt  )r  r{   r  r{   r   r$  )r  r{   r  r{   r   zdict[str, int]r   
int | Noner   r  r   zir.TemplateBuffer | None)r   zir.TemplateBuffer)r>  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])@r~   r   r   r   r   r   r  r  r  r  r  rH  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r-  r2  r3  r9  rI   r   rE  rO  r-   r   r  r[  r   rd  rU   r   rl  ro  rr  ru  rx  r}  r  r  r  r  r  r  r  r  r	  r  r  r/  r5  r'  rC  r   rz   rp   rm   r_   r_     s   BB NN''$$ $D
$""///33((''GT
#4B*6+#
!.7	
M>


=#2=HV=	=
6
G T">T	T
. H H B B 
 
 
 
.&;:IX 9=-$-15-	-^ 
 

 
 

 
 


!
37
	
L!!L!37L!	L!\  $- U Un
 1&1	S1 1rp   c                 R    [         R                  R                  R                  5       $ r|   )r  r  	codecache
LocalCacherz   rp   rm   r  r  &  s    ??$$//11rp   c                  ^ [        U R                  SS5      nU R                  R                  nU R                  R                  / UQU R                  R                  QU R                  R
                  5      nU R                  R
                  n[        R                  " X#45      u  pESS jm[        U4[        U4S jU 5       5      -   5      nU$ )Npython_kernel_namer  c                    [        U [        R                  5      =(       a/    [        U [        R                  [        R                  45      (       + $ r|   )r   r+   r  GeneratorStateOpaqueObjectStater  s    rm   _is_tensor_ir@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir5  s<    !RYY' 

!!2#7#781
 -
 	
rp   c              3  t   >#    U  H-  nT" U5      (       a  [        UR                  5       5      OS v   M/     g 7fr|   )r   r  )r   ar]  s     rm   r   <get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr><  s+     U9a}Q'7'7ajjl#TA9s   58rw  )
r  r   inputsfill_non_provided_argsconstant_argsr  pytreetree_flattenr"  r   )snoderX  rO  r  	flat_argsflat_args_pytree_specr  r]  s          @rm   r  r  +  s     -A2F::D::,,*$*))*

D ZZF'-':':D>'J$I

 	
U9U
U	VI rp   c                   [        U [        5      (       d  g [        R                  R                  R
                  [        R                  R                  R                  [        R                  R                  R                  S.n[        U R                  SS5      nX!;  a  g [        U R                  [        R                  5      (       d  g X   $ )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmrX  r  )r   r  r  opsatenmmbmmaddmmr  r   r+   ExternKernel)rg  mms_fnsrX  s      rm   _get_mm_like_fnrr  A  s    e677"YY^^..#iinn00 %		 4 4G
 !-A2F(ejj"//22&&rp   c           	     d  ^ ^ S nS n[         R                  (       a  [        T 5      nUc  g UnU U4S jnOg [        T 5      n[	        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ SSKJ	m  U" 5       u  pxSSK
Jn	  U	R                  UUUSSSS9n
UR                  XJS	9  U
$ )
Nc                    > T" T 5      $ r|   rz   )rg  snode_args_kwargss   rm   r  2maybe_estimate_runtime_benchmark.<locals>.<lambda>Z  s    !25!9rp   r   )ru  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationr  )r(   !runtime_estimations_mms_benchmarkrr  r  r  r  r   r  utilsru  $torch._inductor.runtime.benchmarkingrw  	benchmarkr  )rg  bench_fnargs_kwargs_fnmm_fnr  r   r!  rO  r  rw  r"  ru  s   `          @rm   r  r  Q  s    HN//&=99%@I&(EY'I)U++++(!#LD@			! 
 
B 
OOIO(Irp   T)slotsc                  \    \ rS rSr% S\S'   S\S'   S\S'   S\S'   SS jrSS	 jrSS
 jrSrg)	WhyNoFuseiw  r"  name1name2reasonztuple[Any, ...]rO  c                X    UR                  5       U l        UR                  5       U l        g r|   )r3  r  r  rl   r   r   s      rm   r  WhyNoFuse.__init__~  s    ^^%
^^%
rp   c                F    Xl         X l        [        R                  U 5        g r|   )r  rO  
fusion_logdebug)rl   r  rO  s      rm   __call__WhyNoFuse.__call__  s    	rp   c                p    SU R                    SU R                   S3U R                  U R                  -  -   $ )Nzcannot fuse z with r<  )r  r  r  rO  rk   s    rm   __str__WhyNoFuse.__str__  s6    djj\

|2>KK$))#
 	
rp   )rO  r  r  r  Nr   r_   r   r_   r   rv  )r  r"  rO  r   r   rv  rs  )	r~   r   r   r   r   r  r  r  r   rz   rp   rm   r  r  w  s&    JJK
&

rp   r  c                    [        U [        [        45      (       a  [        U [        S9n [
        R                  " U SS9nSU;   a  S[        R                  " US5       3$ U$ )Nkey   )rD  r      )	r   r   setsortedr"  pprintrB  textwraprD  )objrF  s     rm   rB  rB    sU    #
C())Sc"^^C*Fv~HOOFG4566Mrp   c                  @    \ rS rSrSS jrS	S jrS
S jrSS jr\rSr	g)rc  i  c                &    [        U/5      U l        g r|   r  r  s     rm   r  OutputNode.__init__  s    ",cU"3rp   c                    gr  rz   rk   s    rm   r   OutputNode.is_reduction  rj  rp   c                    g)Nrz   rz   rk   s    rm   rQ  'OutputNode.get_inputs_that_alias_output  r  rp   c                    g)NOUTPUTrz   rk   s    rm   r3  OutputNode.get_name  s    rp   )r  N)r   r5   r   rv  rw  rx  rs  )
r~   r   r   r   r  r   rQ  r3  r  r   rz   rp   rm   rc  rc    s    4 Hrp   rc  c                  ^ ^^^^ [         R                  " 5       mT R                   HU  n[        U[        5      (       a  M  TUR
                     R                  5       nTTU   R                  5       ==   S-  ss'   MW     SUUUU 4S jjm[        U4S jT R                   5       5      nU(       a?  T R                  U-
  T l        T R                  T R                  R                  U5      5        gg)aU  
Prunes weakdeps intended for mutation ordering
on an upstream fused node if after fusion there is another dependency
on the fused upstream node, making the weakdep redundant

In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
be incrementally removed, enabling other fusions, ensuring they are fused in order.
r   c                  > [        U [        5      (       ap  TU R                     R                  5       nTTU   R	                  5          S:  =(       a     TR
                  R                  U TU   T5      nTU   T:H  nU=(       d    U$ g)Nr   F)r   r6   r   r5  r3  r+  fusable_weak_dep)r   r#  is_redundantis_self_depr[  name_to_dep_countr1  r   s       rm   r$  +_prune_redundant_deps.<locals>.should_prune  s    c7##!#((+<<>G,"7+446 nn55'0$  -W5=K.;.rp   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fr|   rz   r'  s     rm   r   (_prune_redundant_deps.<locals>.<genexpr>  s      .,s2C.r)  Nr*  )r  r   r  r   r6   r   r5  r3  r   r  r   r+  )r   r1  r[  r   r#  deps_to_pruner  r$  s   ```   @@rm   r0  r0    s     '2&9&9&;&&#w''!#((+<<>G09BBDEJE '
    .. M "&"9"9M"IT--::=IJ rp   c                  ^   ^  \ rS rSrS	U 4S jjrS
S jrSS jrSS jrSS jrSS jr	Sr
U =r$ )r  i  c                  > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        [        U[        R                  5      (       a`  UR                  5       (       aJ  [        R                  " UR                  S   R                  5      nSnUR                  5       nXSU44U l        g g g Nr   r   )superr  r  r  get_read_writesr   r+   UserDefinedTritonKernelcan_fuse_epiloguemathr   mutable_argsshapeget_device_or_errorr   )rl   r+  r   numelr   rc  	__class__s         rm   r  "ExternKernelSchedulerNode.__init__  s    #T"T1134dB6677D<R<R<T<TIId//2889EF--/F &/2DJ =U7rp   c                V    U R                  5        S[        U R                  SS 5       3$ )Nz.node.kernel = rX  )r3  r  r   rk   s    rm   r  )ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbrp   c                    gNTrz   rk   s    rm   ru  #ExternKernelSchedulerNode.is_extern  r1  rp   c                    U R                   c   e[        U R                   S5      =(       a    U R                   R                  5       $ )Nr  )r   rW  r  rk   s    rm   r  *ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVrp   c                   [        U R                  [        R                  5      (       a[  U R                  R	                  5       (       a<  [
        R                  " U R                  R                  S   R                  5      nU// 4$ / / 4$ r   )	r   r   r+   r  r  r  r   r  r  )rl   r  s     rm   r  $ExternKernelSchedulerNode.get_ranges  sd    tyy""<"<==		++--IIdii44Q7==>EGR= Bxrp   c                    [        U R                  [        R                  5      (       d   eU R                  R	                  U5      $ r|   )r   r   r+   rp  r  )rl   wrappers     rm   r  !ExternKernelSchedulerNode.codegen  s2    $))R__5555yy  ))rp   r   r+  r*  r   rE  r   rv  rs  rw  r   Sequence[Sequence[sympy.Expr]]r  r   r   rv  )r~   r   r   r   r  r  ru  r  r  r  r   __classcell__r  s   @rm   r  r    s(    
3cW* *rp   r  c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )r  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g r|   )r  r  r  r  r  rl   r+  r   r  s      rm   r  NopKernelSchedulerNode.__init__  s5    #T"T1134rp   rz   r  )r~   r   r   r   r  r   r  r  s   @rm   r  r    s    5 5rp   r  c                    ^  \ rS rSr% SrS\S'   S\S'         S#U 4S jjr  S$     S%S jjr  S$     S&S	 jjr      S'S
 jr	S(S jr
S)S jrS*S jrS)S jr      S+S jrS)S jr      S,S jrS-S jrS.S jrS/S jrS/S jrS/S jrS/S jrS0S jrS1S jr    S2S jrS3S jr S4   S5S jjr\S6S j5       r\S6S j5       rS7S jr\S8S  j5       r \S/U 4S! jj5       r!S"r"U =r#$ )9r   i   zi
A SchedulerNode is a node for scheduling that encapsulates either
a ComputedBuffer or a TemplateBuffer.
z tuple[Sequence[sympy.Expr], ...]_sizesr@   r  c                f   > [         TU ]  U5        U R                  U5        U R                  5         g r|   )r  r  r  _compute_attrsr  s      rm   r  SchedulerNode.__init__	  s,    
 	#T"rp   c                   [        U R                  [        R                  [        R                  45      (       d   eU R                  R                  UUS9u  U l        nX0l        U R                  R                  5       nU R                  R                  U5      R                  nXE" U R                  5      4U l        [        R                  (       + =(       d    [        UR                   5      (       + n[        U R                  [        R                  5      (       a)  U R#                  U R                  R%                  US95        g U R#                  [&        R$                  " U R                  /U R                  Q7SU065        g )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer  )r   r   r+   r   TemplateBuffersimplify_and_reorderr  r  r  r+  get_backendgroup_fnr   r(   loop_ordering_after_fusionrU   r   r  extract_read_writesr*   )rl   r  r  bodyrc  r  should_normalizes          rm   r  SchedulerNode._compute_attrs  s;   
 $))b&7&79J9J%KLLLL II::'A&? ; 
T 
..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!233  		--8H-I   00JJ!%8Hrp   c                   [        S U R                  R                   5       5      nU R                  UUS9  U(       aD  U R	                  U R                  R                  U5      R                  U R                  5      5        g g )Nc              3  `   #    U  H$  n[        U[        [        45      (       d  M   Uv   M&     g 7fr|   r   r6   r5   r  s     rm   r   8SchedulerNode.recompute_size_and_body.<locals>.<genexpr>8  $      0
1CZgwEW5XCC1   .	.r  )r   r   r   r  r  r  r  r  )rl   r  r  	fake_depss       rm   recompute_size_and_body%SchedulerNode.recompute_size_and_body3  s    
 &0 0
++110
 &
	 	'A&? 	 	
     **95<<T=R=RS rp   c                   [        S U R                  R                   5       5      nU R                  [        R
                  " U R                  /U R                  Q7SU06R                  U5      R                  U R                  5      5        U R                  R                  U 5        U(       a!  SSKJn  UR                  R!                  5         g g )Nc              3  `   #    U  H$  n[        U[        [        45      (       d  M   Uv   M&     g 7fr|   r  r  s     rm   r   5SchedulerNode.refresh_dependencies.<locals>.<genexpr>I  r  r  r  r   SIMDScheduling)r   r   r   r  r*   r  r  r  r  r  r  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)rl   r  need_clear_tiling_cacher  r  s        rm   refresh_dependencies"SchedulerNode.refresh_dependenciesD  s    
 &0 0
++110
 &
	 	,,

![[4= Yy!VD))*	
 	""..t4"4 ,,88: #rp   c                    U R                   R                  U5      U l         U R                   R                  U l        U R	                  SSS9  g )NFTr  r  )r  reorder_iter_loopssizesr  r   )rl   	new_orders     rm   apply_new_loop_order"SchedulerNode.apply_new_loop_orderb  sA    ZZ22

 jj&&!!E4!Prp   c                   U R                   R                  5       n[        U R                   R                  5      U-
  n[	        [        U5      5      n[	        [        X"U-   5      5      nU R                  XC-   5        [        U R                  S   5      S:X  d   eU R                  S   U R                  S   S   U R                  S   S   44U l        g )Nr   r   r   )r  get_original_num_rdimsr   	iter_varsr   ranger  r   )rl   	num_rdims
num_pwdimspwdimsrdimss        rm   swap_pw_red_dimension#SchedulerNode.swap_pw_red_dimensionj  s    JJ557	--.:
uZ()eJY(>?@!!%.14::a=!Q&&&ZZ]TZZ]1%5tzz!}Q7G$HH
rp   c                D    U R                   R                  5       U l         U $ r|   )r  extract_pw_from_reductionrk   s    rm   r  'SchedulerNode.extract_pw_from_reductiont  s    ZZ99;
rp   c                   [         R                  U 5      (       d  g [        U R                  [        R
                  5      (       d   eU R                  R                  5          U R                  5         S S S 5        g ! , (       d  f       g = fr|   )r   r   r   r   r+   r   with_original_inner_fnr  rk   s    rm   cancel_reduction_split$SchedulerNode.cancel_reduction_splitx  s[     33D99$))R%6%67777YY--/! 0//s   !A;;
B	c                   [        U R                  [        R                  [        R                  45      (       d   eU R
                  R                  X5      U l        U R
                  R                  U l        U R                  R                  5       nU R                  R                  U5      R                  nX4" U R                  5      4U l        U R                  SSS9  g )NTr  )r   r   r+   r   r  r  #expand_dimension_for_pointwise_noder  r  r  r+  r  r  r   r   )rl   	dimension	new_rangerc  r  s        rm   r  1SchedulerNode.expand_dimension_for_pointwise_node  s     $))b&7&79J9J%KLLLLZZCC

 jj&&..0>>--f5>>ht{{34
 	!!D$!Orp   c                    U R                   R                  5       U l         U R                   R                  U l        U R	                  SSS9  g )NTFr  )r  merge_loopsr  r  r   rk   s    rm   r   SchedulerNode.merge_loops  s<    ZZ++-
jj&& 	!!D%!Prp   c                   S nU R                   S   n[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       aP  [        =R
                  S-  sl        [        R                  SU R                  5       U5        U R                  U5        g[        R                  SU R                  5       5        g)Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r  r   num_varsdecide_loop_order_to_matchr,   num_loop_reorderingloop_ordering_logr  r3  r  )rl   r  r  r  
self_sizess        rm   r  'SchedulerNode.reorder_loops_by_dep_pair  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##W rp   c                N   U R                  5       nU SU R                  S    3U SU R                  S    3U SU R                   3/nU R                  R	                  5        H  n[        U[        5      (       a  M  UR                  n[        R                  R                  U5      n[        U[        R                  5      (       a  Mf  UR                  U S[        UR                  5       35        M     [        U R                   ["        5      (       aS  UR                  SU S35        UR                  [$        R&                  " U R                   R)                  5       S	5      5        U R*                  c   eUR-                  U R/                  5       5        S
R1                  U5      $ )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r  )r3  r   r  r   r  r   r6   r   rZ   r   r  r+   r  r   rB  r@  r  r@   r  rD  rH  r   r  r  join)rl   r   linesr   rZ  r   s         rm   r  SchedulerNode.debug_str_extra  sM   }}f$TZZ]O4f'

17fIdkk]+

 ##446Cc7++88gg((2!#r'9'9::LLH:Z

8K7L!MN 7 djj(++LL6${34LL)=)=)?HIyy$$$T//12yyrp   c                    U R                   $ r|   )r  rk   s    rm   r  SchedulerNode.get_ranges      {{rp   c                d   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  R                  5       5      =(       a0    U R                  S L =(       d    U R                  R                  (       + $ Ntype(self.node)=)
r   r   r+   r   r  r   r{   r   r  has_partial_accumulaterk   s    rm   r   SchedulerNode.is_reduction  s    $))b&7&79J9J%KLL 	
tDII !	
L DII0023 
JJ$Gdjj&G&G"G	
rp   c                    [        U R                  [        R                  5      (       d   S[	        U R                  5      < 35       eU R                  R                  5       S:H  $ )Nr2  dot)r   r   r+   r   r   r   rk   s    rm   rl  SchedulerNode.is_native_matmul  sM    $))R%6%677N<LDO;M9NN7yy++-66rp   c                b   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  [        R                  5      =(       a.    [        U R                  R                  [        R                  5      $ r1  )r   r   r+   r   r  r   r   	SplitScanrk   s    rm   ro  SchedulerNode.is_split_scan  s|    $))b&7&79J9J%KLL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
rp   c                J    [        U R                  [        R                  5      $ r|   r   r   r+   r  rk   s    rm   rr  SchedulerNode.is_template  s    $))R%6%677rp   c                p    [        U R                  [        R                  5      (       a  U R                  $ S $ r|   r<  rk   s    rm   r/  SchedulerNode.get_template_node  s'    &tyy"2C2CDDtyyN$Nrp   c                f    U R                  5         U R                  5         U R                  U5        g r|   )r  r  r  )rl   
index_varss     rm   runSchedulerNode.run  s#    ""$Z rp   c                (   U R                   n[        [        [        U5      5      [        [        [        U5      5      :X  d   e[	        [        [        R                  R                  U5      [        R                  R                  U5      5      5      nU$ r|   )	r  r   mapr   dictzipr
  r  from_iterable)rl   rA  r  r   s       rm   ranges_from_index_vars$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rp   c                   U R                  U5      n [        R                  " [        [        R                  " 5       U5      5         [        R
                  R                  U 5         U R                  " U6   SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       g= f! [         a"    [        R                  SU R                  5        e f = f)a  
Generate code for this node using the provided index variables.

This method sets up the appropriate context for code generation, including
simplifying indexing expressions based on the variable ranges, and then
calls the node's body function with the index variables.

Args:
    index_vars: A sequence of sequences of sympy expressions representing
                the index variables for each dimension of the computation.
NzError in codegen for %s)rI  rZ   set_ops_handlerrG   get_ops_handlerrX  set_current_noder  r  r  fatalr   )rl   rA  r   s      rm   r  SchedulerNode.codegen  s     00<
	!!"213D3D3F
"ST))$/

J' 0 UT// UT  	II/;	sA   3B)  B&B6B>B) 
B	B
B&"B) &B) ),Cc                    U(       a  U R                   O[        U R                   5      u  p#[        R                  " U R                  U[
        R                  R                  /[        U5      -  /S9$ )zL
Get the memory dependencies in either the pointwise or the reduction axes.
)hidden_args)	r  r   r*   r  r  r   SZeror   )rl   	pointwise
keep_sizesignore_sizess       rm   "pointwise_or_reduction_read_writes0SchedulerNode.pointwise_or_reduction_read_writes  sR     3<4;;$++AV 
//JJ
%'',,#lBS1S0T
 	
rp   c                     U R                  SS9$ )z8
Get the memory dependencies in the non-reduction axes.
TrU  rX  rk   s    rm   r  #SchedulerNode.pointwise_read_writes  s    
 666FFrp   c                     U R                  SS9$ )z4
Get the memory dependencies in the reduction axes.
Fr[  r\  rk   s    rm   reduction_read_writes#SchedulerNode.reduction_read_writes&  s    
 666GGrp   c                (   U R                  5       (       a  g[        S U R                  5        5       5      (       a  g[        U R                  R
                  5      S:X  a  [        U[        R                  5      (       a  [        [        U R                  R
                  5      5      n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  UR                  :H  =(       a    UR                  UR                  :H  $ g)NFc              3  @   #    U  H  oR                  5       v   M     g 7fr|   )rA  rC  s     rm   r   ,SchedulerNode.can_inplace.<locals>.<genexpr>0  s     ?,>S  ,>r?  r   ztype(write_dep)=)rr  r   r  r   r   r  r   r*   r4   r<  iterr   r   r   )rl   r|  	write_deps      rm   r}  SchedulerNode.can_inplace-  s    ?D,<,<,>???t&&'1,l,,2
 2
 T$"2"2"9"9:;Ii)?)?@@WEUT)_DVBWW@>>Y__4X)..9XXrp   c                8   [        5       n[        U R                  [        5      (       a  U R                  R	                  5        H  nUR
                  S:X  d  M  UR                  S:X  d  M'  SUR                  ;   a  UR                  S   S:X  d0  [        UR                  5      S:X  d  Me  UR                  S   S:X  d  Mz  UR                  SUR                  ;   a  UR                  S   O)[        UR                  5      S:  a  UR                  S	   OS
5        M     U$ )Ncall_methodstoremode
atomic_addrx  r  r   r   r   r  )r   r   r  r@   r   r4  r  r  r   rO  r  )rl   buffers_store_as_atomic_addr   s      rm   _get_atomic_add_buffers%SchedulerNode._get_atomic_add_buffers:  s    7A|#djj(++

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*rp   c                |   > U R                   b!  U R                   R                  S5      (       a  g[        TU ]  5       $ )Ndevice_assert_asyncT)r  has_opr  r  rl   r  s    rm   r  SchedulerNode.has_side_effectsN  s5     ::!djj&7&78M&N&Nw'))rp   )r  r  r   )r+  r*  r   z%ir.ComputedBuffer | ir.TemplateBufferr   rv  NN)r  'tuple[dict[Any, Any], list[Any]] | Noner  zCallable[_P, _T] | Noner   rv  )r  ru  r  zCallable[..., Any] | Noner   rv  )r  r{   r  r{   r   rv  )r  Sequence[int]r   rv  ru  r   r_   )r  r$  r  r$  r   rv  rF  rs  r  rw  rR  )rA  Sequence[sympy.Expr]r   rv  )rA  r  r   zdict[sympy.Expr, sympy.Expr])rA  r  r   rv  rN  )rU  r{   r   r  )r   r  rM  rI  )$r~   r   r   r   r&  r   r  r  r  r   r  r  r  r  r  r   r  r  r  r   rl  ro  rr  r/  rB  rI  r  rX  rI   r  r_  r}  rm  r  r   r  r  s   @rm   r   r      s   
 -,O 4 
	 OS=A$K $; 
	F OS?C$K $= 
	";;8<;	;<QI"PP),P	P"
Q!.7	. ,
7
8O!
8	%0 !%	
	
	 	
 G G H H + +& * *rp   r   c           	     z  ^  T R                   nT R                  [        R                  R	                  U Vs/ s H  o"R
                  PM     sn5      5        [        U 4S j[        R                  " U Vs/ s H  o"R                  PM     sn6  5       5      T R
                  R                  -
  T l        g s  snf s  snf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fr|   r   rE  )r   r   group_snodes     rm   r   2refresh_group_node_dependencies.<locals>.<genexpr>_  s/      
Pxx{;;== CP   "2	2)
r   r  r*   
ReadWrites
merge_listr   r   unionr  r  )r|  r   r  s   `  rm   refresh_group_node_dependenciesr  V  s     F**6+J6aMM6+JK
 	 
!'')O1*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B34B8r*  c                   [        U [        [        45      (       d   eX l        Xl        S U l        [        R                  " U Vs/ s H  o3R                  c  M  UR                  PM     sn6 U l        [        U 5        [        S U R                   5       5      U l        [        S U R                   5       5      U l        [        S U R                   5       5      U l        [        S U R                   5       5      U l        U R!                  5        Vs0 s H  oDR#                  5       U_M     snU l        g s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fr|   r  r   r  s     rm   r   "init_group_node.<locals>.<genexpr>w       H5G5Gr  c              3  8   #    U  H  oR                   v   M     g 7fr|   )r  r  s     rm   r   r  x  r  r  c              3  8   #    U  H  oR                   v   M     g 7fr|   )r  r  s     rm   r   r  y        )&8&8r  c              3  8   #    U  H  oR                   v   M     g 7fr|   )r  r  s     rm   r   r  |  r  r  )r   r   GroupedSchedulerNoder   r+  r   r   r  r   r  r  r  r  r  r  r  r  r3  r  )r|  r+  r   r  r   s        rm   init_group_noder  h  s   
 k$68L#MNNNN%K&,,%	Av!+!++v	AK $K0H[5G5GHHKH[5G5GHHK%( )&1&8&8) &K" &) )&1&8&8) &K" (3'>'>'@#'@'@#K 
B#s   D6D6D;c                    ^  \ rS rSr% SrS\S'   \      S#S j5       rS$S jrS%S jr	\
S&S j5       r      S'S	 jrS(U 4S
 jjr\
S)S j5       rS)S jr\
S*S j5       rS+S jrS)S jrS)S jr      S,U 4S jjr\
S*S j5       r\
S*S j5       rS-S jrS)S jr\
S.S j5       r\
S.S j5       r\
S.S j5       r\
S.S j5       r\
S/S j5       rS0S jr\
S.S j5       rS1S jr S2S jr!S3S jr"S)S  jr#\
S.U 4S! jj5       r$S"r%U =r&$ )4r   i  z
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be fused together. The way it does this is by maintaining
its unmet dependencies as the union of its constituent nodes.
rS  r   c                   UR                   UR                   L d   e[        U[        [        45      (       d   eUR	                  5       (       aA  [        U[
        5      (       a,  [        UR                  [        R                  5      (       d   eO[        U[        [        45      (       d   e[        [        R                  " UR                  5       UR                  5       5      5      nU " UR                   U5      $ r|   )r+  r   r   r   rr  r  r   r+   r=   r  r
  r  r   )rs   r   r   r>  s       rm   rt   FusedSchedulerNode.fuse  s     %//111%-1C!DEEEE:e5N#O#Oejj"..9999em5G%HIIIIY__U__%68IJK5??E**rp   c                    U R                    HA  n[        U[        5      (       d   eUR                  5       (       d   eUR	                  5         MC     U $ r|   )r   r   r   r   r  rl   r   s     rm   r  ,FusedSchedulerNode.extract_pw_from_reduction  sK    {{Gg}5555''))))--/ # rp   c                x    U R                    H*  n[        U[        5      (       d   eUR                  5         M,     g r|   )r   r   r   r  r  s     rm   r  (FusedSchedulerNode.swap_pw_red_dimension  s/    {{Gg}5555))+ #rp   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fr|   rr  ru  r	  r=  s     rm   r   4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  =       0''))T^^-= *D'')) 0
   .AAr   r  filterr   r   r   rl   fpsr#  s      rm   r	  !FusedSchedulerNode.estimate_flops  K      $ 0	
 s8q=#h
rp   c                   U R                  5       (       a  gSnU R                   Hh  n[        U[        5      (       d    gUb<  [	        U5      [	        UR
                  S   5      :w  a  [        R                  S5          gUR
                  S   nMj     SnUc   e[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       d%  [        R                  SU R                  5       5        g[        =R                  S-  sl        [        R                  SU R                  5       U5        U R                   H+  n[        U[        5      (       d   eUR                  U5        M-     [        U 5        g)	z0
Return true if a loop reordering is performed.
FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)rr  r   r   r   r   r  r&  r  r   r#  r$  r3  r,   r%  r  r  )rl   r  r  r'  rg  r  s         rm   r  ,FusedSchedulerNode.reorder_loops_by_dep_pair  sH    
[[Ee]33%%
*;uU\\RS_?U*U!''G aJ ! 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[Ee]3333&&y1 ! 	(-rp   c                ~   > [         TU ]  U5        [        XU5        / U l        [	        US S9R
                  U l        g )Nc                4    [        U R                  5       5      $ r|   )r$  r   r\  s    rm   r  -FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C/Drp   r  )r  r  r  r0  r  r   )rl   r+  r   r  s      rm   r  FusedSchedulerNode.__init__  s6    #0%'
%DEKK
rp   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf N_r*  r   r3  rl   r  s     rm   r3  FusedSchedulerNode.get_name  +    xxt{{;{!{;<<;   :c                <    U R                   S   R                  5       $ r   r   r3  rk   s    rm   r9  !FusedSchedulerNode.get_first_name      {{1~&&((rp   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf r|   r   r  r   rE  r  s     rm   rE  #FusedSchedulerNode.get_buffer_names  0    !L1"4"4"6!LMM!L   <c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ r|   r   r  r  rl   rF  r   s      rm   r  FusedSchedulerNode.get_outputs  /    (*KKDMM$**,-  rp   c           
        [        U R                  5       VVs/ s H+  u  pU R                  5        SU SUR                  5        3PM-     nnnU R                  S   R                  nUb  UR                  U R                  5       5        [        R                  " SR                  U5      R                  5       S5      $ s  snnf )Nz.snodes[z] =
r   r  r  )r=  r   r3  rH  r   r  r  r  rD  r*  r  )rl   r:  r   r+  s       rm   r  "FusedSchedulerNode.debug_str_extra  s     %T[[1
1 }}xs%0@/AB1 	 
 {{1~""LL3356tyy/668&AA
s   2B=c                l    U R                    Vs/ s H  oR                  5       PM     nnU  SU 3$ s  snf )Nz
, snodes: )r   r  )rl   r   
snodes_strs      rm   r  "FusedSchedulerNode.debug_str_short  s8    9=E**,
Ez*.. Fs   1c                   > [         TU ]  X5        [        5       n[        U R                  5       H/  nUR                  X5        UR                  UR                  5        M1     g r|   )r  r  r   r   r   updater  )rl   r  r  r   r  s       rm   r  !FusedSchedulerNode.set_last_usage	  sQ    
 	2G 0:|T[[)D 3H&&t7 *rp   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf r|   )r   r  r   r   r  s     rm   r   $FusedSchedulerNode.used_buffer_names  s0    !MA"5"5"7!MNN!Mr  c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf r|   )r   r  r   r  r  s     rm   r  /FusedSchedulerNode.used_or_aliased_buffer_names  s5    8<D1,,.D
 	
Dr  c                    U R                   $ r|   r  rk   s    rm   r   FusedSchedulerNode.get_nodes   r/  rp   c                T    [        U 5      R                   SU R                  5        S3$ )Nz(nodes=r  r  rk   s    rm   r  FusedSchedulerNode.__repr__#  s'    t*%%&gdmmo->a@@rp   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   )r   r  s     rm   r   2FusedSchedulerNode.is_reduction.<locals>.<genexpr>(  s     9[>>##[r?  r   r   rk   s    rm   r   FusedSchedulerNode.is_reduction&  s    9T[[999rp   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   )rl  r  s     rm   r   6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>,  s     =A%%''r?  r  rk   s    rm   rl  #FusedSchedulerNode.is_native_matmul*  s    ====rp   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   )ro  r  s     rm   r   3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>0  s     :k??$$kr?  r  rk   s    rm   ro   FusedSchedulerNode.is_split_scan.  s    :dkk:::rp   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   r9  r  s     rm   r   1FusedSchedulerNode.is_template.<locals>.<genexpr>4  s     8Kq==??Kr?  r  rk   s    rm   rr  FusedSchedulerNode.is_template2  s    8DKK888rp   c                x    U R                    H*  nUR                  5       (       d  M  UR                  5       s  $    g r|   )r   rr  r/  rl   r   s     rm   r/  $FusedSchedulerNode.get_template_node6  s3    KKD!!--//   rp   c                     U R                   S   $ r   r  rk   s    rm   r   FusedSchedulerNode.get_device=  s    zz!}rp   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   )r  r  s     rm   r   >FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>B  s     EA--//r?  r  rk   s    rm   r  +FusedSchedulerNode.has_aliasing_or_mutation@  s    EEEErp   c                    [         er|   NotImplementedError)rl   r  s     rm   r  'FusedSchedulerNode.update_mutated_namesF      !!rp   c                    [         er|   r  )rl   r   s     rm   r  FusedSchedulerNode.add_fake_depI  r  rp   c                    [         er|   r  r{  s     rm   r}  FusedSchedulerNode.can_inplaceL  r  rp   c                   U R                  5       nSR                  S U R                   5       5      n[        5       nUR	                  U S[        U 5      R                   SU SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
U R                   SU SU R                   SU S35        UR                  5          U R                  5        H"  nUR	                  UR!                  5       5        M$     SSS5        UR#                  S5         UR	                  U R%                  5       5        UR-                  5       R/                  5       $ ! , (       d  f       N]= f! [&         a    [(        R+                  SSS9   NOf = f)r  r=  c              3  L   #    U  H  n[        U5      R                  v   M     g 7fr|   )r   r~   rK  s     rm   r   /FusedSchedulerNode.debug_str.<locals>.<genexpr>R  s     F+QQ 0 0+s   "$r<  r  r  r  r  r  r  r  r  z.outputs = [
            Nr>  r  Tr  )r3  r*  r   rR   r  r   r~   rB  r   r  r  r   r  r  rD  r  rH  r?  r  r  r  r  rE  r  )rl   r   node_typestrr   r   s        rm   rH  FusedSchedulerNode.debug_strO  s   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU VT445 6T445 6 
	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   	7FF( 
F%(G	G	c                r   > U R                   b  [        S U R                    5       5      $ [        TU ]  5       $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   )r  r=  s     rm   r   6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>n  s     G;4,,..;r?  )r   r   r  r  rr  s    rm   r  #FusedSchedulerNode.has_side_effectsk  s0    ;;"G4;;GGGw'))rp   )r   r0  r   r_   r   r_   r   r   rw  ru  rO  rF  )r+  r*  r   rS  r   rv  rs  rI  r   r  rH  rL  rw  rR  )r   torch.devicerG  )r   r3   r   rv  rM  )'r~   r   r   r   r&  r   r   rt   r  r  rI   r	  r  r  r3  r9  rE  r  r  r  r  r   r  r   r  r   rl  ro  rr  r/  r   r  r  r  r}  rH  r  r   r  r  s   @rm   r   r     s    $#
+%
+.?
+	
+ 
+,
  ")!).7)	)VL = =) N N	B/8#28HV8	8 O O 
 

A : : > > ; ; 9 9   F F
"""*8 * *rp   r   c                  V   ^  \ rS rSrSU 4S jjr      SS jrS	S jrS	S jrSrU =r	$ )
FusedMixOrderReductionsir  c                l  > [         R                  U5      (       d  [         R                  U5      (       d   eX!p!Xl        X l        [        TU ]  UR                  [        UR                  5       5      [        UR                  5       5      -   5        [         R                  U R                  5      U l
        g r|   )r   r   r   r   r  r  r+  r  r   r   r  )rl   r   r   r  s      rm   r   FusedMixOrderReductions.__init__s  s     33E::$77>>>> 5

OOT%//"34tEOO<M7NN	
 '00<
rp   c           	     H   [        U[        5      (       a   e[        U[        5      (       a   eU R                  R                  XSS9(       d  g[        R                  U5      (       a  [        R                  U5      (       d  gSS jn    SS jnU(       a/  U" X45      U" U5      -  (       d  U" U5      U" X45      -  (       a  gUR                  5       (       + =(       d@    [        R                  " [        U R                  R                  XSS95      U R                  :  $ )z
node1 is from the current mix order reduction; node2 is another node we want to fuse in.

other_nodes are passed in to check if fusion will introduce producer/consumer relationship
between the inner and outer reduction. If yes, we don't fuse.
Fallow_mix_order_reductionc                B    [        5       nUR                  " S U  5       6 $ )Nc              3  8   #    U  H  oR                   v   M     g 7fr|   )r   rK  s     rm   r   TFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>  s     :Eq{{Er  r   r  r>  r   s     rm   _get_ancestorsAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors  s    ,C99:E:;;rp   c                B    [        5       nUR                  " S U  5       6 $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr|   )r   rK  s     rm   r   ZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>  s     F14466r?  r  r	  s     rm   _get_operation_namesGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names  s      ,C99FFGGrp   )count_bytes)r>  tuple[BaseSchedulerNode, ...]r   r  )r   r  r+  r  r   r   r   typingcastr$  score_fusion_memoryr  )rl   r   r   other_nodesr
  r  s         rm   sub_node_can_fuse)FusedMixOrderReductions.sub_node_can_fuse  s    e%<====e%<====
 ~~&&uu&U //
 
#66u==	<	H0	H	H ~.1Ek1RR{+.BE>.RR ""$$ {{T^^77RW7X zz	
rp   c                   [        U[        5      (       dU  U R                  U R                  XR                  45      =(       d'    U R                  U R                  XR                  45      $ U R                  U R                  UR                  U R                  UR                  45      =(       a/    U R                  U R                  UR                  [        5       5      $ r|   )r   r  r  r   r   r   rl   others     rm   can_fuse_with%FusedMixOrderReductions.can_fuse_with  s    %!899))

EJJ= J''

EJJ=IJ ))

EKK$**ekk)B K((U[[%'JKrp   c                b   U R                   R                  5       nU R                  R                  U5      n[	        U[
        5      (       aW  UR                  U R                   UR                   5      nUR                  U R                  UR                  5      n[        XE5      $ U R                  U R                   XR                  45      (       a1  UR                  U R                   U5      n[        X`R                  5      $ UR                  U R                  U5      n[        U R                   U5      $ r|   )	r   r   r+  r  r   r  rt   r   r  )rl   r  rc  backendfused_node1fused_node2r  s          rm   	fuse_with!FusedMixOrderReductions.fuse_with  s    &&(..,,V4e455!,,tzz5;;?K!,,tzz5;;?K*;DD%%djj%**GG$\\$**e<
.z::FF$\\$**e<
.tzz:FFrp   )r   r   r  r  )r   r_   r   r_   r  r  )r  r_   )
r~   r   r   r   r  r  r  r"  r   r  r  s   @rm   r  r  r  s<    
=2
 2
 !2
 3	2
h
KG Grp   r  c                  z   ^  \ rS rSr        SU 4S jjr\      S	S j5       rS
S jrSS jrSS jr	Sr
U =r$ )$FusedExternTritonKernelSchedulerNodei  c                0  > [        UR                  [        R                  5      (       d   e[        R
                  " [        [           X#/5      n[        TU ]%  X5        X l
        X0l        U R                  R                  U l        UR                  U l        g r|   )r   r   r+   r  r  r  r  r_   r  r  kernel_nodefused_epiloguer  r  )rl   r+  r'  r(  r   r  s        rm   r  -FusedExternTritonKernelSchedulerNode.__init__  sv     +**B,F,FGGGGT"34{6ST+&,))33%--rp   c                   UR                   n[        UR                  5      S:X  d   eUR                  [	        [        UR                  5      5      R                     nUR                  R                  [        U5      5        [        X1U5      $ Nr   )r+  r   r  r[  r<  rd  r   r0  removeNodeUserr%  )rs   r   r   r+  original_mutated_buffers        rm   epilogue_fuse2FusedExternTritonKernelSchedulerNode.epilogue_fuse  sy     OO	 5++,111"+"7"7e../055#
 	 %%,,Xe_=3IeLLrp   c                   [        U R                  R                  [        R                  5      (       d   e[        U R
                  R                  [        R                  5      (       d   eU R
                  R                  R                  5       (       d   e[        R                  " U R
                  R                  R                  S   R                  5      nSSKJn  UR                  U R                  /U5      u  pESSKJn  U" U R                  /U5      nSSKJn  U" XGU 5      n	U	R'                  5       n
U R
                  R                  R)                  XR                  R                  U
45      $ )Nr   r  )SIMDKernelFeatures)FusedUserDefinedTritonKernel)r   r(  r   r+   r   r'  r  r  r  r   r  r  torch._inductor.codegen.simdr  get_tiling_and_scores,torch._inductor.codegen.simd_kernel_featuresr2  torch._inductor.codegen.tritonr3  r  codegen_with_epilogue_fusion)rl   r  r  r  tilingr  r2  kernel_featuresr3  fused_user_kernelnew_kernel_srcs              rm   r  ,FusedExternTritonKernelSchedulerNode.codegen  s#   $--22B4E4EFFFF$**//1K1KLLLL$$668888		$**//<<Q?EEF?"88$:M:M9NPUV	S,d.A.A-BEJO8RVW*224$$AA))..?
 	
rp   c                    gr  rz   rk   s    rm   ru  .FusedExternTritonKernelSchedulerNode.is_extern	  r1  rp   c                6    U R                   R                  5       $ r|   )r'  r  rk   s    rm   r  /FusedExternTritonKernelSchedulerNode.get_ranges		  s    **,,rp   )r(  r'  r  r  )r+  r*  r'  r  r(  r   r   rv  )r   r  r   r   r   r   r  rw  r  )r~   r   r   r   r  r   r/  r  ru  r  r   r  r  s   @rm   r%  r%    sw    .. /. &	.
 
. M(M M 
	M M 
,- -rp   r%  c                  z  ^  \ rS rSr% Sr    SS jr    SS jr\SS j5       r\      SS j5       r	   S             SU 4S jjjr
\    SS j5       r\    SS	 j5       r\rS
\S'   \    SS j5       r\    SS j5       rSS jrSS jrS S jrS!S jrS"S jrS#S jr    S$S jrSrU =r$ )%ForeachKernelSchedulerNodei	  z
This is a schedular node that consists of a set of scheduler nodes that
has no data dependencies among them and can be executed in parallel.
c                    UR                  5        H@  nUR                  5       U R                  ;   d  M#  U R                  UR                  5          s  $    g r|   )r  r3  read_to_node)rl   producerr   s      rm   get_consumer_subnode_for3ForeachKernelSchedulerNode.get_consumer_subnode_for	  sG     '')C||~!2!22((88 * rp   c                   [         [           " 5       nUR                  R                   H  nUR                  U R
                  R                  ;  a  M)  U R
                  R                  UR                     R                  5       nX@R                  ;   d  Mk  UR                  U R                  U   5        M     [        U5      S:X  a  [        [        U5      5      $ g r+  )r   r_   r   r   r   r+  r[  r5  name_to_noder  r   r<  rd  )rl   consumer	producersrd	node_names        rm   get_producer_subnode_for3ForeachKernelSchedulerNode.get_producer_subnode_for	  s     013	&&,,Bwwdnn88822277;LLNI---d//	:; - y>QY((rp   c                  ^ [        TU5      nTR                  5       (       a  UR                  5       (       a  [        R                  " [        T5      m[        R                  " [        U5      n[        TR                  5      [        UR                  5      :H  nU(       d  U" S5        U=(       a3    [        U4S j[        TR                  UR                  5       5       5      $ UR                  5       (       ar  TR                  5       (       a	  U" S5        g[        R                  " [        U5      nUR                  T5      nUb  UR                  R                  TU5      $ U" S5        gTR                  5       (       aq  UR                  5       (       a	  U" S5        g[        R                  " [        T5      mTR                  U5      nUb  TR                  R                  Xb5      $ U" S5        g[        S5      e)	Nzforeach do not have same lengthc              3  ^   >#    U  H"  u  pTR                   R                  X5      v   M$     g 7fr|   )r+  r  )r   lrrF  s      rm   r   6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>8	  s.      )ADA ""++A11A   *-zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  rx  r  r  rC  r   r   r   rG  r   rG  r+  r  rO  r  )rs   rF  rK  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rm   r  #ForeachKernelSchedulerNode.can_fuse/	  s   (+  X%8%8%:%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    ""$$&&n {{#=xHH'@@J+))228=MNNGH  ""$$&&n {{#=xHH'@@J+))223CNNGHf
 	
rp   c           	     `   UR                  5       (       d  UR                  5       (       d   eUR                  5       (       a4  [        R                  " [        U5      nUR                  nUR
                  nO3[        R                  " [        U5      nUR                  nUR
                  nS nS nUR                  5       (       a  UR                  5       (       a  [        R                  " [        U5      n[        R                  " [        U5      n[        UR                  UR                  5       VVs/ s H  u  px[        R                  Xx5      PM     n	nnGO?UR                  5       (       a  [        R                  " [        U5      nUR                  U5      n
/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     OUR                  5       (       a  [        R                  " [        U5      nUR                  U5      n/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     O[        S5      eU " UR                  U	UUUUS9$ s  snnf )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rx  r  r  rC  r]  r`  rG  r   r   rt   rO  r   rG  r  r+  )rs   rF  rK  r]  r`  r^  r_  rS  rT  fused_nodesrZ  r   new_noderY  s                 rm   rt   ForeachKernelSchedulerNode.fuse^	  s\    ""$$(;(;(=(===  {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O  X%8%8%:%:{{#=xHH{{#=xHH  AADA #''-A  K   ""{{#=xHH'@@JK"KK +166tFH"*K&&x0&&t, (   ""{{#=xHH'@@JK"KK +166xFH"*K&&x0&&t, ( !f  &?##+
 	
Ks   0!J*c                  >^  0 T l         0 T l        Ub  Ucv  [        TT ]  X5        U H_  nUR                  R
                   H  nUT R                   UR                  '   M     UR                  5        H  n	UT R                  U	'   M     Ma     GONUT l        UT l	        S T l
        / T l        T R                  [        R                  R                  UR                  UR                  /5      5        [!        U 4S j[         R"                  " UR$                  UR$                  5       5       5      T R                  R&                  -
  T l        [)        UR*                  UR*                  /5      T l        [-        UR.                  UR.                  /5      T l        [)        UR0                  UR0                  5      T l        [-        UR2                  UR2                  5      T l        UR5                  5       (       a  [7        U[8        5      (       d   eXEpO[7        U[8        5      (       d   eXTpU
R:                  T l        T R:                  R=                  UR:                  5        U
R                  T l        UR                  5        H  n	UT R                  U	'   M     T R                   VVVs0 s H(  oR>                  RA                  5         H  u  pX_M	     M*     snnnT l        UT l!        US   RE                  5       nU(       d   eU[F        RH                  " S5      444T l%        [         [L        RN                  RP                     " 5       T l)        UT l*        g s  snnnf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fr|   r{  r  s     rm   r   6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>	  s5        xxt'<'<'>>	 C r~  r   combo_kernel)+rE  rJ  r  r  r   r   r   r   r+  r   r   r0  r  r*   r  r  r   r  r  r  r  r  r  r  r  r  rx  r   rC  r   r  r  itemsr]  r   r   Exprr   r  fxNoder  r`  )rl   r+  r   r]  r^  r_  r`  r   r  r   foreach_noder   rg  r  vrc  r  s   `               rm   r  #ForeachKernelSchedulerNode.__init__	  s    +"5GY/ ,,22D37D%%dii0 3 !446D.2D%%d+ 7	  'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN&)..0N0N'D# '*..0N0N'D# %%''!+/IJJJJ+6j!+/IJJJJ+6j)33DNNN!!*"6"67 , 9 9D"668*4!!$' 9 #'++@"-:O:O:U:U:W$!:W+@D  *C&%%'v

> :<>?
!%((--02.@s   "/Mc           
        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       aW  [        R                  S[	        U5      U Vs/ s H+  oDR
                  c  M  UR
                  R                  5       PM-     sn5        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H,  n[        U[        [        [        [        45      (       a  M*  UPM.     nnU Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       a  M  UPM     nnU Vs/ s H  o"R                  5       (       d  M  UPM     n	nU	(       a   [        R                  S[	        U	5      U	5        U Vs/ s H  o"U	;  d  M
  UPM     nn[        R                  (       av  U Vs/ s H  o"R                  5       (       d  M  UPM     n
nU
(       a  [        R                  S[	        U
5      5        U Vs/ s H  o"R                  5       (       a  M  UPM     nnU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d grouped nodes are filteredz;ComboKernels: %d FusedMixOrderReductions nodes are filteredz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %szCComboKernels: %d reduction nodes are filtered (pointwise_only mode))r   r  r  r  r   r   r  r  r  r  rC  rr  r(   combo_kernels_pointwise_onlyr   )rs   r>  r  externr   grouped	mix_orderfiltered_nodesforeach_nodestemplate_nodesreduction_nodess              rm   combinable_nodes+ForeachKernelSchedulerNode.combinable_nodes	  sl    #OUj4M&N!UOIIAF5;UVTyy(&&(VU
 $Kez!5I'J1eKII=G !&P1A7N)OQ	PIIMI 
*-(+	  	 
 &
%!A7Q)RA~ 	 
 IICSEWX%
%!Z;U-VA~ 	 
 &4G^}}!^GIIBN#
 &4O^7N!^O ..*8M.QNN<Lq.OM		Y( *8PA~~?OaNPy P
 VK Q



 H P N Qs   KKK#KK)KK8K*)K K #K% K%2K*K*K/7K/*	K47K4K94K9&K>K>c                d   U R                  5       n/ nSn[        U VVVs/ s H>  nU  H4  n[        U[        5      (       d  M  UR	                  5         H  nUPM     M6     M@     snnn5      nU GH/  n[        [        5      n	U Hi  nUR                  5       n
U
(       a"  U
R                  S:X  d  U
R                  S:X  a  M<  UR                  5       U-  (       a  MV  X   R                  U5        Mk     U	R                  5        H  n[        [        5      nU H0  nXR                  R                  US5         R                  U5        M2     UR                  5        H=  nUR                  [        S[!        U5      U5       Vs/ s H	  nXX-    PM     sn5        M?     M     GM2     U$ s  snnnf s  snf )zC
Returns a list of lists of nodes that are to be grouped together.
   mpsra  r   )_topological_sort_nodesr   r   r  rE  r   r  r   r   r   r   r   node_to_streamr  r  r  r   )r+  sorted_nodesgrouped_nodesmax_num_nodesr   r   rZ  excluded_buffer_namesr>  device_groupsrc  device_nodesstream_groupsstream_nodesr:  s                  rm   &_default_group_nodes_for_combo_kernelsAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels/
  s    !88:1; *)E!Dd$;<  !% 5 5 7H	  !8	 ! )2
 "E D!  *v{{e3v{{e7K ))+.CC%,,T2  !. 4 4 6DOPTDU(D!":":">">tQ"GHOOPTU )$1$8$8$:L!(( &+1c,.?%O%O )Q->?%O %;	 !7% ": K@s   F&"F& F-4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    U [         l        g r|   rC  r  )custom_group_algorithms    rm   %set_group_algorithm_for_combo_kernels@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernelsf
  s    
 # 	#Drp   c                ,    [         R                  U 5      $ r|   r  r+  s    rm   group_nodes_for_combo_kernels8ForeachKernelSchedulerNode.group_nodes_for_combo_kernelsn
  s     *KKIVVrp   c                    [         er|   r  rk   s    rm   r  #ForeachKernelSchedulerNode.mark_runt
  r  rp   c                    [         er|   r  rk   s    rm   r  "ForeachKernelSchedulerNode.codegenw
  r  rp   c                    gr  rz   rk   s    rm   rx  %ForeachKernelSchedulerNode.is_foreachz
  r1  rp   c                ,    [        U R                  5      $ )z]Returns a list of nodes which comprise the combo kernel.
These nodes may be vertically fused.)r  r   rk   s    rm   get_subkernel_nodes.ForeachKernelSchedulerNode.get_subkernel_nodes}
  s     DKK  rp   c                t    [        [        R                  R                  S U R                   5       5      5      $ )ziReturns all nodes contained in this kernel, unpacking fused nodes
into their constituent scheduler nodes.c              3  @   #    U  H  oR                  5       v   M     g 7fr|   )r   r  s     rm   r   7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>
  s     1UA++--r?  )r  r
  r  rH  r   rk   s    rm   r   $ForeachKernelSchedulerNode.get_nodes
  s(     IOO111U1UUVVrp   c                <    U R                   S   R                  5       $ r   )r   r9  rk   s    rm   r9  )ForeachKernelSchedulerNode.get_first_name
  s    {{1~,,..rp   c                    [        XU R                  R                  5        U R                   H  nUR	                  U5        M     g r|   )r0  r+  r[  r   r2  )rl   r1  r   s      rm   r2  /ForeachKernelSchedulerNode.prune_redundant_deps
  s5     	d8R8RSKKD%%&89  rp   )r   r`  r   r  r  r  r  rJ  r   r  r  rE  r+  r   r  r]  r0  )rF  r_   r   r,  )rK  r_   r   r,  rF  r_   rK  r_   r   r{   )rF  r_   rK  r_   r   rC  )NNF)r+  r*  r   rS  r]  r{   r^  r,  r_  r,  r`  r{   r   rv  r>  rS  r   rS  )r+  r*  r   list[list[BaseSchedulerNode]])r  r  r   rv  ru  rw  r   rS  rL  rs  rJ  )r~   r   r   r   r&  rG  rO  r   r  rt   r  rx  r'  r  r  r   r  r  r  r  rx  r  r   r9  r2  r   r  r  s   @rm   rC  rC  	  s   
)	!)	!& ,
 ,
\ >
(>
4E>
	#>
 >
J 1504 %L/L/ (L/ $(	L/
 .L/ .L/ L/ 
L/ L/\ ?+?	 ? ?B 00	&0 0h 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	: :rp   rC  c                     ^  \ rS rSr% SrS\S'   \SS j5       r S       SU 4S jjjrSS jr	SS jr
\SS	 j5       rSS
 jr\SS j5       rSS jr\SS j5       rSS jrSS jr\SS j5       rSrU =r$ )r  i
  a'  
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be *grouped* together (it does not allow another node to be scheduled
in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
Fusion will still happen among the nodes within each GroupedSchedulerNode.
At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
rS  r   c                   ^ US   R                   m[        U4S jU 5       5      (       d   eU " TU5      nU H   nUTR                  UR                  5       '   M"     UTR                  UR                  5       '   U$ )Nr   c              3  >   >#    U  H  oR                   TL v   M     g 7fr|   r  )r   r   r+  s     rm   r   .GroupedSchedulerNode.create.<locals>.<genexpr>
  s     B64>>Y.6s   )r+  r   r1  r3  )rs   r   grouped_snoderg  r+  s       @rm   createGroupedSchedulerNode.create
  su    1I''	B6BBBBBIv.E=JI(()9: AN	$$]%;%;%=>rp   c                H   > [         TU ]  U5        [        XU5        X0l        g r|   )r  r  r  temp_grouping)rl   r+  r   r  r  s       rm   r  GroupedSchedulerNode.__init__
  s$     	#0 +rp   c                B   U R                   (       a  U R                  $ U R                   H)  nXR                  R                  UR	                  5       '   M+     U R                  R                  U R	                  5       	 U R                  R                  U R                  5      $ )zw
Do fusion among nodes within this GroupedSchedulerNode,
and then unpack this GroupedSchedulerNode into regular nodes.
)r  r   r+  r1  r3  
fuse_nodes)rl   rg  s     rm   unpackGroupedSchedulerNode.unpack
  so    
 ;;[[EBGNN--enn.>? !NN--dmmo>~~((55rp   c                    U R                  U R                  R                  U5      5        U R                  R	                  U5        g r|   )r  r   r  r  r  )rl   fake_deps     rm   r  !GroupedSchedulerNode.add_fake_dep
  s5    T--77AB##H-rp   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf r  r  r  s     rm   r3  GroupedSchedulerNode.get_name
  r  r  c                <    U R                   S   R                  5       $ r   r  rk   s    rm   r9  #GroupedSchedulerNode.get_first_name
  r  rp   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf r|   r  r  s     rm   rE  %GroupedSchedulerNode.get_buffer_names
  r  r  c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ r|   r  r  s      rm   r   GroupedSchedulerNode.get_outputs
  r  rp   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fr|   r  r=  s     rm   r   6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>
  r  r  r   r  r  s      rm   r	  #GroupedSchedulerNode.estimate_flops
  r  rp   c                    U R                   $ r|   r  rk   s    rm   r   GroupedSchedulerNode.get_nodes
  r/  rp   c                b    U R                   (       a  U R                   S   R                  5       $ S $ r   )r   r   rk   s    rm   r   GroupedSchedulerNode.get_device
  s$    .2kkt{{1~((*CtCrp   c                    gr  rz   )rs   rF  rK  s      rm   r  GroupedSchedulerNode.can_fuse
  r  rp   )r  )r   rS  r   r  )F)r+  r*  r   rS  r  r{   r   rv  r  )r  r3   r   rv  rs  rI  r  rO  rL  ry  r  )r~   r   r   r   r&  r   r   r  r  r  r  rI   r3  r9  rE  r  r	  r   r   r  r   r  r  s   @rm   r  r  
  s     $#  $	++ (+ 	+
 
+ +6. = =) N N  "D  rp   r  c           
     0  ^ ^ [         R                  SUU 4S jj5       n[        [        [	        [        T S   5      5      5      5      n[        U5      S:  a  U Vs/ s H  nT U   PM
     snm [        R                  (       a  UR                  US9  U$ s  snf )zu
A heuristic to decide loop iteration orders.  This has not been well
tuned and may be something we should autotune.
c                z  > TU    S:X  d	  TU   S:X  a  [        TU    S:H  TU   S:H  5      $ T Vs/ s H  n[        X    5      PM     nnT Vs/ s H  n[        X!   5      PM     nn[        S [        X45       5       5      n[        S [        X45       5       5      nXV:  a  gXe:  a  g[        X5      $ s  snf s  snf )Nr   c              3  F   #    U  H  u  pUS :H  =(       d    X:  v   M     g7fr   Nrz   r   sl_asl_bs      rm   r   5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  $      
7VDAI$$7V   !c              3  F   #    U  H  u  pUS :H  =(       d    X!:  v   M     g7fr  rz   r  s      rm   r   r    r  r  r  )rJ   absr   rG  )	r`  bslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          rm   	index_cmp"pick_loop_order.<locals>.index_cmp  s    8q=E!HMuQx1}eAh!m44 .<<^rBE
^<-;<^rBE
^<  
7:<7V
 
  
7:<7V
 
  1y# =<s   B3B8r   r  )r`  r$  r  r$  r   r$  )		functools
cmp_to_keyr  r   r  r   r(   pick_loop_orderssort)r  r  priority_idxr  orderpis   ``    rm   pick_loop_orderr  
  s      4 %N1$5 6789E
<17CD|.,|D

y
!L Es   Bc                   UR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   eUR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   e[        R
                  R                  U	 X1l        [        R
                  R                  U	 XQl	        [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   g r|   )r3  r   r"  r5  rZ   r   r  r   
name_to_opoperation_namebuffersr   r,  
operations)	orig_noderb  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rm   _replace_operation_bufferr  &  s]    !))+&&(MmS))j9JC.P.PPP224//1LlC((Z8H#-N-NNN	01!M	+,*77??  +DGGOO8$$AGGOOD,4AGG=)77##I.DGGh''AGGt'/AGG|$rp   c                j    U R                  5       nUR                  5       nXC-
  nXS-  nUSU-   -  nXr-  $ r+  )r  r  )r   r   epilogue_runtimetemplate_write_bytesepilogue_read_bytesextra_bytesextra_bytes_ratioextra_memory_ratios           rm    _estimate_fused_epilogue_runtimer  B  sO     779557%<K $: +a2C.CD00rp   c                    US:  a  gUR                   nUc  gU(       d   eX4R                  =(       d    S-  nX-  nX-  nXW-  n	XX-  n
X4$ )Nr{  )r   r  )r   r       )regs_per_multiprocessor	warp_size)unfused_n_regsfused_n_regsfused_n_spills	num_warpsdevice_propsregs_per_smthreads_per_blockregs_per_block_unfusedregs_per_block_fusedblocks_unfusedblocks_fuseds              rm   "_occupancy_before_and_after_fusionr  O  sl      66K9!%;%;%ArB+?'; :N6L''rp   c                    SnSn[        X#XEU5      u  pUSU -  :  =(       a    U
S:  nU
S:g  =(       a    X:  =(       d    X-  U:  =(       d    U$ )z=
Determine whether to fuse an epilogue into a GEMM template.
r  g      ?r   r   r  )r  )ms1ms2r  r  r  r  r  MIN_ACCEPTED_OCCUPANCYREGRESSED_OCCUPANCY_RATIOr
  r  ,epilogue_dominated_with_sufficient_occupancys               rm   _fuse_epiloguer  j  su      # $Fn$ N 47S=3U\TUEU0
 2 . 	8(+DD	87rp   c                  d    \ rS rSr% S\S'   SrS\S'   SrS\S'   SS jrSS	 jrSS
 jr	SS jr
Srg)r-  i  BaseSchedulerNode | OutputNoder   Fr{   r}  is_weakc                v    [        U R                  R                  5       U R                  U R                  45      $ r|   )r8  r   r3  r}  r  rk   s    rm   r9  NodeUser.__hash__  s+    TYY'')4+;+;T\\JKKrp   c                    [        U[        5      =(       aa    U R                  5       UR                  5       :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ r|   )r   r-  r3  r}  r  r  s     rm   __eq__NodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
rp   c                6    U R                   R                  5       $ r|   rK  rk   s    rm   r3  NodeUser.get_name  rM  rp   c                    U R                   UR                   L d   e[        U R                   U R                  =(       a    UR                  U R                  =(       a    UR                  5      $ r|   )r   r-  r}  r  r  s     rm   ri  NodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rp   rz   Nrt  )r  objectr   r{   rs  )r  r-  r   r-  )r~   r   r   r   r   r}  r  r9  r  r3  ri  r   rz   rp   rm   r-  r-    s3    
((K GTL
$
rp   r-  c                 "    [         R                  $ r|   )r(   r}  rz   rp   rm   *used_non_deterministic_runtime_estimationsr"    s    333rp   c                   [        5       nU R                  5       n[        U[        R                  5      (       a  UR                  [        UR                  5      [        UR                  5      -  [        UR                  5      -  5        [        U[        R                  5      (       a$  UR                  [        UR                  5      5        U$ Ub
   SU 35       eU$ )z=Get free symbols from a node's layout (size, stride, offset).z*Expect layout to be None but found layout=)r   maybe_get_layoutr   r+   Layoutr  r"   r   strideoffsetr  get_layout_symintsr  )r   free_symbol_usesr@  s      rm   r(  r(    s    1;""$F&"))$$%6==)*6==)*	

 fb;;<<##$6v}}$EF  ~T!KF8TT~rp   c                .   [        U [        5      (       a(  [        5       R                  " S U R                   5       6 $ U R
                  c   eU R
                  R                  5       nUR                  " S U R
                  R                  5        5       6   U$ )z{
Gets symbols used in a scheduler node, including free symbols from
the node's operations and layout symints from outputs.
c              3  8   #    U  H  n[        U5      v   M     g 7fr|   get_scheduler_node_symbol_uses)r   rg  s     rm   r   1get_scheduler_node_symbol_uses.<locals>.<genexpr>  s     M,U33r  c              3  8   #    U  H  n[        U5      v   M     g 7fr|   )r(  )r   ir_nodes     rm   r   r.    s     	M5L'
W
%
%5Lr  )	r   r   r   r  r   r   get_free_symbol_usesr  r  )r   r)  s     rm   r-  r-    s     $*++|!!MM
 	
 99   yy557	MTYY5J5J5L	M rp   c                z    U R                  5       nUb  UR                  b  UR                  $ [        R                  $ z4Check per-template flag, fall back to global config.)r/  allow_epilogue_fusionr(   epilogue_fusionrA  tbs     rm   _is_epilogue_fusion_enabledr8    8    		(	(	*B	~"22>'''!!!rp   c                z    U R                  5       nUb  UR                  b  UR                  $ [        R                  $ r3  )r/  allow_prologue_fusionr(   prologue_fusionr6  s     rm   _is_prologue_fusion_enabledr=    r9  rp   c                ~    U R                  5       =(       a'    UR                  5       (       + =(       a    [        U 5      $ r|   )rr  r8  r   s     rm   is_epilogue_fusionr?    4     	/!!##	/'.rp   c                ~    UR                  5       =(       a'    U R                  5       (       + =(       a    [        U5      $ r|   )rr  r=  r   s     rm   is_prologue_fusionrB    r@  rp   c                <    [        X5      =(       d    [        X5      $ r|   )r?  rB  r   s     rm   is_template_fusionrD    s    e+O/A%/OOrp   c                *    [        X5      (       a  U$ U $ r|   )r?  r   s     rm   template_fusion_pw_noderF    s    &u445?%?rp   c                    ^  \ rS rSrSrSlS jrSlU 4S jjrSmS jrSnS jrSoS jr	SpS jr
SqS	 jr\SrS
 j5       r\R                  SsS j5       rSnS jrStS jrSuS jrSnS jrSnS jrSnS jrSnS jrSvS jr    SwS jrSnS jrSxS jrSyS jrSnS jrSnS jrSnS jrSwS jrSnS jr    SzS jr  S{       S|S jjr!      S}S jr"    S~S  jr#SnS! jr$          SS" jr%SS# jr& S{     SS$ jjr'      SS% jr(SS& jr)        SS' jr*        SS( jr+      SS) jr,          SS* jr-    SS+ jr.    SS, jr/      SS- jr0S{SS. jjr1SS/ jr2      SS0 jr3      SS1 jr4      SS2 jr5        SS3 jr6      SS4 jr7        SS5 jr8      SS6 jr9      SS7 jr:SS8 jr;        SS9 jr<      SS: jr=  S         SS; jjr>      SS< jr?        SS= jr@SS> jrA        SS? jrBSSS@ jjrC   S           SSA jjrD      SSB jrE      SSC jrF    SSD jrG    SSE jrHSnSF jrISnSG jrJSnSH jrK    SSI jrLSSJ jrMSSK jrNSSL jrO      SSM jrPSSN jrQ\RSSO j5       rS    SSP jrT  SSQ jrU    SSR jrV      SSS jrW      SST jrX    SSU jrY    SwSV jrZ    SwSW jr[    SwSX jr\  SSY jr]      SSZ jr^SS[ jr_SnS\ jr`      SS] jra      SS^ jrb      SS_ jrcSnS` jrdSSa jre    SSb jrfSSc jrgSSd jrhSnSe jri\SSf j5       rj\SSg j5       rkSSh jrlSnSi jrmSSj jrnSkroU =rp$ )r*  i  z
A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
optimizations such as fusion, reorder, and graph partition.
c                p    [        S5         U R                  U5        S S S 5        g ! , (       d  f       g = f)NzScheduler.__init__)r   _initrl   r>  s     rm   r  Scheduler.__init__  s#    ./JJu 0//s   '
5c           
     
  >^  [         TT ]  5         T [        R                  l        0 T l        [        [        5      T l        [        R                  " 5       T l        [        5       T l        [        / [        R                  R                  R                  5       Q[        R                  R                   R                  5       Q[        R                  R"                  R                  5       Q5      T l        U Vs/ s H  nT R'                  U5      PM     snT l        S T l        S T l        T R/                  5         T R$                  R1                  [        R                  R                   R                  5       5        T R(                   H  nUR3                  5         M     S T l        T R7                  5       T l        T R(                   Vs0 s H  o"R;                  5       U_M     snT l        T R(                   VVs0 s H*  o3R?                  5         H  oDR;                  5       U_M     M,     snnT l         T R<                  RC                  5       T l"        0 T l#        0 T l$        [        5       T l%        [L        RN                  " T R(                  T R@                  T RD                  5      T l        T RQ                  5         T RS                  T R(                  5      T l        T RU                  5         T R(                   Vs0 s H  o"R;                  5       U_M     snT l"        T RW                  5         T RY                  5         [Z        =R\                  [_        T R(                  5      -  sl.        SSK0J1nJ2n  U" T R(                  5        [_        T R(                  5      T l3        T Ri                  5         T RS                  T R(                  5      T l        [        [j        [l        [l        4      " 5       T l7        [p        Rr                  b%  [p        Rr                  " T R(                  5      T l        [p        Rt                  (       a'  SSK;J<n  UR{                  T 5        T RW                  5         0 T l>        0 T l?        ST l@        0 T lA        T R                  5         T R                  T R(                  5      T l        [p        R                  b%  [p        R                  " T R(                  5      T l        [        S T R(                   5       5      (       a  T RU                  5         T R                  5         T R                  5         [p        R                  (       d  [p        R                  (       aA  [        5       (       a2  [        R                  R                  R                  R                  5         [p        R                  (       a#  [        SSSS	9   T R                  S S
9  S S S 5        T R                  5         [p        R                  (       a  SSKUJTn  U" T R(                  T R@                  T RD                  [        [        R                  R                  R                  5       5      [        [        R                  R                  5       5      5      T l        [p        R                  (       Gd;  [p        R                  (       Ga%  [p        R                  (       d#  SSKUJYn	  U	" T R(                  T R@                  5        [        5       (       a  [        R                  (       a|  [p        R                  (       d  [        R                  (       aR  Sn
T R(                   H!  n[        UR                  5      (       d  M  Sn
  O   U
(       a  SSK&Jan  U" T R(                  5        [        R                  (       a  SSKcJdn  U" SS U 4S jS9  [L        R                  " T R(                  5      T l        T R                  5         [p        R                  (       a~  [p        R                  R                  (       a_  [p        R                  R                  (       a@  T R                  T R(                  5      T l        T R                  T R(                  5      T l        T R                  5         [        R                  Rp                  R                  R                  (       a  T R                  5         U" T R(                  5        [        R                  R                  T R(                  5        T R                  5         [        5       T lt        0 T lu        S T lv        [        S5      R                  U 4S j5        [        5       T ly        g s  snf s  snf s  snnf s  snf ! , (       d  f       GN= f)Nr   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotuneFc              3  B   #    U  H  n[        U[        5      v   M     g 7fr|   )r   r%  r=  s     rm   r   "Scheduler._init.<locals>.<genexpr>q  s"      
" tABB"   z#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffers)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     SSS.$ )N#scheduler_nodes_before_comm_overlapstring)r   encodingrz   rz   rp   rm   r  !Scheduler._init.<locals>.<lambda>  s     E$,)rp   c            
        > SR                  [        TR                  5       V Vs/ s H0  u  pSU  S3UR                  5       -   SUR	                  5        3-   PM2     snn 5      $ s  snn f )Nz

zsnode[r>  z buffer_names:)r*  r=  r>  rH  rE  )r:  rL  rl   s     rm   r  r`    so    v{{
 )2$**(=	 )> %QCqMkkm, .q/A/A/C.DEF )>	(s   7A$
)metadata_fn
payload_fngraph_statsc                 ^   > T R                   T R                  [        T R                  5      S.$ )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r>  rk   s   rm   r  r`    s%     33+/+>+>*-djj/rp   )zr  r  rZ   r   r+  backendsr<  _post_grad_graph_counterri  r
  count_graph_partition_counterr   r  r  r  	constantstorchbind_constantsr  create_scheduler_noder>  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersrZ  r3  rJ  r  r[  copyr1  r  r  seen_template_fusionsr'   decide_global_ordering_of_commsr]   topological_sort_scheduledead_node_eliminationcompute_ancestorscompute_input_distancesr,   ir_nodes_pre_fusionr   torch._inductor.debugrM  rN  rj  create_foreach_nodesr   r"  logged_slow_fusionr(   _pre_fusion_custom_passdistributed_max_autotune_gemmr  rO  scheduler~  buff_to_stream_multi_stream_nodesstream_idx_to_user_obj_idx_populate_stream_assignmentsr  _post_fusion_custom_passr   r   finalize_multi_template_buffersmax_autotune_gemmmax_autotuner   r  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodes_enforce_conditional_orderingrW  memoryget_output_namesdeterministic reorder_for_compute_comm_overlaprX  r"  r)   6runtime_estimations_align_across_all_distributed_ranksr}  r  rS   r   rY  reorder_sink_verbose_loggingtorch._loggingrZ  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   r^   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_index_current_stream_ctxr   add_rowremoved_ops)rl   r>  rL  r   r   rM  rN  rO  rW  rX  has_collectivesrY  rZ  r  s   `            rm   rI  Scheduler._init  s    <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCUd003UC
7;6:'')##**177+<+<+A+A+CDJJDOO  <@# $$& 	# &*ZZ;
%/JJL!OZ;

 -1JJ8
,6DBRBRBT3LLNCBTNJ8
 AE@Q@Q@V@V@X 35 13 L 	"
 ::JJ##

 	!!#33DJJ?
""$<@JJ"GJq::<?J"G $$& 	##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ//. ))$/""$ =?.0). :<'))+__TZZ0
**688DDJ 



 
 
 &&(,,.$$(;(;&((OO,,AASSU5&* $
 ..D.A 	**, ))70

  ''177//44671773356DJ ###(O(O(O11UAJJ 0 0
 ;<< WW<<#PP #( JJD$TYY//*. ' # K4::V 88; !  CCDJJODJ""$ ""((CCDDTZZPDJJJ4::VDJ!??!!..EE**,4::&	djj) 6@\! :< GK '//	
 -7LA D;
8
H #Hh s$   6ee#61e(5e.;e33
fc                   0 n[         R                  R                   Hg  n[        [         R                  R                  U   [        R
                  5      (       d  M?  [        U [         R                  R                  U   S S9X'   Mi     U$ )N)r-  )rZ   r   graph_inputs_originalr   r+   DonatedBufferr~  )rl   name_to_donated_bufr   s      rm   rv  Scheduler.get_donated_buffers  sl     GG11D!''77=r?O?OPP,BGG11$7 $-#) 2 #"rp   c                  ^
 SSK Jm
  0 n[        R                  " S5      nU R                   H  nT
nUR
                  bC  UR
                  R                  5       nUb&  XQ;  a  [        U5      nXaU'   XPR                  U'   X   nX@R                  U'   UR                  5        H  nX@R                  U'   M     M     [        U
4S jU R                  R                  5        5       5      (       a  [        S U R                   5       S5      nUb  U R                   H  nUR
                  n	UR                  5       b  M"  [        U	[         R"                  5      (       d  MC  [        U	R$                  [         R&                  5      (       d  Mn  [         R&                  " US9U	l        M     [        U
4S jU R                  R                  5        5       5      U l        g)a  Populate node_to_stream and buff_to_stream from IR node stream_idx.

Reads the stream_idx field set on IR nodes during lowering to determine
which stream each scheduler node should run on. This field is propagated
from 'custom.stream' FX node metadata via IRNode.current_stream_idx().
r   )DEFAULT_STREAM_IDXNc              3  ,   >#    U  H	  oT:g  v   M     g 7fr|   rz   )r   r  r  s     rm   r   9Scheduler._populate_stream_assignments.<locals>.<genexpr>'  s     M0L1&&0Ls   c              3  f   #    U  H'  oR                  5       c  M  UR                  5       v   M)     g 7fr|   r   rK  s     rm   r   r  )  s      RA||~s   11rc  c              3  .   >#    U  H
  nUT:g  v   M     g 7fr|   rz   )r   
stream_idxr  s     rm   r   r  7  s      '
:
 ,,:s   )stream_constantsr  r
  rm  r>  r   get_stream_idxr<  r  r~  rE  r  r   r   r   r   r+   Bufferr@  r?   r  )rl   user_obj_to_stream_idxstream_idx_counterr   r  user_obj_idxnew_stream_idxr   rc  r0  r  s             @rm   r  &Scheduler._populate_stream_assignments  s~    	9 24&__Q/JJD+Jyy$#yy779+#A)-.@)A?M|<JV77G!7!EJ(2% ,,.+5##C( /! , M0C0C0J0J0LMMMRRTXF ! JJD"iiG)1&w		::&w~~r}}EE *,f)E ' $' '
"1188:'
 $
 rp   c                    U R                   $ )z7Check if any nodes are assigned to non-default streams.)r  rk   s    rm   _has_multi_stream_nodes!Scheduler._has_multi_stream_nodes<  s    '''rp   c                    U R                   R                  X5      nU R                  R                  X R                  R                  US5      5      $ )zAReturn the stream index for a buffer, resolving mutation renames.r   )r  r  r  )rl   rZ  reals      rm   get_buf_streamScheduler.get_buf_stream@  sB    $$((<""&&t-@-@-D-DXq-QRRrp   c                    U R                  5       (       d  gU R                  U5      U R                  R                  US5      :g  $ )zTrue if buf_name was produced on a different stream than node.

Resolves mutation renames so that mutated buffers inherit the
stream of their original definition.
Fr   )r  r  r~  r  )rl   rZ  r   s      rm   r  !Scheduler.has_cross_stream_hazardE  s?     ++--""8,0C0C0G0Ga0PPPrp   c                6    [         R                  R                  $ r|   rZ   r   current_devicerk   s    rm   r  Scheduler.current_deviceO  s    ww%%%rp   c                .    U[         R                  l        g r|   r  rb  s     rm   r  r  S  s    !'rp   c                |    [         R                  R                  SS5      S:X  a  SSKJn  U" U R
                  SS9  gg)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r  r  r>  )rl   r  s     rm   r  Scheduler.debug_draw_graphW  s1    ::>>:DASH+6 Irp   c                    [         R                  [        R                  5      (       a:  [         R	                  SU5        U R
                   H  nUR                  5         M     g g )Nz%s:)r  isEnabledForloggingINFOr  r>  r  )rl   labelr   s      rm   debug_print_nodesScheduler.debug_print_nodes^  sD    GLL))HHUE"

  " # *rp   c                P   UR                  5       c   S5       eUR                  5       (       a  [        X5      $ [        U[        R
                  [        R                  45      (       a  [        X5      $ [        U[        R                  5      (       a  [        X5      $ [        U5      e)Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r   r+   r   r  r   rp  r  r  r  s     rm   rq  Scheduler.create_scheduler_noded  s    !- 	
@	
- ==??)$55r00"2C2CDEE ,,boo..,T88%d++rp   c                   [        5       n/ nU R                  R                  5       n[        R                  R
                  R                  5        H  nU Vs/ s H0  nXS;   d  M
  [        U R                  U   [        5      (       a  M.  UPM2     nnU(       d  MI  UR                  U5        U Vs/ s H  oPR                  U   PM     nn[        R                  S:  n[        U USUS9nUR                  U5        U H  nXR                  U'   M     M     U R                   V	s/ s H  oR!                  5       U;  d  M  U	PM     sn	[#        U5      -   U l        g s  snf s  snf s  sn	f )Nr   Fr]  r`  )r   r1  r  rZ   r   listsr   r   rJ  r  r  r(   combo_kernels_autotunerC  r   r>  r3  r  )
rl   removed_node_namesfe_nodeskept_node_namesnamesr   r   r`  fe_noder   s
             rm   r  Scheduler.create_foreach_nodesq  sK   .8l11668WW]]))+E "!D*  #4#4#4T#:<RS !   %%e,:?@%$''-%F@$;;a?O0*/ /	G OOG$07''- 1 ,8 "ZZ
'T==?BT+TDZ
N
5 A
s$   	E# EE-E E ;E c                X  ^ ^'^(^)  " U'4S jS[         [           5      m'[        R                  " T'5      m(T R                   H  nUR                  5        H  nUR                  5       n[        UR                  R                  [        R                  5      (       a  [        UR                  5       5      S:  a  Me  UR                  5        HW  nUT(;   a6  UT(;   a0  T(U   nT(U   nXV-   nT( H  nT(U   UL d
  T(U   UL d  M  UT(U'   M     M?  UT(;   a
  T(U   T(U'   MO  T(U   T(U'   MY     M     M     SU)U 4S jjm)  S         SU(U)4S jjjn	0 n
[        R                  R                   R#                  5        H  n[        U[$        R&                  5      (       a  UR(                   H  nSX'   M	     M;  [        U[        R*                  5      (       d  M\  UR-                  5        Vs/ s H&  n[        U[$        R&                  5      (       d  M$  UPM(     nnU H  nUR(                   H  nSX'   M	     M     M     SnT R                   Hz  nUR                  c   e[/        UR                  R1                  5       S S	9nU H?  n[        U[$        R2                  5      (       d   eS
nX;  d  M-  UR                  5       X'   MA     M|     T R                   GHd  n[4        R7                  SUR                  5        U(       a  UR                  c   e[/        UR                  R9                  S
S9S S	9nU Hi  nX;   d   U SU
 35       eX   =nc  M  T R:                  U   R                  5        H+  nUR=                  [?        UR                  5       5      5        M-     Mk     [        UR@                  RB                  5      S:X  aQ  [E        [G        UR@                  RB                  5      5      =n(       a"  [        U[H        5      (       a  URJ                  nOSnUR                  5        GHB  n[        URM                  5       5      S::  d   eURM                  5        GH
  nT)" U5      nU	" UU5        UR=                  [?        UUS95        T(U   RN                   H  nUR                  5       UR                  5       :X  a  M'  [        UR                  [P        5      (       d   eUR                  R                  5        Hc  nUR                  5       nT)" U5      nUUR                  5       ;   nUR=                  [S        UUR                  5       U(       + S95        U	" UUS
S9  Me     M     GM     GME     [        R                  RT                  UR                  5           H3  nU	" UUS
S9  UR=                  [S        UUR                  5       S
S95        M5     [        R                  RV                  UR                  5           H%  nU	" UUSS9  UR=                  [?        U5      5        M'     UR@                  RX                   H<  n[        U[R        5      (       a  M  U	" URZ                  XR]                  U5      5        M>     UR_                  T R`                  5        UR                  5        H  nURM                  5        Hz  nUR                  5       T R`                  T)" U5      '   UR                  5       T R`                  U'   T Rb                  Re                  UU5      T Rb                  UR                  5       '   M|     M     GMg     [        R                  Rg                  5        H4  n[4        R7                  SU5        U	" U[i        [?        U5      5      5        M6     U(       a  [        R                  Rj                   H  nUR9                  S
S9 H  nX;   d   U SU
Rm                  5        35       eX   =n(       d  M/  T R:                  U   Ro                  5        H5  n[4        R7                  SUU5        U	" U[i        [?        U5      5      5        M7     M     M     T R`                   H  nU[        R                  R                   ;   aF  U	" U[i        [?        U5      5      5        [        R                  Rp                  Rs                  U5        Mg  U[        R                  Rt                  ;   d  M  U	" U[i        [?        U5      5      5        M     [w        [        R                  R                   Rm                  5       5       V Vs0 s H	  u  n nUU _M     n!n n[        R                  Rp                   Vs/ s H  nU!U   PM
     sn[        R                  l<        T R                   HF  nUR                  5        H/  nUR{                  T(UR                  5          RN                  5        M1     MH     T R|                   H.  nT R|                  U   R{                  T(U   RN                  5        M0     [        5       n"U"R                  S5        T(RO                  5        Ha  u  nn#U"R                  5          U#RN                   V$s/ s H  n$U$R                  5       PM     n%n$U"R                  SU SU% S35        SSS5        Mc     U"R                  S5        U"R                  5       R                  5       n&[        R7                  S5        [        R7                  SU&5        gs  snf s  snn f s  snf s  sn$f ! , (       d  f       M  = f)zQ
Create dependency edges between nodes, handling aliasing and
mutation properly.
c                  P   > \ rS rSrSr  S     S	S jjrS
S jrSU 4S jjrSrg)1Scheduler.compute_dependencies.<locals>.DedupListi  a  
This data structure behaves like a list except it makes sure the
elements remain unique.
Normally one could use a OrderedSet/dict for this purpose however
the list in question gets elements appended as it is being
iterated over which means that we need to keep the list
semantics.
Nc                T    U=(       d    / U l         U=(       d
    [        5       U l        g r|   )rh  r   
membership)rl   rh  r  s      rm   r  :Scheduler.compute_dependencies.<locals>.DedupList.__init__  s    
 #[b
","<
rp   c                    XR                   ;   a  g U R                  R                  U5        U R                   R                  U5        g r|   )r  rh  r   r  )rl   	node_users     rm   r   8Scheduler.compute_dependencies.<locals>.DedupList.append  s3    /

!!),##I.rp   c                   > [         R                  " U R                  UR                  5      nU R                  UR                   Vs/ s H  o3U R                  ;  d  M  UPM     sn-   nT" XB5      $ s  snf r|   )r   r  r  rh  )rl   r  new_membershipr  	new_items	DedupLists        rm   __add__9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{**!t.FA{* 	 !;;*s   A0A0)rh  r  rt  )rh  zlist[_T] | Noner  zOrderedSet[_T] | Noner   rv  )r  ra   r   rv  )r  DedupList[_T]r   r  )	r~   r   r   r   r&  r  r   r  r   )r  s   rm   r  r    s@     *.48=&= 2= 	=/< <rp   r  r   c                R   > U TR                   ;   a  T" TR                   U    5      $ U $ r|   )r  )rL  r  rl   s    rm   r  .Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677Hrp   Fc                N   > TT" U 5         R                  [        XU5      5        g r|   )r   r-  )used_by_namer  r}  r  name_to_usersr  s       rm   add_user0Scheduler.compute_dependencies.<locals>.add_user  s'     &./669rp   Nc                    U R                   $ r|   r  r\  s    rm   r  0Scheduler.compute_dependencies.<locals>.<lambda>  s    AFFrp   r  Tzscheduling %s)unbacked_onlyc                    U R                   $ r|   r  r\  s    rm   r  r    s    !&&rp   z not in )rj  mutating_bufr  )r  )r  zscheduling output %sz+scheduling output %s for unbacked symint %sr  'z': r=  r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)rL  r"  r   r"  )FF)
r   r"  r  r  r}  r{   r  r{   r   rv  )Er	   ra   r  r   r>  r  r3  r   r   r@  r+   r?   r   rA  rZ   r   r  r   r   ri  r"   	TensorBoxr  r  get_unbacked_symbol_defsSymbolr  r  r1  rJ  r  r5   r   r  r<  rd  r4   rj  rC  rh  r_   r6   additional_buffer_depsadditional_star_depsr   r   r}  r  r  r  r  r  rc  graph_outputsr  rE  mutated_inputsr  ro  r=  mutated_input_idxsrj  rZ  rR   r  rD  rE  r  compute_dependencies_log)*rl   r   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_nodevalfsr  sym_sizehas_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesrT  r   r   	node_modealt_namerG  out_buf
other_nameis_aliasadd_depr  rZ  r   r   r   	inp_nameslogbufr  rm  r0  r"  r  r  r  s*   `                                      @@@rm   r]   Scheduler.compute_dependencies  s	   	< 	<@ @K?V?V@
 JJD((* MMO	 tyy//??D,,./!3!%!1!1!3I M1i=6P -i 8 -i 8#(=#0C -c 2e ;#0#5#>5=c 2 $1 #m33@3Ki03@3Ki0 "4 + <	 	 !&!				 6		 			
 		 		 		 JL&
 77''..0C#uzz****B9=26 +C.. (+||~S~!Auzz9RA~S!Ann=A6: - " 1 ',#JJD99((( $*		224:J$  *!!U\\2222 /3+:8<25 *   JJDIIotyy1*yy,,,'-II222F(($
 .A> #X&D%EF> <>>K#'#4#4Q#7#C#C#EC --gclln.EF $F . D$$++,1 d&6&6&=&=!>??S?sI..HH	 	 '')3,,./1444 # 1 1 3H%h/HXt,%%ghY&GH -h 7 = ===?dmmo=$)$))5FGGGG'+yy'<'<'>G)0)9)9);J)/
);J (073F3F3H'HH -- '$.1408L!" %ZtD% (? !> !4 *B 7799$--/J$5 !!''4==?D"QR	 K 7777H$6!!''"23 I
 ((..!$00TYY.>.>t.DE / %%d&;&;< '') # 1 1 3H>AllnD))&*:;69llnD))(3//33HhG ++CLLN; !4 *a r 002HII,h7Xz'(*;<= 3
 'ww,,111EA> #X&D&I&I&K%LM> ;==q=(,(9(9!(<(M(M(OHII M ( !
 %Xz'(:K/LM )P F - ))Dqww+++z'$-89&&**40***z'$-89 * ,5QWW5I5I5N5N5P+Q
+QKE4D%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJD'')mCLLN;AAB *  //D''-77d8K8Q8QR 0  !c'--/JC/4{{;{!{;#c%23 ! 0 	c  "))+ &&';< &&'I3OK TX
&
" < !s6   "#l	l2l
!ll*lll
l)	c           
     2  ^ ^ SSK JnJnJnJn  [        [        R                  R                  R                  5       5      nU" T R                  U5      n[        R                  R                  R                  (       d  U" T R                  T R                  5        [        [        R                  R!                  5       5      nU" T R                  UU5      u  n  n	[#        [%        T R                  5      5       V	s/ s H  n	/ / 4PM	     sn	mU H  n
U
R&                  S:X  a  U
R(                  S:X  a  M%  U
R*                  R-                  5       nTU
R.                     S   R1                  U5        TU
R2                     S   R1                  U5        M     SSKJn  U" 5               SU U4S jjn/ n[9        T R                  5       HE  u  nnUR1                  U5        UR1                  U" X[%        T R                  5      S-
  :H  S95        MG     UT l
        g s  sn	f )Nr   )rX  compute_memory_timelineFreeableInputBufferget_freeable_input_bufr   )register_check_mem_opc                N  > TU    S   nTU    S   nX#U/n[         R                  " [        [        R                  " S5      S9[        R
                  R                  R                  R                  / US S9nSTR                  U    R                  5        3Ul        [        TU5      $ )Nr   r   ra  r  c                $    U US   US   US   S.4$ )Nr   r   r   )alivedeadis_final_steprz   )tensor_argsrd  s     rm   r  WScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>  s(    !.q!1 -a 0)6q)9Crp   )r@  rX  r4  nontensor_argsunflatten_args
mem_check_)r+   MemoryCheckKernelr?   r  rc  rk  _inductor_debugcheck_memory_stepdefaultr>  r3  r  r  )step_idxr3  expected_newly_aliveexpected_newly_deadr6  r   rl   step_allocs_deallocss         rm   construct_mem_check_nodeEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node  s     $8#A!#D "6x"@"C2WN''!e)<=yy00BBJJ- D %/tzz(/C/L/L/N.O"PD,T488rp   )r3  )r=  r$  r3  r{   r   r  )r  rX  r+  r,  r-  r   rZ   r   r  r  r>  r  r  r(   rW  r[  r  r  r   
size_alloc	size_freer  r3  
start_stepr   end_step#torch._inductor.runtime.debug_utilsr.  r=  )rl   rX  r+  r,  r-  r  name_to_freeable_input_bufr  buf_info_listr  buf_inforZ  r.  rA  	new_nodesr:  r   r@  s   `                @rm   r  #Scheduler.insert_memory_check_nodes  s   	
 	
 )31773G3G3L3L3N(O"4::|< 	# %%===

D,, *4AGG4L4L4N)O5JJ&
q! $C

O4C
4RH4C
 &H""a'H,>,>!,C//1H !4!45a8??I !2!23A6==hG & 	N	9	9*.	9&	9 	92 	 ,GAtT"(DJJRS@S;SU - 
eC
s   8Hc                  ^	 [         R                  (       d  g/ n[        U R                  5       GH  nSS jm	SnUR	                  5        H  n[        U	4S jUR                   5       5      nU(       a]  [        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        M  SnM     UR                  5       (       + =(       a    U(       + nU(       d  UR                  U5        M  [        R                  SUR                  5       5        [        R                  R                   R                  UR                  5       5        UR"                  R$                   H  nUR&                  U R(                  ;   d  M  U R(                  UR&                     R                  nU Vs/ s H2  oR*                  R                  5       UR                  5       :w  d  M0  UPM4     snU R(                  UR&                     l        M     GM     [-        [        U5      5      U l        U R                   H  nUR/                  5         M     gs  snf )	z 
Remove any nodes without users
Nc                ~    U R                   =(       d+    U R                  5       [        R                  R                  ;   $ r|   )r  r3  rZ   r   r"  )rG  s    rm   can_eliminate_user;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s&    ||Tt}}!'':T:T'TTrp   Fc              3  4   >#    U  H  nT" U5      v   M     g 7fr|   rz   )r   urO  s     rm   r   2Scheduler.dead_node_elimination.<locals>.<genexpr>  s     #M9a$6q$9$99   zremoved dead buffer: %sTzremoved dead operation: %s)rG  r-  r   r{   )r(   use_dcer   r>  r  r   r0  r  r  r3  rZ   r   r  r  r  r   r"  r   r   r   r[  r   r  r-  )
rl   updated_nodesr   active_buffersr   can_eliminater  r0  rR  rO  s
            @rm   r{  Scheduler.dead_node_elimination  s    ~~
 TZZ(DU #N'') ##M399#M M II7HGG++//?%)N * !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22DyyD$4$44 $ 0 0 ; A A',=',!0AT]]_0TAu=((39 3- )8 (=12
 JJD  " =s   
/I(=I(c                
    USL$ )z:Check if store mode requires cross-thread synchronization.Nrz   )rl   rj  s     rm   mode_requires_synchronization'Scheduler.mode_requires_synchronization  s    4rp   c                   ^^^^ [         [           " 5       m[        5       m/ mSUUUU4S jjmU H  nUR                  5        H  nUTU'   M
     M!     U H  nT" U5        M     T$ )z/
Ensure nodes is in topologically sorted order
c                   > U T;  af  TR                  U 5        [        U R                  S S9 H*  nUR                  T;  a  M  T" TUR                     5        M,     TR	                  U 5        g g )Nc                    U R                   $ r|   r  )ds    rm   r  DScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>/  s    affrp   r  )r  r  r  r   r   )rL  r   rJ  rF  seenvisits     rm   rc  2Scheduler.topological_sort_schedule.<locals>.visit,  sa    }!!"6"6<LMCxx|3 ,sxx01	 N
 a  rp   )rL  r_   r   rv  )r   r_   rF  rE  )rl   r>  r   r   rJ  rF  rb  rc  s       @@@@rm   rz  #Scheduler.topological_sort_schedule"  sj     +,.59V*,	! 	! D--/%)T" 0  D$K rp   c           	        U R                    Vs/ s H0  n[        UR                  [        R                  5      (       d  M.  UPM2     nn[        S[        U5      5       Hi  n[        [        X#   R                  5       5      5      n[        [        X#S-
     R                  5       5      5      nX#   R                  [        XTSS95        Mk     g s  snf )Nr   Tr  )r>  r   r   r+   Conditionalr  r   r<  rd  rE  r  r6   )rl   rL  conditional_nodesr:  r	  prev_bufs         rm   r  'Scheduler._enforce_conditional_ordering=  s    zz
!!Z%GAz 	 
 q#/01A%6%9%J%J%L MNLD!2q5!9!J!J!LMNH --TJ 2
s   -C Cc                N  ^  [        5       n[        U[        [        [        [
        [        45      (       a/  UR                   H  nUR                  UR                  5        M      O[        S[        U5       S35      eU 4S jU 5       n[        [        U 4S jU 5       5      5      $ )Nz+get_unmet_dep_nodes is not implemented for .c              3  ^   >#    U  H"  nTR                   U   R                  5       v   M$     g 7fr|   )r[  r5  r  s     rm   r   1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>Z  s(     XZc))#.??AAZrV  c              3  B   >#    U  H  nTR                   U   v   M     g 7fr|   r1  )r   rL  rl   s     rm   r   rn  [  s     Q=at66q9=s   )r   r   r   r  r  r   r  r  r  r   RuntimeErrorr   r  )rl   rg  
unmet_depsr   unmet_dep_opss   `    rm   _get_unmet_dep_nodesScheduler._get_unmet_dep_nodesH  s    &0l
)&"$	
 	
 //sxx( 0 =d5k]!L  YZXJQ=QQRRrp   c                   / n[         R                  U R                  S5      n0 nU R                   HQ  nU R                  U5      n[	        U5      X$'   U H*  nUR                  U/ 5      nUR                  U5        XsU'   M,     MS     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  UR                  U
5        U
 H9  nUR                  U/ 5       H  nX+==   S-  ss'   M     UR                  U5        M;     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  M  U(       a   S5       eU$ s  sn	nf s  sn	nf )zE
Sort nodes by their topological order, return a list of node lists.
r   r   zTopological sort failed!)	rF  fromkeysr>  rt  r   r  r   rh  r  )rl   r  r>  childrenr   r  r   crL  rm  zero_deg_nodesrG  s               rm   r}  !Scheduler._topological_sort_nodes]  s,    djj!,#%JJD,,T2Dd)EKLLb) !   ).@a!@LL(#$LLB/DK1$K 0		! $ -2KKMDMDAQ!VaMND n 444y A Es   E)EE,Ec                j   0 nU R                    Hw  n[        5       nUR                   HB  nU R                  UR                     R                  5       nUR                  U5        X1U   -  nMD     X1UR                  5       '   X2l        My     [        U R                   5       H  u  pbXbl
        Xbl        M     g)z
Populate each node.ancestors
N)r>  r   r  r[  r   r5  r  r3  r   r=  r  r  )rl   name_to_ancestorsr   r   r   dep_node_namer  s          rm   r|  Scheduler.compute_ancestorsw  s    
 9;JJD)3I.. $ 0 0 : K K Mm,}==	 / 2;dmmo.&N  %TZZ0KE"N"N 1rp   c                   0 n0 nU R                    H  nUR                  (       d  SnSnOUR                   Vs/ s H/  nXR                  UR                     R	                  5          S-   PM1     nnUR                   Vs/ s H/  nX R                  UR                     R	                  5          S-   PM1     nn[        U5      n[        U5      nXAUR                  5       '   XRUR                  5       '   XCl        XSl	        M     gs  snf s  snf )z
Populate each node's min/max_input_distance with the depth from graph
inputs, measured as dependency hops before fusion. Nodes whose
dependencies are all satisfied by graph inputs/constants have depth 0.
r   r   N)
r>  r  r[  r   r5  r  r  r3  r  r  )	rl   name_to_min_distancename_to_max_distancer   min_distmax_distr   dep_min_distsdep_max_distss	            rm   r}  !Scheduler.compute_input_distances  s    02/1JJD**
  $66!  7 ))9)9#(()C)T)T)VW6  !  $66!  7 ))9)9#(()C)T)T)VW6  !
 }-}-4<14<1&.#&.#) 
!
!s   6D>6Dc                   [         R                  (       d  g U R                   H  n[        U[        [
        45      (       a)  UR                  5       (       d  [         R                  S:w  a  MI  UR                  5        H?  n[        U[        5      (       a  UR                  5       (       a  M/  UR                  5         MA     M     g )Nhalide)r(   r  r>  r   r   r   rU   cpu_backendr   rr  r   )rl   r   rg  s      rm   r   Scheduler.merge_loops  s    00JJD d]4F$GHHKKMMf&8&8H&D)!%775;L;L;N;N!!# * rp   c                   [        SSSS9   [        S5       H  n[        U5      n[        R	                  SUS-   U5        U R                  USS9n[        U5      n[        R	                  S	US-   UU5        XC:X  d  US:X  d  Mk  [        R	                  S
US-   5          O   [        R                  (       d  [        R                  (       a  U R                  USS9nUsSSS5        $ ! , (       d  f       g= f)z2
Combine eligible nodes into FusedSchedulerNodes.
zScheduler.fused_nodesTrS  ry  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   r  r   r  r  fuse_nodes_oncer(   r  loop_index_inversion_in_fusion)rl   r>  r:  old_lennew_lens        rm   r  Scheduler.fuse_nodes  s     #4QU
 2Ye*  EE
 ,,UU,Ke*  TE	 %A$$Eq1u ' , 1188,,UT,J;
 
 
s   A3C%AC%%
C3c                    / nU R                    H:  nUR                  [        U[        5      (       a  UR	                  5       OU/5        M<     Xl         g)z1
Unpack GroupedSchedulerNode into regular nodes.
N)r>  r  r   r  r  )rl   rK  r   s      rm   r  Scheduler.process_grouped_nodes  sF     .0	JJD!+D2F!G!GdV  
rp   c                    [        U5      S:  d   eUS   R                  5       nX l        U R                  U5      n[	        SSSS9   UR                  U5      sSSS5        $ ! , (       d  f       g= f)k
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
r   benchmark_fused_nodesTcompile_time_autotune_time_us)rT  dynamo_compile_column_usN)r   r   r  r  r   r  )rl   r>  rc  r  s       rm   r  Scheduler.benchmark_fused_nodes  sm     5zA~~q$$&$""6*#"&%D

 007
 
 
s   A""
A0c                    [        U5      S:  d   eUS   R                  5       nX@l        U R                  U5      n[	        S5         UR                  XUS9sSSS5        $ ! , (       d  f       g= f)r  r   generate_kernel_code_from_nodeshint_overrideN)r   r   r  r  r   r  )rl   r>  benchmark_kernelr  rc  r  s         rm   r  )Scheduler.generate_kernel_code_from_nodes  si     5zA~~q$$&$""6*;<::} ;  =<<s   A!!
A/c                    X l         U R                  U5      n[        S5         UR                  U5      sSSS5        $ ! , (       d  f       g= f)r  benchmark_codegened_moduleN)r  r  r   r  )rl   modulerc  r  s       rm   r  $Scheduler.benchmark_codegened_module  s=     %""6*6755f= 877s	   >
Ac                V   [         R                  R                  nU(       d  g[        R	                  SX5        UR
                   H  nUR                  5       n[        USS5      (       a$  XB;  d  [        U[        R                  5      (       a  MK  UR                  nX$   n[        U[        R                  5      (       a'  UR                  UR                  5        UR                  n[        U[        R                  5      (       d  M  Xe:w  d  M  [        R!                  SUUU5          g   g)z
Check if selecting a Triton template would cause layout conflicts.
Returns True if there's a conflict and we should fall back to ATen.
FzNode %s has constraints %sr@  NzOLayout conflict detected for %s: template expects %s but layout is frozen to %sT)rZ   r   buffer_layout_constraintsr  r  rb  r3  r  r   r+   ReinterpretViewr@  FlexibleLayout freeze_layout_with_exact_stridesr&  FixedLayoutr  )rl   
multi_nodeconstraintsinpinp_namer@  expected_layouts          rm   !_has_layout_conflict_for_template+Scheduler._has_layout_conflict_for_template"  s     gg77		.
H$$C||~H C400.c2#5#566ZZF)3O&""3"344 44_5K5KL&"..11o6Oe#	 7 %: rp   c           
     0   [        U R                  5       GHe  u  p[        U[        5      (       d  M  [        UR                  [
        R                  5      (       d  MH  UR                  n[        R                  R                  (       d  UR                  5       u  pEO [        S UR                  5        5       5      n[        U[        R                  R
                  R                  5      (       a  U R!                  U5      (       a  UR                  5        H:  n[        U[        R                  R"                  R$                  5      (       d  M8  Un  O   [        W[        R                  R"                  R$                  5      (       d   S5       e[        U[        R                  R
                  R                  5      (       a  [        R&                  (       a  0 nXGS'   [        R&                   Hm  nUR                  US9n	U	R)                  5        V
Vs0 s H  u  p[        U
[        5      (       d  M  X_M      nn
n[+        UR)                  5       S S9S   nXgU'   Mo     UR                  R-                  U5        OUR                  R/                  U5        GM  [
        R0                  R3                  UR4                  5         UR7                  5       nSSS5        WR8                  n[        U[
        R:                  5      (       d   eUR8                  n[        U[
        R<                  5      (       d   eUR>                  (       a  [A        XR>                  5        URB                  Ul!        U RE                  XX5        GMh     gs  snn
f ! , (       d  f       N= f)aP  
Finalize a backing choice for MultiTemplateBuffers which did not already have a
choice finalized through fusion. In the case of an extern choice, this will result
in replacing the SchedulerNode.

If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
will force completion of compilation and benchmarking.
c              3     #    U  H<  n[        U[        R                  R                  R                  5      (       d  M8  Uv   M>     g 7fr|   )r   r  r  r  ExternKernelCaller)r   timings     rm   r   <Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>`  s7      *E) & % @ @ S S  #F*Es
   7A	AzZNo extern kernel detected to fallback to when layout constraints fail for Triton templatesNr  c                    U S   $ r+  rz   r\  s    rm   r  ;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>  s	    qQRtrp   r  r   )#r=  r>  r   r   r   r+   MultiTemplateBufferr(   r  %force_extern_kernel_in_multi_templateget_min_choicer<  choice_timingsr  r  r   r  r  r  multi_kernel_hintsrh  r  finalize_as_triton_callersfinalize_as_triton_callerr  current_originsr  output_noder   
StorageBoxOperationBufferorigin_noder:   r@  _replace_node)rl   r:  r   r  min_node_unfusedr  choicecallershinttimingsr  rm  triton_timingsout_tensorboxout_storage
out_buffers                   rm   r  )Scheduler.finalize_multi_template_buffersM  s    !,GA$..:		2114 4 "YY
**PP*4*C*C*E'$a'+*4*C*C*E	($ $OO&&?? 
 ==jII&0&?&?&AF) & % @ @ S S    4: 0 % 'B  *"EOO$D$D$W$W     y   $OO&&?? 
 00NP(8 %+$=$=D&0&?&?d&?&SG -4MMO.,;DA#-a1I#J !%,; + .
 &))=)=)?^%TUV%WF,2DM %> 		<<WE		;;<LMYY..z/A/AB$4$@$@$BM C+00!+r}}====(--
!*b.@.@AAAA))&}6L6LM$.$5$5
!"":1CY -h. CBs   N
/N
?N
N	c                  ^ [        X!5        U R                  U5      nXPR                  U'   XPR                  UR	                  5       '   XPR
                  UR	                  5       '   0 m[        R                  " UR                  R                  UR                  5       HA  nU R                  R                  UR                  S 5      =n(       d  M2  UR                  TU'   MC     SU4S jjnU" UR                  5      Ul
        U" UR                  R                  5      UR                  l	        [        UR                  5       UR                  5       5       H2  u  pXR                   U
R	                  5       '   U
R"                  U	l        M4     UR$                  Ul        UR&                  Ul        UR(                  Ul        UR*                  Ul        g )Nc                .   > [        U4S jU  5       5      $ )Nc              3  D   >#    U  H  oR                  T5      v   M     g 7fr|   )r  )r   r   r  s     rm   r   ?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>  s     Kdsjj)9::ds    r   )r  r  s    rm   rename_deps,Scheduler._replace_node.<locals>.rename_deps  s    KdKKKrp   )r  r  r   r  )r  rq  r>  rJ  r3  r1  r
  r  r   r   r  r  r  r   rG  r  r[  r0  r  r  r   r  )rl   r  r  r:  r   new_scheduler_noder   	real_namer  new_outold_outr  s              @rm   r  Scheduler._replace_node  s{    	"*9!77
C*

1-?$--/*3E0 ??4#3#3#9#94;R;RSC 3377$GGyG.1hh + T	L 1<111
- 0;**000
&&, !$**,d.>.>.@!
G 4;W--/0#MMGM	!
 (,~~$'+~~$'+~~$(,%rp   c                &    [        S U 5       5      $ )Nc              3    #    U  H  n[        UR                  S 5      =(       a_    UR                  SL=(       aJ    [        UR                  R                  S5      =(       a#    UR                  R                  R                  S:H  v   M     g7f)r   Nscatter_moderk  )rW  r   r   r  rK  s     rm   r   ,Scheduler._any_atomic_add.<locals>.<genexpr>  sp      

 	 AFFF# 9d"9^49 ((L89 s   B	B)r   )rl   	node_lists     rm   _any_atomic_addScheduler._any_atomic_add  s     

 
 
 	
rp   c                (   U R                  USUS9n[        R                  " U5      n[        R                  R
                  R                  5       nUR                  5       (       d  S nXd4$ UR                  SUS9n[        U[        5      (       d   eXd4$ )NT)r  r  triton_)kernel_namesource_code)r  r   loadr  r  async_compileAsyncCompileuse_process_poolr   r   r   )rl   r>  r  src_codemodr  futs          rm   compile_kernelScheduler.compile_kernel  s     77D 8 
 x(55BBD--//C
 z  &&9(&SCc<0000zrp   c                  ^ ^^^^^^^^^ ^!^"^#^$^%^&^'^(^)^*^+ [        S TT4 5       5      n[        R                  (       d  U(       d  [        R	                  S5      $ TR                  5       (       a-  [        TR                  5       [        R                  5      (       a*  TR                  5       (       d  TR                  5       (       a  [        R	                  S5      $ TR                  5       nUS   R                  5       mT(       d   eTR                  S:X  a)  [        R                  S:w  a  [        R	                  S5      $ TR                  5       n[        [         R"                  " XE5      5      nT R%                  U5      (       a  [        R	                  S5      $ SSKJm  [+        TT5      m+US   R                  5       mTc   eSUU4S jjm%U(       Ga}  [        S	 TT4 5       5      (       Gac  TR                  5       SLmT(       a  TR                  5       OTR                  5       m*[        T*[        R,                  5      (       d   eT R/                  T*5      (       a  [        R	                  S
5      $ 0 m$/ m"[        R0                   GHI  nT*R3                  U5      m[5        TR7                  5       S S9 H  u  p[        U[8        R:                  R<                  R>                  5      (       d  M:  T*RA                  U5         T"RC                  U/T RE                  XhRF                  S9Q75        SSS5        M     [I        S5      n
Sn0 nT" HW  u  pn Ub  URK                  5         T*RA                  U5         T RY                  UT5      u  nnUX'   UU
:  a  Un
UnSSS5        MY     UT*RZ                  U'   [        U[\        5      (       d   eUT$U'   GML     [        R^                  m[a        S T*Rb                   5       5      n[e        5       =(       a!    T(       + =(       a    U[        Rf                  :*  m#[I        S5      [I        S5      sm'm(Sm&T#(       dP  T*R3                  5       mT*Ri                  5       u  m&m'[5        TR7                  5       [j        Rl                  " S5      S9nOT*Rb                   Vs/ s H  nUS4PM	     nnT(       a-  T(       a  T Ro                  U5      OT Ro                  U5      u  m(nO9T(       d  [        R	                  S
5      $ TRq                  5       m([s        TTT(5      m)SSK:J;n  / m"SnU H  u  nn[        U[\        5      (       d  M  T(       d-  [y        US5      (       a  URz                  T*Rz                  :w  a  MQ  T(       a  UT'T(-   :  a    O_US-  nU[        Rf                  :  a    ODT*RA                  U5          T"RC                  U/T RE                  U5      Q75         SSS5        M     [}        T"5      S:X  a  [        R	                  S
5      $ SUUUUU"U#U$U%U&U'U(U)U*U 4S jjn[        R                  UT"S   S   5      $ T RE                  U5      mT RE                  U5      m!T RE                  U5      m SUUUU U!U%U U+4S jjn[        R                  UT S   S9$ ! , (       d  f       GM  = f! [L         aT  n[N        RQ                  [R        RT                  5      (       a   [N        RW                  ST(       d  SOSU5         SnAGM  SnAff = f! , (       d  f       GM  = fs  snf ! U a     SSS5        GM  f = f! , (       d  f       GM"  = f)o
If config.benchmark_fusion is False, always return True.
Otherwise, return True if fusion can brings speedup.
c              3     #    U  HD  nUR                  5       =(       a(    [        UR                  5       [        R                  5      v   MF     g 7fr|   )rr  r   r/  r+   r  rK  s     rm   r   .Scheduler.speedup_by_fusion.<locals>.<genexpr>  sD       
 $ MMO J1..0"2H2HIJ#s   AATr   ra  r   CompilationErrorNc           
     z  > [         R                  [        R                  5      (       a  XU-   :  aE  [         R	                  STR                  5       TR                  5       [        X-   U -  S 5      5        g [         R	                  STR                  5       TR                  5       [        XU-   -  S 5      5        g g )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  rE  rE   rF   )ms_fusedr  r  r   r   s      rm   
log_fusion/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}}55Ci'$$S..0..0"syH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rp   c              3  D   #    U  H  oR                  5       S Lv   M     g 7fr|   r3  rK  s     rm   r   r  )  s      %
7E!!-~s    Fc                    U S   $ r+  rz   r\  s    rm   r  -Scheduler.speedup_by_fusion.<locals>.<lambda>=  s	    aPQdrp   r  r  infException in compiling %s: %sr@  rB  c              3  B   #    U  H  n[        U[        5      v   M     g 7fr|   )r   r   )r   ry  s     rm   r   r  h  s      %ASA
1677ASrR  r   )	CantSplitallowed_prologue_inpsc                   > [        S5      n S n0 nT(       aw  T(       a  [        T[        R                  5      (       d   eTR	                  5       mTR                  5       u  mmT Vs/ s H  nUS   T;   d  M  UPM     snm[        TU4S jS9mT GH\  u  pEn Ub  UR                  5       nO'T(       d  UR                  nUR                  5         OS n T(       a=  TR!                  U5         TR#                  UT5      u  pXU'   X:  a  U	n UnS S S 5        M  TU:H  =(       d    TT-   TU   T-   :  nU(       d  M  U(       d  M  UR                  5         UR$                  (       a  UR&                  (       d   eUR$                  S   nUR&                  nUR(                  n[+        TTUR&                  UUUR,                  R.                  [0        R2                  " T5      5      nU(       d  GM[  Un  O   T(       a
  T" U TT5        T(       a	  U TT-   :  aW  UbT  [4        R6                  (       a  UTS '   TR9                  T5        OTR;                  U5        T(       a  UTR<                  S '   gg	s  snf ! [         aT  n[        R                  [        R                  5      (       a   [        R                  ST(       d  SOSU5         S nAGM8  S nAff = f! , (       d  f       GMP  = f)
Nr  r   c                   > TU S      $ r   rz   )r  r  s    rm   r  KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s    nQqT&:rp   r  r  r@  rB  TF)r  r   r+   r  r  r  r  rF  r  
precompiler  r  r  r  r  r  swap_as_triton_callerr  	launchersn_regsn_spillsr  bmreqr  rC   r  r(   r  r  r  _choice_timings)min_ms_fusedms_fused_choicenew_timings
fut_choicer  ri   	mod_fusedresr  r  pathfusible_choicecompiled_kernelr  r  should_fuse_epiloguebench_epiloguer  rc  r5  future_choicesget_choice_timings_async hint_override_best_fusion_choicer  
min_choicer  r  	ms2_fusedr  rl   s                   rm   benchmark_when_ready9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s   $U|"& +%*ZAWAW*X*XXX%/%>%>%@N&0&?&?&AOJ
 +9&*8J%a=N: #*8&N &,&:&N
 2@-FI!!-"(--/C!/"+"3"3CNN,"&C &'==fE-1-L-L ) &.NH
 3;/'6/728 FE '&0 N"Sy>&+AI+MM '
 3>>"--/#&==V]]B#B.1mmA.>O+:+A+AL-<-E-EN3A # # & , . & 6 6 0 7 7 ?40  4328 %y 2@| "|S#6 ',#)*D%100AP8>"==<
 #<<_M%;F
2248 }&. % !%227==AA&,, ?2A
z !
 !! FEs=   'I+8I+I0-#I0I0,"K0
K:AK		K
K!	c                 (  >^^^^^^ SSK Jn    TS   TS   TS   4 H  nUc  M  UR                  5         M     TR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gT" TTT5        [        S5      (       a[  TTT-   :  aR  TT4TR                  ;  a@  TR                  R                  TT45        [        S5      R                  UUUUUU4S	 j5        TTT-   :  $ ! U  a     gT	 a  nS
[        U5      ;   a   S nAge S nAff = f)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $   > TT TTTTTT T-   -  S.$ )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratiorz   )r  r  r  path1path2
path_fuseds   rm   r  r  `  s&    053605365?8@3;sSy3I%rp   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr   rF  r  r  isinfr    r  r  r   r  r"  )r   r  r  r  r  r  r*  r+  r,  r  rc  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  rl   rW  s      @@@@@@rm   r  r  -  s   A *!,)!,/2 
 ?JJL  "&!@!@)!,"JC
 zz#CD$!%!@!@)!,"JC
 zz#DE$+/+J+J/2,(Hj
 zz(++CD$xc2 0>>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E* AE* 5;E* 1;E* -A<E* *F2F7FFFrw   )r  r  r  r  r  r  r   rv  rw  )@r   r(   benchmark_fusionrd   rt   rr  r   r/  r+   TritonTemplateBufferrx  r   r   r   r  r  r
  r  r  triton.compiler.errorsr  r  r  r  r  r  r  rh  r  r  r  TritonTemplateCallerr  r   r  r  r  rF  r  r  r  r  r  r  r  r  r   benchmark_epilogue_fusionr   choicesr    max_epilogue_benchmarked_choicesr  operator
itemgetterr  r  r  r4  r  rW  r  r   rx   ),rl   r   r   is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  r  r  r  ri   r  r  r  r  num_triton_callerschoice_timings_iterry  r+  r  triton_choicesunfused_timer  r  r  r  rc  r5  r0  r1  r2  r  r  r  r  r  r  r  r  r  rW  s,   ```                       @@@@@@@@@@@@@@@@@@rm   speedup_by_fusionScheduler.speedup_by_fusion  s       
 U^ 
 

 &&/@$$T** u668":Q:QRR!!!!  $$T**oo'Q**,v ;;%F$6$6($B$$T**oo'y{HI
 00$$T**;u% #..0!!!	 	"  %
8=u~%
 "
 "
 $557tCO # ''),,. 
 j"*@*@AAAA55jAA#((//  - QSN!'!:!:!+!:!:=!I!'(<(<(>N!SIF% @ @ U U  !#99&A&-- &!%!4!4$3CWCW "5 "" BA "T  %U|CG 1?-FI
!!-"MMO $99&A)-)H)H%v*$ /7+#l2+3L.4O BA 2@( =H
**=9!/3KLLLLBQ0?U ";X $==N!$ %AKASAS% "
 )* R&&R&&*Q*QQ % U|U5\HC15J+!+!:!:!<",";";"=
C&,"((*0C0CA0F'# 8B7I7I&J7I!1v7I#&J ' ..{;33K@ U '',,U33224<UE3O	 ?PRNN(;$!&*BCC ((?@@44
8X8XX!lcCi&?!#!F$K$KK55f=!&--#Kd&9&9/&JK >=/ )<B >"a'#((//j! j! j!X  --$nQ&7&:  !% 3 3K @ $ 3 3K @&*&9&9/&J#F FP  --09PQR9S .  g	 BA" % !%227==AA&,, ?2A
z !
 !! BAF 'Kb % ! ! >=
! >=sa   -Z,Z?7$\ 3\3]$\8,
Z<?
\	A\\ 
\08]>]
]]
]	c                <    U R                   UR                  5          $ )z0Look up the node in Scheduler name_to_fused_node)r1  r9  r  s     rm   r  Scheduler.get_fused_nodey  s    &&t':':'<==rp   c                P   [         R                  SUR                  5       UR                  5       5        UR                  5       nUR                  5       U:X  d   eU R	                  U5      R                  X5      nUR                  U5        UR                  U5        UR                  U5        U R                  R                  UR                  5        Vs0 s H  ofR                  5       U_M     sn5        U R                  R                  U5      nUb  XpR                  U'   U$ s  snf )Nzfusing %s with %s)r  r  r3  r   r  rt   r,  r  r1  r  r   r~  r  )rl   r   r   ra  rc  node3rL  stream1s           rm   fuse_two_nodesScheduler.fuse_two_nodes}  s     	,enn.>@PQ!!#!V+++  (--e;5!5!&&U__EV'WEV

e(;EV'WX %%))%0)0& (Xs   D#c                    U R                  X5      (       a5  U R                  X5      (       d  U" 5       (       a  U R                  XU5        ggNTF)r  will_fusion_create_cyclerK  )rl   r   r   
speedup_fnra  s        rm   fuse_if_speedupScheduler.fuse_if_speedup  sA     MM%''11%??k:rp   c                   U(       Ga  / n0 n[        5       nU GH(  nXa;   a  [        X   5      S:  d   eX   R                  S5      n[        X   5      S:X  a  UR                  U5        UR	                  5       u  pX:X  a  [        X5      (       d   eUn
OX:X  d   e[        X5      (       d   eU	n
U R                  U
5      U
La  M  UR                  (       a3  UR                  R                  nUc   eUR                  U5        Xv4XK'   M  U R                  XUR                  U5      (       d  GM  UR                  U5        GM+     [        U5       Hq  nXK   u  p|U R                  U R                  UR                  5      U R                  UR                  5      UR                  U5      (       d  M`  UR                  U5        Ms     U H  nUR                  U5        M     U(       a  GM  gg)z
Evaluate pending template fusions for a set of fusion candidate nodes.
The fusion candidate nodes are pointwise nodes as potential epilogue
or prologue fusions
r   r   N)r   r   r  r  r   r?  rB  r  ri   r   rQ  rg   r   r   r   )rl   template_fusion_candidatesra  template_futuresfuture_to_pending_fusionfusions_to_remove	candidatepending_fusionr   r   rA  fcands                rm   "_evaluate_pending_template_fusions,Scheduler._evaluate_pending_template_fusions  s    )-/  % @J|7	;6ABaGH "<!F!J!J1!M1<=B%)))4->>@%-e;;;;$)M ----e;;;;$)M &&}5]J!((&--44A=(=$++A.3A2M,/ ++n&@&@+  *--i8K 8P ""23'?'B$''''(<(<=''(<(<="..	  &))$/ 4 '*..q1 'q )(rp   c                  ^ ^^       SUUU 4S jjnU GH\  u  pxU" Xx5        T R                  U5      nT R                  U5      n[        Xx5      (       a  Xx4T R                  ;   a  MS  T R                  XxU5      (       d  Ml  T R	                  Xx5      (       a  M  T R                  Xx5      n	U	R                  b  [        U	R                  UUU	R                  S9n
[        Xx5      (       aW  Xx4T R                  ;  d   eT R                  R                  Xx45        [        Xx5      nX;  a  / X;'   X;   R                  U
5        O
U
TU'   U
TU'   GM6  U	R                  (       d  GMJ  T R                  XxT5        GM_     g )Nc                  > TR                  U 5      T;   d  TR                  U5      T;   Ga  TR                  TR                  U 5      TR                  TR                  U5      5      5      nUc   eUR                  5       u  p4UR                  nTR	                  US 5        TR	                  US 5        TR                  U5      UL d   eTR                  U5      UL d   eU" 5       (       a  TR                  X5      (       a  GM  TR                  X4T5        TR                  U 5      T;   a  GM  TR                  U5      T;   a  GM  g g r|   )r  r  r   rg   r  rO  rK  )	r   r   rY  	node_key1	node_key2
is_speedupra  pending_fusionsrl   s	         rm   resolve_pending_fusions<Scheduler._try_fusion_pairs.<locals>.resolve_pending_fusions  s3   
 ##E*o=&&u-@!0!4!4''.#''(;(;E(BC" &111'5'F'F'H$	+77
##It4##It4**95BBB**95BBB!||t'D'DU'R'R##I+F+ ##E*o=&&u-@@rp   )rg   r   r   ri   r  )r  rD  rx  r  rO  rD  rg   r   ri   r  rF  r   rf   rK  )rl   possible_fusion_pairsrc  template_fusion_nodesra  r  rd  r   r   
fusion_resrY  template_pw_nodes   ` ` `       rm   _try_fusion_pairsScheduler._try_fusion_pairs  s   	G$	G$	G 	G 	G8 2LE $E1''.E''.E #500Nd&@&@@}}. 33EAA!33EA
))5%2$.$:$:##)00	&N *%77 %~T5O5OOOO2266~F+B5+P(+HFH1C-?FF~V1?.1?.!--##E+>W 2rp   c                N   [        5       nUR                  5        H  nUR                  5       u  pVUR                  nXs;   d  [	        XV5      (       a  M8  UR                  U5        U R                  U5      UL d   eU R                  U5      UL d   eU R                  XVXq5        M     g r|   )r   r   r   rg   rD  r  r  rQ  )rl   ra  rc  seen_pair_speedup_fnrY  r`  ra  is_speedup_fns           rm   _finish_pending_fusions!Scheduler._finish_pending_fusions=  s    
 @J| .446N#1#B#B#D I*66M48J9 9  $$]3&&y1Y>>>&&y1Y>>>  }R 7rp   c           
        [        U VVs/ s H  u  p4[        X45      (       d  M  UPM     snn5      n/ nU H@  u  p4[        X45      (       a  XE;   a  UR                  X445        M.  UR                  X445        MB     U$ s  snnf r|   )r   r?  rB  r   )rl   possible_fusionsdeferred_prologue_fusionsn1n2epilogue_template_nodesnew_possible_fusionss          rm   _handle_template_overlap"Scheduler._handle_template_overlapU  s     #-.M.FB2DR2LR.M#
  "&FB!"))b.K)00":$++RH5	 ' $# Ns
   A=
A=
c                   U R                  U5        [        U5      n[        R                  [        R
                  5      (       aB  [        R                  S5        U H'  n[        R                  SUR                  5       5        M)     0 n0 n/ nU R                  UU5      n[        R                  (       d  [        R                  (       a  U R                  X5      nU R                  UUUUU5        U R                  X55        U R                  Xc5        UR!                  5         U(       a&  U R                  UUUUU5        U R                  Xc5        [#        US S9nU R%                  U5      nU$ )z
Combine eligible nodes into FusedSchedulerNodes.

This relies on two key functions to control the logic:
    - self.can_fuse(): checks if a fusion is legal
    - self.score_fusion(): assigns priority to a given fusion
zfuse_nodes_once, candidates:z  %sc                    U R                   $ r|   r  r\  s    rm   r  +Scheduler.fuse_nodes_once.<locals>.<lambda>  s    !++rp   r  )r2  r   r  r  r  r  r  r  get_possible_fusionsr(   r  r  rx  rj  ro  r\  clearr  rz  )	rl   r>  r  ra  r   rc  rg  rs  rr  s	            rm   r  Scheduler.fuse_nodes_oncei  s^    	!!%( '""7==11;<#  )=)=)?@ $  	
 OQ  	"  44

 ##v':':#<<   	!	
 	$$[B//0ES##%$"")%  334IW{(=>..u5rp   c                z   [        U R                  5      nSn[        U R                  5      n[        R	                  SU5        [        [        R                  U 5      5       GH[  u  pV[        R                  U5      n[        U5      S:  a  M,  Ub  X1:  a    GO)U R                  U5      (       d  [        R	                  SU5        Me  US-  n[        R                  S:  n[        US   R                  USUS9n[        R                  S	[        U5      U5        U H  n	UR                  U	5        M     UR                  U5        U R                   R#                  UR%                  5        V
s0 s H  oR'                  5       U_M     sn
5        U R(                  R+                  US   5      nUc  GMM  XR(                  U'   GM^     [-        US
 S9U l        U R/                  U R                  5      U l        [        R                  SUU[        U R                  5      5        U R1                  U R                  5        gs  sn
f )z
Groups parallel nodes
r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                    U R                   $ r|   r  r\  s    rm   r  5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>  s    q{{rp   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r>  r   r  r  r=  rC  r  rx  speedup_by_combo_kernelr(   r  r+  r  r,  r  r1  r  r   r3  r~  r  r  rz  r2  )rl   rV  ra  rm  num_nodes_orignumr  r`  r|  r   rL  streams               rm   r  #Scheduler.create_combo_kernel_nodes  s    !,TZZ		FU'&DDTJ
NC 3CCINI9~!'E,@//	::		EsKQJE$;;a?O4!&&*. /	K HHBI
 """4( "OOK(##**4?4I4I4KL4Kq{*4KL
 ((,,Yq\:F!39##K0E
F K-BC
33DJJ?
R

O		
 	!!$**- Ms   )H8
c                L    U H  nUR                  U R                  5        M      g r|   )r2  r1  )rl   r>  r   s      rm   r2  Scheduler.prune_redundant_deps  s     D%%d&=&=> rp   c                  ^ ^^
^ / m
[         [        [        [        4      " 5       mSUU
UU 4S jjn[        R                  " [
        5      nU HE  nT R                  U5      (       a  M  UR                  5        H  nXF   R                  U5        M     MG     UR                  5        H  nU" U5        M     [        R                  (       ak  [        R                  " [
        5      nU H,  n[        USS5      n	U	(       d  M  X   R                  U5        M.     UR                  5        H  nU" U5        M     T R                  T
5      m
T
R                  T R                  SS9  [         R#                  S[%        T
5      5        T
$ )zN
Helper to find all legal fusion opportunities, sorted by self.score_fusion()
c                  > [        U 5       H  u  pU US-   US-   [        R                  -     H  nX#4nUT;   a  M  TR                  U5        TR	                  X#T5      (       a  TR                  U5        MI  UR                  5       (       d  UR                  5       (       d  Mu  TR	                  X2T5      (       d  M  TR                  X245        M     M     g r+  )r=  r(   )max_fusion_buffer_group_pairwise_attemptsr  r  r   rr  rx  )	r>  node1_indexr   r   r  r  rr  rb  rl   s	        rm   check_all_pairs7Scheduler.get_possible_fusions.<locals>.check_all_pairs  s    &/&6""!Ok'FF'GE
 !.Cd{ HHSM}}U3CDD(//4++--1A1A1C1C&6J J )//?! '7rp   r   NT)r  reversezfound %d possible fusionsr>  rS  r   rv  )r   r   r_   r  r   r  unfusable_noder   r   r   r(   aggressive_fusionr  *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  r   )rl   r>  r  r  buffer_names_groupingr   r   node_groupinggroup_groupingr   rr  rb  s   ` `       @@rm   r}  Scheduler.get_possible_fusions  sV    % 13D DEFH	@ 	@( !, 7 7 =D""4((--/%*11$7 0 
 399;MM* < ##(44T:Ngt45")006  "0!6!6!8. "9  JJ
 	$"7"7F4c:J6KLrp   c                  ^ ^^^^ [         [           " 5       mSUUUU U4S jjmUR                  5       R                  R	                  5       UR                  5       R                  R	                  5       -  mUR
                  R                  R	                  5       UR
                  R                  R	                  5       -  T-
  m[        UU 4S jT 5       5      nU(       a  [        X5      " S5        U$ )zf
Finds whether there's a path from node1 to node2 (or vice-versa)
caused indirectly by other fusions.
c                ,  > [        U [        5      (       a~  U T;  ax  TR                  U 5        U R                  5       R	                  T5      (       a  g[        TU R                  -  5      =(       d#    [        UU4S jU R                  T-
   5       5      $ g)NFc              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fr|   rp  r   rL  
found_pathrl   s     rm   r   IScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>:  s,      H!DA #4#:#:1#=>>!D   "%)r   r   r  r   issubsetr{   r   r   )r   combined_ancestorscombined_namesr  rl   visiteds    rm   r  6Scheduler.will_fusion_create_cycle.<locals>.found_path)  s    $ 233G8KD!++-667IJJ !   ?@ C H!%2D!DH E  rp   c              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fr|   rp  r  s     rm   r   5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>H  s&     WDVqJt66q9::DVr  zwill create cycler   )r   r   r   _dictr  r   r   r  )rl   r   r   cycler  r  r  r  s   `   @@@@rm   rO  "Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWWe#$78rp   c                  ^ ^ SSK Jm      SU 4S jjnU" U5      nU" U5      n[        U4S jU 5       5      n[        U4S jU 5       5      nUR                  U5      nSn	U H  n
 U	[	        U
S   5      -  n	M     T R                  X5      n[        R                  R                  R                  U	S	U-  5      (       a  g
g! [
         a       gf = f)a  
Return true if fusing the two nodes can potentially increasing peak memory.

The implementation is more like a heuristic since we don't really know if we are at peak
or not when trying to fuse these two nodes. The order of nodes may change later which makes the
peak memory estimation hard.

Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
1. find all buffers read by each node with a single user. These buffers are supposed to
   be reused if we don't fuses these 2 nodes
2. find the intersection of these buffers for the two node and sum the total buffer size.
   If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
   Note that the extra memory allocation is not necessarily causing peak memory increase.
   This is just a heuristic.

We return true only if the saving for fusion can not trade off the extra memory allocation.
r   )buffer_reuse_keyc                P  > / nU R                   R                   H  nTR                  R                  UR                  5      nU(       d  M1  [        UR                  5      S:X  d  ML  UR                  R                  5       (       d  Mm  UR                  UR                  5        M     U$ r+  )
r   r   r[  r  r   r   r0  r   has_tensor_outputr   )r   r  rM  r   rl   s       rm   _find_single_user_inputsKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputsd  sw     F&&,,&&**277333syy>Q.3883M3M3O3OMM#((+ - Mrp   c              3  4   >#    U  H  nT" U5      v   M     g 7fr|   rz   r   r   r  s     rm   r   <Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>r       #S]c$4S$9$9]rT  c              3  4   >#    U  H  nT" U5      v   M     g 7fr|   rz   r  s     rm   r   r  s  r  rT  r   r   Fr  T)r   r_   r   zlist[ir.Buffer])r  r  r   intersectionr$  r  r  rZ   r   r   statically_known_gt)rl   r   r   r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr  s   `           @rm   can_fusion_increase_peak_memory)Scheduler.can_fusion_increase_peak_memoryM  s    * 	6	#		 1707##S]#SS##S]#SS*77G$C3s1v;. % ,,U:	 77//iPP  s   (C
CCc                   [        UR                  5        Vs/ s H  oDR                  5       PM     snUR                  5        Vs/ s H  oDR                  5       PM     sn-   5      n[        S UR                  R                   5       5      n[        S UR                  R
                   5       5      nXv-  n[        5       n	UR                  R                   HA  n
U R                  U
R                  U5      (       d  M&  U	R                  U
R                  5        MC     [        S UR                  R
                   5       5      [        S UR                  R
                   5       5      -  n[        S UR                  R                   5       5      [        S UR                  R                   5       5      -  nX-
  nX-
  nX-  n[        U5      U:  $ s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   EScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s     &T;SCxx;Sr  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r         %R:Q3hh:Qr  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r    s      $
 7HH 7r  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r    r  r  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r    s      %
 8HH 8r  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r    s     D+CCxx+Cr  )
r   r   r3  r   r  r   $can_buffer_be_removed_through_fusionr   r  r   )rl   r   r   	thresholdr   fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionre  all_read_namesall_write_namesunique_readsunique_writesunique_io_bufferss                   rm   (fusion_prevent_too_many_reads_and_writes2Scheduler.fusion_prevent_too_many_reads_and_writes  s    &).):;):]]_):;+0??+<=+<4}}+<=>
 '&T5;L;L;S;S&TT%%R%:K:K:Q:Q%RR'7'K$ :D%**11I88 0  .11)..A	 2 $ $
 % 1 1 7 7$
 
C5+<+<+B+BCCD
 % %
 % 1 1 8 8%
 
D5+<+<+C+CDDE
 &D (G )8$%	11M <=s   GG
c                    [        [        UR                  UR                  -
  5      [        UR                  UR                  -
  5      5      nUS:  $ )a  
This function prevents fusion for nodes that can increase memory
footprint. This problem is more common in horizontal fusion, where nodes
that are far apart in the original order get fused, lengthening the live
intervals of tensors. This is very evident in models with activation
checkpointing, where the recomputed nodes from different checkpointed
regions get fused and significantly increase the memory footprint.

The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.

A better but difficult to implement heuristic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
node changes live intervals, and re-computing live intervals and peak
memory after each fusion can introduce large compilation overhead.
@   )r  r  r  r  )rl   r   r   proximity_scores       rm   are_long_distant_nodes Scheduler.are_long_distant_nodes  sE    * %//12%//12
 ##rp   c                (   0 nUR                   R                  5        Vs0 s H  oUR                  U_M     nnUR                   R                  5        Vs0 s H  oUR                  U_M     nnU GH  n[        R                  R                  U5      n	Xh   n
Xx   n[        U
[        5      (       a  [        U[        5      (       d  S[        U
5       S[        U5       3XH'   Ms  U
R                  5       UR                  5       :w  a(  SU
R                  5        SUR                  5        3XH'   M  [        U
R                  5      [        UR                  5      :w  a  SXH'   M  U
R                  5       nUR                  5       nX:w  a  SU SU 3XH'   GM!  U
R                  5       UR                  5       :X  a  SU
 SU 3XH'   GMP  Sn[        U	[        R                  5      (       d  SU	R                    3nS	U
 SU S
U 3XH'   GM     [#        U5      $ s  snf s  snf )ze
Try to decide reasons why fusion fail due to no shared memory even though
there are common buffers.
znot MemoryDep: r   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r  r   rZ   r   r  r   r4   r   r   rY   r   
get_offsetnormalize_with_stride_orderr+   r  r@  r"  )rl   r   r   common_buf_namesreasonsr   node1_name2depnode2_name2deprZ  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  rm   decide_fusion_fail_reason#Scheduler.decide_fusion_fail_reason  s    383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX(H''$$X.C$.G$.Ggy11GY9W9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#566'

|4
"7)6'"ZLI U )\ 7|c YXs   H
Hc                .	   [         R                  (       d  g[        S X4 5       5      (       a  gUR                  R	                  5       nUR                  R	                  5       nX4-  nU(       d  g[        S UR                   5       5      nXc-
  (       a  g[        U5      S:  a  g[        UR                  R                  5      S:  d#  [        UR                  R                  5      S:  a  g[        [        UR                  R                  5      5      n[        [        UR                  R                  5      5      n[        U[        5      (       a  [        U[        5      (       d  gUR                  R                   V	s0 s H  oR                  U	_M     n
n	UR                  U
;  a  gXR                     n[        U[        5      (       d  gUR                  5       nUR                   UR                   :w  a  UR"                  UR"                  :w  a  gUR"                  UR"                  :w  d  [        UR$                  5      S:w  a  g[        UR&                  R(                  5      S:w  a  gUR&                  R*                  (       a  gSUR&                  R(                  ;   a  SUR&                  R(                  ;   d   e[        S UR&                  R-                  5        5       5      n[        U5      S:w  a  g[        [        U5      5      nXR&                  R(                  S   :X  a  SnSnO"XR&                  R(                  S   :X  d   eSnSnS	S
KJn  UR&                  R2                  S	   n[        U5      S:w  a  g/ n[4        R6                  R9                  U5       H;  nUR;                  [<        R>                  R@                  RC                  U5      5        M=     [E        U5      nU" UUS	   5      nUc  gUR&                  R(                  U   UR&                  R(                  U'   UUR&                  R(                  U'   URG                  SS5        U RI                  X5      n[        U[J        5      (       d   e[L        RO                  SU5        U$ s  sn	f )a  
Attempts to enable fusion between two nodes by inverting indexing patterns.

This optimization targets cases where node1 has a contiguous write and
node2 has a contiguous write but discontiguous read. By inverting the
indexing in node2's read and write operations, we can make them compatible
with node1 for potential fusion.

Args:
    node1: First scheduler node (source)
    node2: Second scheduler node (target for inversion)

Returns:
    int: Fusion score if successful, 0 if optimization not applicable
r  c              3  @   #    U  H  oR                  5       v   M     g 7fr|   rd  rK  s     rm   r   AScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>%  s     2>axxzz>r?  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r  1  s      .
 8HH 8r  r   r   index0index1c              3  $   #    U  H  ov   M     g 7fr|   rz   )r   r  s     rm   r   r  l  s     %T7Std7Ss   r   )generate_inverse_formulaTFz!Shared memory after inversion: %d)(r(   r  r   r   buffer_namesr   r  r   r   r  r<  rd  r   r4   r   r  r   r   	var_namesr  r  	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr  varsr   Add	make_argsr   rZ   r   r   combine_modular_indexing_pairsr   r   r  r$  r  r  )rl   r   r   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writer   node1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr  rA  simplified_termstermsimplified_read_exprinverse_formulascores                          rm   $shared_data_after_inverting_indexing.Scheduler.shared_data_after_inverting_indexing  s   & 442E>222 #..;;="..;;=0E" $. .
 % 8 8.
 $
  $8'(1, u  &&'!+s53D3D3K3K/Lq/P$u006678
4 1 1 8 89:*i00
9
 9
 161B1B1I1IJ1I##1IJ??,."??3+y11 "++- !2!22  K$4$44??k...#j6J6J2Kq2P u{{))*a/ ;;   222EKK666	
7
 &%Tu{{7Q7Q7S%TT A%./0	 228<<&O' : :8 DDDD&O'Q[[%%a(
z?aII''	2D##  ??E 3  ##3423GTUW "
 7<kk6P6P7
""?3 8G""#34 	""4/((6%%%%%;UCm Ks    Rc                   [         R                  (       a  [        S X4 5       5      (       a  gUR                  5       (       d  UR                  5       (       a  gUR                  R                  5       nUR                  R                  5       nX4-  nU(       d  gUR                  R                  5        Vs0 s H  ofR                  U_M     nnUR                  R                  5        Vs0 s H  ofR                  U_M     nn/ n	U Hw  n
Xz   nX   nUR                  5       UR                  5       :X  d  M/  U	R                  [        R                  R                  R                  UR                  5       SS9UU45        My     [        U	5      S:X  a  g[!        U	["        R$                  " S5      S9u  pn['        U[(        5      (       a  ['        U[(        5      (       d  gUR*                  UR*                  :w  a4  UR-                  5       UR-                  5       :X  a  U R/                  U5      $ gSnUR1                  5       (       d  UR3                  X5      nOZUR1                  5       (       d  UR3                  X5      nO3[4        R7                  SUR9                  5       UR9                  5       5        U(       a*  [:        R<                  " [>        U RA                  X5      5      $ S$ s  snf s  snf )as  
Right now just greedily reorder the loop of node1 to be compatible with node2,
but ideally we should have some heuristics to reorder the loop for node2
to be compatible with node1 if that's more efficient.

Return the amount of shared data re-computed in this method.
If no such recomputation happens, return -1 (not return 0 since 0 is a valid
amount of shared data).

c              3  @   #    U  H  oR                  5       v   M     g 7fr|   r  rK  s     rm   r   >Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>  s      8
 .1HHJJr?  r  r   r   r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)!r(   r  r   rr  r   r  r  r   r  r   rZ   r   r   r   r   r   r  r:  r;  r   r4   r#  r  dep_size_hintr   r  r&  r  r3  r  r  r$  r  )rl   r   r   r  r  r 	  r   r  r  
candidatesr  r  r  _numel	reordereds                  rm   !shared_data_after_reordering_loop+Scheduler.shared_data_after_reordering_loop  s     00C 8
!&8
 5
 5
 
 %"3"3"5"5"..;;="..;;=0E"383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX 
.K$1G$1G3356689 !!((::#--/! ;    /" z?a $'zx7J7J17M#N '9--Z5S5Sw///
   "g&7&7&99))'22	!!##77II##%%77II##Q    KKT55eCD	
 	
k YXs   6K!*K&c                   [        U[        5      (       a6  UR                  5       (       + =(       a    [        UR                  5      (       + $ [        U[
        5      (       a~  [        UR                  [        R                  5      (       a  UR                  R                  5       (       + $ UR                  5       (       + =(       a    [        UR                  5      (       + $ g)z.
Is this node unfusable under any conditions.
F)	r   r  rr  rW   r   r  r+   r  r  r  s     rm   r  Scheduler.unfusable_node  s     d233'')) 2U		3 /  d566$))R%?%?@@9966888'')) 2U		3 /  rp   c                :   UR                  5       [        R                  R                  ::  a  gUR	                  5       nUR                  5       nSnXEU-  :  a	  U" S5        g[        S UR                  5        5       5      nU[        R                  R                  R                  R                  4:X  a	  U" S5        gS	S jnUR                  5       n	U	R                  5       (       d5  U" U	R                  5      (       a  UR!                  5       (       d	  U" S5        gg)
zD
Heuristics to avoid benchmarking predictably slow prologue fusions
T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     #    U  HT  nUR                   c  M  UR                   R                  5         H#  nUR                  S:X  d  M  UR                  v   M%     MV     g 7f)Ncall_function)r   r  r4  r  )r   rL  r  s      rm   r   EScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>+  sS      
.vv  VV'')tt&	 AHH * .s   A,AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                F    U R                   S:*  =(       a    U R                  $ )Nr   )itemsizeis_floating_point)r$  s    rm   low_prec_fpGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp8  s    >>Q&B5+B+BBrp   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r$  ztorch.dtyper   r{   )r   rZ   r   invoke_quant_opsr  r  r   r   r  rk  rl  constant_pad_ndr<  r5  rV   r$  rO  )
rl   prologue_noderA  rW  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  r$	  template_bufs
             rm   (check_prologue_fusion_heuristics_fusable2Scheduler.check_prologue_fusion_heuristics_fusable  s	    ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C %??A6688L..//!>>@@h rp   c                  ^  [        U[        5      (       a  [        U[        5      (       d  g[        UR                  [        R                  5      (       a)  [        UR                  [        R                  5      (       d  gUR                  5       (       d  UR                  5       (       a  g[        R                  S:X  a  gUR                  UR                  pCUu  pVUu  pxUR                  5       (       d2  UR                  5       (       d  Xh:w  d  [        U5      [        U5      :w  a  g[        UR                  R                  5      S:  d#  [        UR                  R                  5      S:  a  gT R                  [        [        UR                  R                  5      5      5      n	T R                  [        [        UR                  R                  5      5      5      n
[!        X5      [        R"                  :  a  gSU 4S jjnU" U5      (       d  U" U5      (       a  g/ n[%        ['        XW5      5       H   u  nu  pX:w  d  M  UR)                  U5        M"     [        U5      S:w  a  gUS   nUU   UU   nn[*        R,                  R.                  R1                  UU5      (       a  UUU4$ [*        R,                  R.                  R1                  UU5      (       a  UUU4$ g)a?  
Fusing two small pointwise nodes significantly reduces kernel overhead
and launch overhead. However, slightly different sizes would prevent fusion.
Here, we decide if expanding sizes of one node is profitible by allowing
fusion, and returns the dimension to expand, node with smaller sizes,
and new size after expand.
Nr  r   c                  > U R                   R                   H  nUR                  TR                  ;   a  TR                  UR                     nO%TR                  R                  UR                  5      nU(       d  Me  [        R                  R                  R                  X 5      (       d  M  [        UR                  [        5      (       a  M    g   grN  )r   r   r   rZ  r[  r  rZ   r   rU  r  r   r-  r  )r   r  r  rl   s      rm   has_reusable_bufferIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer  s    ((..99 ; ;; $ ; ;DII FI $ 0 0 4 4TYY ?I I,,66yGG&y'<'<>TUU / rp   r   r   )r   r   r   r+   r   r  r(   r  r  r   r   r   r  r	  r<  rd  r  small_memory_access_thresholdr=  rG  r   rZ   r   r   statically_known_lt)rl   r   r   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr1	  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  rm   "get_expand_dim_for_pointwise_nodes,Scheduler.get_expand_dim_for_pointwise_nodesH  sl    %//z%7W7W uzz2#4#4555::r'8'899 ))++u/M/M/O/O ) #\\5<<()1&)1&  !!##1=!S%77 u  ''(1,E4E4E4L4L0MPQ0Q "//T%:K:K:R:R5S0TU!//T%:K:K:R:R5S0TU"7223 	  u%%)<U)C)C !'0]1R'S#C#'!#**3/ (T "#q(*1-,',' ' 77//OO66WW11..QQ66rp   c                  ^^^^^ TTL a  gU R                  5       (       aB  U R                  R                  T5      nU R                  R                  T5      nUb	  Ub  XV:w  a  g[        T[        5      (       a  TR                  T5      $ [        T[        5      (       a  g[        TT5      nTR                  5       (       a5  U R                  TR                  5       5      R                  TT5      (       a  g[        T[        5      (       d  [        T[        5      (       a	  U" S5        g[        T[        5      (       a  TR                  5       (       d	  U" S5        g[        T[        5      (       GaV  [        TR                  [        R                   5      (       d	  U" S5        gTR                  R#                  5       (       d	  U" S5        g[        T[$        5      (       d	  U" S5        g[        TR                  [&        5      (       d	  U" S	5        g[        TR                  R(                  [*        5      (       d	  U" S
5        g[-        TR                  R.                  5      S:X  d   eTR                  R.                  S   R0                  m[3        U4S jTR4                  R6                   5       5      (       a	  U" S5        gTR                  R(                  R9                  5       nU HA  n	TR                  R(                  R;                  U	5      n
[3        S U
 5       5      (       d  MA    g   [-        TR                  R<                  5      S:X  d   eTR                  R<                  S   R>                  TR                  R>                  :w  a	  U" S5        g  S%UUU4S jjm[3        U4S jU R@                   5       5      (       a  g[        T[        [        45      (       a  TR                  5       (       d	  U" S5        gTRC                  5       TRD                  -  (       a	  U" S5        gTR                  5       (       GaH  [G        T5      (       d	  U" S5        gTRI                  5       (       d  TR                  5       (       a	  U" S5        gTRK                  5       nURM                  5       nU(       d	  U" S5        g[O        S URP                   5       5      U-
  nTRS                  5       U-  (       a	  U" S5        gTRU                  5       (       d  TRU                  5       (       a	  U" S5        gTRW                  5       mTSS  HK  nURY                  5       nU H2  n[[        U4S jUR\                   5       5      (       a  M)  U" S5            g   MM     [        T[^        5      (       d  T/O3TR`                   Vs/ s H  nUR                  5       (       d  M  UPM     snn[-        U5      S:X  d   eUS   n[-        TS   Rb                  5      S:X  aU  [-        TS   Rb                  S   R\                  5      S:X  a,  TS   Rb                  S   R\                  S   R                  UL d	  U" S5        gU Re                  TTU5      (       d  gTR                  5       (       a  TRU                  5       (       d%  TRI                  5       (       d  [g        T5      (       d	  U" S5        gTRi                  5       nUc   eURk                  5       (       a2  [        TR                  [        R&                  5      (       d	  U" S 5        gTRS                  5       [l        Rn                  Rp                  -  (       d0  TRS                  5       [l        Rn                  Rp                  -  (       a	  U" S!5        gTR                  5       nTR                  5       nUU:w  a  U" S"UU5        gAU Rs                  TTUS#9n[        U[t        5      (       d   eU(       aC  U[v        Rx                  :  a/  [v        Rz                  (       a  U R}                  TT5      nUS:  a  Un[v        R~                  (       aZ  U R                  TT5      =n(       aA  Uu  nnnUR                  UU5        U Rs                  TT5      n[        U[t        5      (       d   e[v        R                  (       a.  U[v        Rx                  :  a  U R                  TT5      nUS:  a  Un[        R                  [        R                  5      (       a4  [        R                  S$TR                  5       TR                  5       U5        [l        R                  R                  U TTU5      (       d  gTRC                  5       TRD                  -  (       ac  U R                  TT5      =(       aJ    [l        R                  R                  U TTU5      =(       a!    U R                  U5      R                  TT5      $ [l        R                  R                  U TTU5      =(       a!    U R                  U5      R                  TT5      $ s  snf )&zR
Determine if it is possible to combine node1 and node2 into a
single fused node.
FNTz/grouped node must not be fused with other nodesznode1 is nopz'node1 is extern but not a triton kernelz5node1's triton kernel doesn't support epilogue fusionz.node1 is extern but node2 is not SchedulerNodez3node1 is extern but node2.node is not SchedulerNodez4node1 is extern but node2.node.data is not Pointwiser   r   c              3  @   >#    U  H  oR                   T:g  v   M     g 7fr|   r  )r   r   written_buffer_names     rm   r   %Scheduler.can_fuse.<locals>.<genexpr>  s     V>Us8822>U   z9epilogue reads from buffers other than the mutated outputc              3  *   #    U  H	  oS :g  v   M     g7f)r  Nrz   )r   usages     rm   r   rI	    s     ;F5Fs   z*node1 and node2 uses different buf layoutsc                V   > U TL=(       a    U TL=(       a    TU R                  5       ;   $ r|   )r   )r   r   r   rH	  s    rm   ._is_other_node_that_references_mutation_bufferJScheduler.can_fuse.<locals>._is_other_node_that_references_mutation_buffer   s7      u, N#50N+z/K/K/MMrp   c              3  4   >#    U  H  nT" U5      v   M     g 7fr|   rz   )r   r   rN	  s     rm   r   rI	  	  s       &D ?tDD&rT  znode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz'template has no allowed prologue inputsc              3  @   #    U  H  oR                  5       v   M     g 7fr|   r8  )r   r  s     rm   r   rI	  *  s     E_c<<>>_r?  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesr  c              3  @   >#    U  H  oR                   T;   v   M     g 7fr|   r   )r   rG  prologue_nodess     rm   r   rI	  :  s     QytyyN:yrJ	  z7template prologue can only fuse nodes with a single usezEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz6multi-output template epilogue requires ComputedBufferz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r  z%s and %s has %s shared data)r   r_   )Nr  r~  r  r   r  r  r  rr  r  r   can_fuse_multi_outputs_templater  r  r  r   r+   r  r  r   r   r   r   r   mutation_outputsr   r   r   r   inner_fn_free_symbolscollect_inner_fn_symbol_usager  r@  r>  r   r   r=  r   r5  get_allowed_prologue_inpsr   rb  rE  r  r   r  r   r0  r   r   r  r-	  r8  r/  rV   rZ   r   no_fuse_buffer_namesr  r$  r(   score_fusion_memory_thresholdr  r	  $expand_dimension_for_pointwise_nodesrD	  r  r  r	  r&  r  r  r  r  r3  r8  r  can_fuse_verticalcan_fuse_horizontal) rl   r   r   can_reorderr  rJ  stream2rW  node2_inner_fn_free_symbolssymbolusagesr4  r  unsupported_prologue_argsr   	node_outsr   rL  template_snodestemplate_snoder,	  rc  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizerN	  rS	  rH	  s     ``                          @@@rm   r  Scheduler.can_fuse  s    E> ''))))--e4G))--e4G"w':w?Qe455&&u--e455 u%4#3#3$

)
)%
7$8 e122j'7
 7
 ABe344U=N=N=P=Pe677ejj"*D*DEE=>:://11KLe]33DEejj.99IJejjooy99JKuzz223q888"'**"="=a"@"E"E Ve>O>O>U>UVVVOP +0**//*O*O*Q'5FFvN;F;;;  6 uzz../1444zz&&q)00EJJ4E4EE@A-    JJ    u8:PQRR%%''()$$&8,-.u5501!!##u'8'8':':HI779H$,$F$F$H!(=> EX__EE'( &
 %%'*CCQR--//53Q3Q3S3SPQ"__.N&s+ ,,.	$CQsyyQQQUV$ % , "%);<< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sSS..00%%''259956 224L+++5577


B--A A LM""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 4454M 5 
 +S1111 !F$H$HH11$($J$J5RW$X!$)$9!66#FFueTTOT6E3Z{<<ZU $ 8 8 F/5555 11!F$H$HH$($M$Mu%! %)$9!))'--88##.  !	 yy!!$u6GHH$$&8 &&ue4 MII//eUDUVM$$V,>>ueL 9900eU$5 M""6*>>ueLMC Bs   :iic                   UR                  5       n[        X5      n[        [        5      nUR                   Ht  nU R
                  R                  UR                  UR                  5      n[        U[        5      (       a  U R                  XaU5      (       a  Ma  XW   R                  U5        Mv     UR                  R                   GH
  n[        U[        5      (       d  [        U[        5      (       d  M0  UR                  U R
                  R                  UR                  UR                  5      5      n	U	(       d  Mx  U	 H  n
[        U[        5      (       a)  U R!                  X5      (       a  U	R#                  U
5        MA  [        U[        5      (       d  MX  U R%                  XUR&                  5      (       d  M{  U	R#                  U
5        M     GM     [)        S [*        R,                  R/                  UR1                  5       5       5       5      nX-  (       a	  U" S5        gUR3                  5       nU HJ  nU R4                  U   R7                  5       nXR8                  U   R:                  -  (       d  MB  U" S5          g   g)z
Check if it is legal to fuse a consumer (node2) into a producer (node1).

We can fuse them if all the reads of node2 either match
corresponding writes in node1, or are written by nodes that can
be scheduled before the fusion of node1 and node2.
c              3  :   #    U  H  nUR                   v   M     g 7fr|   r  r  s     rm   r   .Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
U HHUr	  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rE  r  r   r  r  r  r  r   r   r6   r  r   r   r  r4   r5   fusable_read_and_writer,  .fusable_stardep_write_and_read_on_empty_tensorr   r   r
  r  rH  r   r   r[  r5  r1  r   )rl   r   r   node1_buf_namesrW  remaining_deps_by_namer   r   cd	remainingrM  remaining_depsnode1_op_namesr#  s                 rm   r\	  Scheduler.can_fuse_vertical  s     002%7B47H++C((,,SXXsxx@D#w''D,A,A#e,T,T"(//4	 , ##**Bb),,ZG5L5L.22%%))"''277;I y#B!"i00T5P5P6 6 "((,#G MM

  "((, $ +& $ $
 445K5R5R5TU$
 

 +
 +,224"D&&t,==?G 7 7 @ J JJJ>?	 # rp   c                l  ^ UR                   UR                  5       ;  a  gUR                  R                   Vs/ s H!  nUR                   UR                  :X  d  M  UPM#     nn[        U5      S:w  a  gUS   m[        T[        5      (       a  g[        T[        5      (       d   e[        TR                  [        R                  5      (       a  g[        TR                  5      TR                  R                  ::  d  gU R                   UR                     nU/n[        U["        5      (       a  UR$                  nSnU He  n	U	R                  R&                   V
s/ s H  n
U
R                   U:X  d  M  U
PM     nn
U(       d  MD  US-  n[)        U4S jU 5       5      (       a  Me    g   US:*  $ s  snf s  sn
f )NFr   r   c              3  $  >#    U  H  n[        U[        5      =(       ai    [        UR                  [        R
                  5      (       + =(       a9    UR                  TR                  :H  =(       a    UR                  TR                  :H  v   M     g 7fr|   )r   r4   r#   r   r%   TMPr   )r   r  writes     rm   r   -Scheduler.fusable_weak_dep.<locals>.<genexpr>  sn      
 +D	 4+ ,+DJJAA,JJ%++-, II+, +s   BB)r   rE  r   r  r	  r   r   r5   r4   r#   r   r%   r}	  r   r  r"   r  rC  r   r   r   )rl   weak_depr   r   r~	  mutating_writesr  relevant_reading_nodesnum_concurrent_readsreading_noder  relevant_readss       `       rm   r  Scheduler.fusable_weak_dep  s    == 6 6 88 **11
1zzX222 1 	 

 1$"eW%%%++++u{{DHH55
 %//*ekk.F.FF++H,A,AB	"'e788%*\\" 2L )44:::D99	) :  
 " A%  
 +   ! 3" $q((W
6s   F,F,F11F1c                z   [        U[        5      (       Ga  U R                  R                  UR                  UR                  5      nX2R                  :w  dR  [        UR                  [        R                  5      (       d)  [        UR                  [        R                  5      (       a  g[        R                  (       a:  UR                  UR                  :w  a   UR                  5       nUR                  5       nU R                  UR                  5      (       a  gUR                  UR                  :H  =(       aa    [        UR                   5      [        UR                   5      :  =(       a/    UR                   S [        UR                   5       UR                   :H  $ [        U["        5      (       a  U R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      nUR                  UR                  :X  a  UR                  b  X4:X  a  ggrb  )r   r4   r  r  r   r#   r   r%   r}	  r(   r  r#  r  r[  rj  r   r   r5   )rl   r  r~	  	read_name
write_names        rm   rr	   Scheduler.fusable_read_and_write  s   dI&&--11$))TYYGI ZZ'&tzz488<<&u{{DHH==00T]]enn5T ~~') 11%**== 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rp   c                f   [        U[        R                  5      (       d  gUR                  5       (       d  gU R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      n[        U[        5      (       a  XE:X  a  ggrb  )r   r+   r  r  r  r  r   r5   )rl   r  r~	  writing_noder	  r	  s         rm   rs	  8Scheduler.fusable_stardep_write_and_read_on_empty_tensorF  s     ,(B(BCC--//))--diiC	**..uzz5::F
eW%%)*Arp   c                @    [         R                  R                  X5      $ r|   )rZ   r   get_dep_size_hint)rl   r   r  s      rm   r	  Scheduler.dep_size_hintS  s    ww((::rp   c           	       ^ ^^ U4S jnU(       a9  [         R                  X5      (       a  [         R                  X5      nU" USS5      $ [        UR                  [
        R                  5      (       a  UR                  R                  5       (       d*  UR                  5       (       d  UR                  5       (       a  UR                  R                  UR                  R                  -  nUR                  R                  UR                  R                  -  n	S	S jn
SnU HH  nU	 H?  nU
" X5      (       d  M  U[        T R                  U5      T R                  U5      5      -  nMA     MJ     U" USS5      $ [        UR                  R                  5      [        UR                  R                  5      -   n[        UR                  R                  5      [        UR                  R                  5      -   n[        X5      S-  [        X5      :  a  X:  a  X!p!UR                  R                  UR                  R                  -   Vs/ s H9  nXR                  R                  ;   d  XR                  R                  ;   d  M7  UPM;     nnU" [!        UU 4S jU 5       5      SS5      $ UR                  R                  UR                  R                  -  UR                  R                  UR                  R                  -  -  n[!        U 4S jU 5       5      nSnUS:X  a'  T R#                  X5      (       a  T R%                  X5      nU" UUS5      $ s  snf )
a  
The first term in our fusion score that estimates number of saved
memory operations.

This function scores fusion candidates based on shared memory access patterns.
Higher scores indicate better fusion candidates.

Scoring strategy:
1. If nodes share exact memory deps (same buffer + same indexing), return
   the sum of shared dep sizes (original behavior).
2. If no exact matches (score == 0), check for same-buffer reads with
   different indexing (e.g., split operations reading different slices).
   - Give bonus if nodes read from exactly the same set of buffers
   - Score based on overlap ratio: common_buffer_size / total_read_size
   - High overlap (>50%) suggests good cache locality benefit from fusion
c                "   > T(       a  XU4$ X-   $ r|   rz   )r	  buffer_overlap_scoreis_mix_order_reductionreturn_is_mix_order_reductions      rm   _construct_return_value>Scheduler.score_fusion_memory.<locals>._construct_return_valueo  s     -5KLL//rp   r   Tc                    X:X  a  g[        U [        [        45      (       a4  [        U[        [        45      (       a  U R                  UR                  :H  $ grN  )r   r5   r4   r   )dep1dep2s     rm   _match-Scheduler.score_fusion_memory.<locals>._match  sJ    <dWi$899j7I.? ?  99		11rp   Fr  c              3  H   >#    U  H  nTR                  UT5      v   M     g 7fr|   r	  )r   r   r  rl   s     rm   r   0Scheduler.score_fusion_memory.<locals>.<genexpr>  s!     IDSD&&sK88D   "c              3  F   >#    U  H  nTR                  U5      v   M     g 7fr|   r	  r  s     rm   r   r	    s!     J7ID&&s++7Ir  )r	  r3   r	  r3   )r   r  r   r   r   r+   r  r  rr  r   r   r  r  r	  r   r  r   _can_use_buffer_overlap_scoring&_score_fusion_memory_by_buffer_overlap)rl   r   r   r  r	  r  r	  r	  
node1_deps
node2_depsr	  	node1_dep	node2_depnode1_dep_lennode2_dep_lenr   r  common_memory_depsr	  s   `  ``              rm   r  Scheduler.score_fusion_memoryV  s   2	0 %):)C)CE)Q)Q
 &66uDE*5!T:: 5::r'A'ABBJJ0022  ""  ""**0053D3D3K3KKJ**0053D3D3K3KKJ E'	!+Ii33 ..y94;M;Mi;X"  ", ( +5!U;;E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT },q03}3TT,$u !,,22U5F5F5M5MMMC++111S<M<M<T<T5T M   +IDII1e  $//558I8I8P8PP##e&7&7&>&>>
 J7IJJ
  !A:$>>uLL#'#N#N$  'u.BEJJ3s   6M>Mc                p   UR                  5       (       d  UR                  5       (       a  gUR                  5       (       d  UR                  5       (       a  g[        R                  (       d  [        R                  (       Ga  UR                  5       nUR                  5       nU(       a  U(       d  g[        S U 5       5      n[        S U 5       5      n[        5       nU GH  nUR                   H  n	[        U	R                  [        5      (       d  M$  U	R                  R                  5       (       d  ME  [        U	R                  5      (       d  Ma  U	R                  R                  5       n
U
bW  [        U
[        R                  5      (       a8  U
R                  5       nX[-  (       a  UR!                  U	R                  5        M  M  UR!                  U	R                  5        M     GM     U(       a  U H  nUR                   H  n	[        U	R                  [        5      (       d  M$  U	R                  R                  5       (       d  ME  U	R                  U;   d  MW  U	R                  R                  5       n
U
b=  [        U
[        R                  5      (       a  U
R                  5       nXk-  (       a      gM      g   M     X4 Ht  nUR"                  R$                   HW  nU R&                  R)                  UR*                  5      nUc  M-  UR                  5       (       d  MD  [-        U5      (       d  MV      g   Mv     g)a  
Check if buffer overlap scoring should be used for this node pair.

Buffer overlap scoring handles split/cat patterns where nodes read from
the same buffer at different indices. We skip it when:
- Either node is a reduction (different memory access patterns)
- Either node is a template
- Both nodes are prologue/epilogue candidates for the same template,
  because horizontal fusion would prevent them from being absorbed
  into the template kernel. For example, in:
    q = a[:64, :]; k = a[64:, :]
    return mm(q + 2, k - 2)
  "q + 2" and "k - 2" both read from `a` and would get a high overlap
  score, but fusing them horizontally prevents prologue fusion into mm
  (resulting in 2 kernels instead of 1).

We allow buffer overlap scoring when:
- The node outputs are not actually in the template's allowed_prologue_inps,
  meaning they can't be prologue-fused anyway, so horizontal fusion doesn't
  prevent any optimization opportunity.
FTc              3  @   #    U  H  oR                  5       v   M     g 7fr|   r8  r  s     rm   r   <Scheduler._can_use_buffer_overlap_scoring.<locals>.<genexpr>       +TmsLLNNmr?  c              3  @   #    U  H  oR                  5       v   M     g 7fr|   r8  r  s     rm   r   r	    r	  r?  )r   rr  r(   r  r  r  r   r0  r   r   r_   r=  r/  r+   r4  rX	  r  r   r   r1  r  r   r8  )rl   r   r   node1_outputsnode2_outputsnode1_output_namesnode2_output_names&node1_prologue_eligible_template_usersr   rG  rA  allowed_inpsr   r   rF  s                  rm   r	  )Scheduler._can_use_buffer_overlap_scoring  sx   4 5#5#5#7#7%"3"3"5"5&":":":!--/M!--/M !!++Tm+T!T!++Tm+T!T
  3 %IID"499.?@@ II11337		BB
 )-		(C(C(E(4)2+B+B: : ,9+R+R+TL1@ F J J499 U  A CFFtyyQ% & %. 6(C #		&tyy2CDD $		 5 5 7 7 $		-S S -1II,G,G,IM,8Z -r/F/F> > 0=/V/V/X#5#D+0 $E (-! !* )& ++11C#66::388DH ,$00227AA$ 2 ' rp   c                  ^ ^^^ SmS
UU 4S jjm[        S UR                  R                   5       5      n[        S UR                  R                   5       5      nX4-  mT(       d  g[        U4S jUR                  R                   5       5      n[        U4S jUR                  R                   5       5      n[	        XV5      nUS:X  a  g[        UU4S jUR                  R                   5       5      n[        UU4S	 jUR                  R                   5       5      n	[	        X5      n
X-  nU[
        R                  :  a  U
$ S$ )a  
Score fusion based on buffer name overlap when exact dep matching fails.

This handles the split/cat fusion case where nodes read from the same buffer
but at different indices (e.g., different slices from a split operation).

Scoring logic:
- If nodes read from exactly the same buffers: high bonus (encourages fusion)
- For common buffers: score based on overlap ratio
  - overlap_ratio = common_buffer_size /
    max(node1_total_reads, node2_total_reads)
  - If overlap_ratio > threshold (e.g., 0.5): give proportional score
  - If overlap_ratio < threshold: minimal/no score (not worth fusing)

Note on dynamic shapes:
- When deps have unbacked symbols (dynamic shapes), dep_size_hint returns 0
- In this case, we use count * 10 as a proxy for size
- This ensures fusion still works for models with dynamic batch sizes

Note on multiple deps from same buffer:
- A node may have multiple MemoryDep entries for the same buffer name
  (e.g., 4 split reads from arg0_1 at different indices)
- We sum ALL dep sizes for each buffer, not just take max
- This ensures overlap ratio is calculated correctly when nodes read
  multiple slices from the same underlying buffer
ry  c                :   > TR                  U 5      nUS:  a  U$ T$ r   r	  )r   r   FALLBACK_DEP_SIZErl   s     rm   get_dep_sizeFScheduler._score_fusion_memory_by_buffer_overlap.<locals>.get_dep_sizeK  s%    %%c*D!84:)::rp   c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   CScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>O  r  r  c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r	  P  r  r  r   c              3  4   >#    U  H  nT" U5      v   M     g 7fr|   rz   r   r   r	  s     rm   r   r	  Y        $
)@#L)@rT  c              3  4   >#    U  H  nT" U5      v   M     g 7fr|   rz   r	  s     rm   r   r	  \  r	  rT  c              3  X   >#    U  H  nUR                   T;   d  M  T" U5      v   M!     g 7fr|   r  r   r   common_namesr	  s     rm   r   r	  g  .      %
.xx<' L.   **c              3  X   >#    U  H  nUR                   T;   d  M  T" U5      v   M!     g 7fr|   r  r	  s     rm   r   r	  l  r	  r	  )r   r3   r   r$  )r   r   r   r   r  r(   min_overlap_ratio)rl   r   r   node1_read_namesr  node1_total_read_sizenode2_total_read_sizemax_total_read_sizenode1_common_read_sizenode2_common_read_sizecommon_read_buffer_sizeoverlap_ratior	  r	  r	  s   `           @@@rm   r	  0Scheduler._score_fusion_memory_by_buffer_overlap+  sJ   < 	; 	; &%R%:K:K:Q:Q%RR%%R%:K:K:Q:Q%RR (: !$ $
).):):)@)@$
 !
 !$ $
).):):)@)@$
 !
 ""7O!#
 "% %
((..%
 "

 "% %
((..%
 "
 #&&<"U 0E
 (58P8P'P#	
VW	
rp   c                   [        U5      S:X  a  U$ 0 nU H  u  p4UR                  5       UR                  5       :X  d   eUR                  5       n[        U R                  U5      R	                  X45      5      nXb;  a  X44/X&'   Mo  X&   R                  X445        M     [        UR                  5       [        R                  " S5      S9S   n[        U5      S:  d   eU$ )Nr   r  r   )
r   r   r$  r  get_fusion_pair_priorityr   r  rh  r:  r;  )rl   rr  "possible_fusions_group_by_priorityr   r   rc  fusion_pair_priority&possible_fusions_with_highest_prioritys           rm   r  4Scheduler.get_possible_fusions_with_highest_priority  s    
  A%##  	+ -LE##%)9)9);;;;%%'F#&  (AA%O$  $MNL2H 3HOON - 25.446H<O<OPQ<R2

2. 9:Q>>>55rp   c                D    [         R                  R                  " U /UQ76 $ )z
Shim for list.sort(key=...)
)rZ   r8  score_fusionrJ  s     rm   r  Scheduler.score_fusion_key  s     yy%%d3U33rp   c                    [        [        R                  R                  5       5      n[	        U R
                  5       H9  nUR                  XR                  5        UR                  UR                  5        M;     g)zW
Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
N)
r   rZ   r   r  r   r>  r  r  r  r  )rl   r  r   s      rm   r  Scheduler.compute_last_usage  sV    
 ))A)A)CDTZZ(D 35L5LM&&t7 )rp   c                $   [        U R                  [        R                  R                  -
  [        R                  R
                  R                  -
  5       GH  nXR                  ;   a[  U R                  U   nUR                  5       (       a5  [        R                  R
                  R                  UR                  5        Ml  Mn  U[        R                  R                  ;   d  M  [        R                  R                  U   n[        U[        R                  5      (       a+  [        R                  R
                  R                  U5        M  [        U[        R                  [        R                   45      (       a  GM'  UR"                  n[        U[        R$                  5      (       a  UR'                  5       (       d   e[        R                  R
                  R                  UR"                  5        GM     U R                  R)                  5         g)z*Free any buffers that are no longer neededN)r  r  rZ   r   r  rU  freedr[  re  codegen_freer   r  r   r+   r  rZ  r[  r   r  is_input_bufferr~  )rl   r   r   r  storages        rm   free_buffersScheduler.free_buffers  s`   %%gg%%&gg""(()
D
 '''&&t,<<>>GG((55chh? "---gg**40c2#5#566GG((55c:b&7&79M9M%NOO!hhG"7BMM::w?V?V?X?XXGG((55gllC)
, 	!!'')rp   c                    U R                   R                  5        H  nUR                  5         M     U R                  5         g r|   )rk  r   flushr	  )rl   r  s     rm   r	  Scheduler.flush  s.    }}++-GMMO .rp   c                   [        U[        [        45      (       d   e[        S   S==   S-  ss'   [        R
                  " [        SS95         UR                  5         UR                  5         S S S 5        UR                  [        R                  R                  5        U R                  5         g ! , (       d  f       NH= f)Nr  extern_callsr   F)increase_kernel_count)r   r  r%  r   rZ   set_kernel_handlerr0   r  r  r  r   rU  r	  )rl   scheduler_nodes     rm   codegen_extern_callScheduler.codegen_extern_call  s     &(LM
 
 	
 
 	^,1,!!&u"EF002##% G 	qww334	 GFs   !B22
C c                |   [        UR                  5      (       a  UR                  c
   U S35       e[        R                  R                  U5        [        UR                  5      nUc  [        SUR                   35      e[        5       (       d  UR                  S:X  aN  [        R                  R                  U5      =nR                  S:  a  [        U[        R                  " 5       5      e[        UR                  5      (       a.  UR                  S:X  d  [!        [        R                  " 5       5      eU" U 5      $ )Nz( should have been normalized in loweringzUnsupported device type: r      r|  )rU   r   r   rZ   r   add_device_infor/   rq  r&   r  r   get_device_propertiesmajorr7   inspectcurrentframer8   )rl   rc  device_schedulingr  s       rm   create_backendScheduler.create_backend  s    &++&&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII||v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$$V[[E-A#G$8$8$:;; &&rp   c                    Uc   eXR                   ;  a  U R                  U5      U R                   U'   U R                   U   $ r|   )rk  r	  rb  s     rm   r  Scheduler.get_backend  s@    !!!&$($7$7$?DMM&!}}V$$rp   c                  ^  SU 4S jjnUR                  5        VVs0 s H?  nUR                  c  M  UR                  R                  5         H  nU" U5      U4S _M     MA     nnn[        UR	                  5       5      nU(       aJ  [        U[        R                  " S5      S9u  pg[        R                  R                  R                  U5        g g s  snnf )Nc                   > U TR                   ;  aM  TR                   R                  [        U R                  R                  5       VV s0 s H  u  pX_M	     sn n5        TR                   W    $ s  sn nf r|   )r  r  r=  r   r>  )rL  r:  rl   s     rm   	get_order*Scheduler.enter_context.<locals>.get_order   s^    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   	A.
r   r  )rL  ztorch.fx.Noder   r$  )r   r   r  r  r  r  r:  r;  rZ   r   rU  enter_context)rl   r   r	  rL  r  r  r  lasts   `       rm   r 
  Scheduler.enter_context  s    	+ ^^%
%vv  VV'') q\1t# * % 	 
 w||~&'x':':1'=>GAGG  ..t4 
s
   C1Cc                   ^  U R                   U   R                  n[        U4S jU 5       5      =(       a#    XR                  ;  =(       a    XR
                  ;  $ ! [         a     gf = f)NFc              3  n   >#    U  H*  oR                   =(       d    UR                  5       T;   v   M,     g 7fr|   )r  r3  )r   rG  r  s     rm   r   AScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VPUC3C CCPUr  )r[  r0  KeyErrorr   r  r  )rl   r   r  r0  s     ` rm   r  .Scheduler.can_buffer_be_removed_through_fusion  sj    	$$T*00E VPUVV 41114333	
  		s   A 
A('A(c                   UR                   n[        U[        R                  R                  R
                  5      (       ax  UR                  =n(       ae  [        U5      u  pEU[        R                  ;   d  U[        R                  ;   a0  [        U[        R                  R                  5      (       d   eSU 3$ [        R                  R                  R                  R                  (       d  [        R                  c  g[        U[         5      (       a0  UR"                   H  nU R%                  U5      nU(       d  M  Us  $    gUR                   c   eUR'                  5       (       d  UR)                  5        S3$ [        UR                   [        R*                  5      (       a  g[        UR                   [        R,                  5      (       a  g[/        UR                   SS5      (       a  g[1        UR                   5      (       a  g	U R3                  U5      =n(       a  U$ [        R                  R4                  (       a  [7        U5      (       a  g
g)zr
Return the reason why we should partition the inductor graph on this node,
or None if the node is cudagraphable.
zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)r   r   r  r  r+   r  r  rP   r(   custom_should_partition_ops_ops
OpOverloadr   r^   rH   r  r   r   should_partitionrU   r   
DeviceCopyrg  r  rT   &_uses_cudagraph_unsafe_unbacked_symintcudagraph_skip_dynamic_graphsr-  )rl   r   r0  r4  op_overload_packet_nameop_overload_namerg  r  s           rm   r
  Scheduler.should_partition  s    ))gu11@@AA%%%B%8DR8H5#'6+M+MM#v'I'II!"ejj&;&;<<<<./?.@AA &&--886>>FKd.//..u56!M % yy$$${{}}oo'(--dii//#dii00$4991488)!$)),,0@@FF6FM ==66-d33*rp   c                   [        5       n[        R                  (       d  U$ U R                   GH  nUR                  nUc  M  [        U[        R                  R                  R                  5      (       d  MJ  UR                  nUc  M[  [        U5      u  pVU[        R                  ;  a  U[        R                  ;  a  M  UR                  5        Hn  n[        R                  R                  R!                  U5      n[#        U[$        R&                  [$        R(                  45      (       d  M]  UR+                  U5        Mp     GM     U$ )zS
Collect output unbacked symints from ops in config.cudagraph_unsafe_unbacked_ops.
)r   r(   cudagraph_unsafe_unbacked_opsr>  r   r   r  r  r+   r  r  rP   r  rZ   r   r   r   r$   r%   UNBACKED_INTUNBACKED_FLOATr  )rl   unsafe_symintsr   r0  r4  r
  r
  syms           rm   &_get_cudagraph_unsafe_unbacked_symints0Scheduler._get_cudagraph_unsafe_unbacked_symints]  s    
 4><33!!JJDiiGgu'9'9'H'HII$$Bz8DR8H5#'v/S/SS$F,P,PP779gg&&//4!#(9(94;N;N'OPP"&&s+ :' 0 rp   c                    U R                  5       nU(       d  g [        U5      nU HM  n[        R                  R                  R                  U5      nUR                   H  nXb;   d  M
  SU 3s  s  $    MO     g )Nz'uses cudagraph-unsafe unbacked symint: )r
  r-  rZ   r   r   r   r"   )rl   r   r
  node_symbolsr
  simplified_symfree_syms          rm   r
  0Scheduler._uses_cudagraph_unsafe_unbacked_symint  sn     DDF5d;CWW--66s;N*77-DXJOO 8   rp   c                    0 nUR                  [        R                  R                  5        U R                   H4  nUR
                  R                  5        H  u  p4UR                  X'   M     M6     U$ )zf
Return a mapping from name strings to the corresponding graph inputs or
base scheduler node outputs.
)r  rZ   r   r  r>  r  rh  r   )rl   rJ  r   r   scheduler_buffers        rm   get_name_to_nodesScheduler.get_name_to_nodes  sd     PRAGG001JJD*.*>*>*D*D*F&%5%:%:" +G  rp   c           	        [        [        R                  R                  5       VVs0 s H  u  p#X2_M	     nnn[        [        R                  R	                  5       5       VVs0 s H  u  p#X2_M	     nnn/ [        R                  l        [        U5       H  u  pgUR                  (       a  M  / nUR                   H#  nUR                  UR                  U5      5        M%     / n	UR                   H1  n
U	R                  UR                  U
R                  5       5      5        M3     [        R                  R
                  R                  [        UUU	UR                  5      5        M     gs  snnf s  snnf )zj
computes a mapping from partition input/output indices to graph input/output
indices for each partition.
N)r=  rZ   r   r  r  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr3  rQ   constant_names)rl   
signaturesr>	  r   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingr   s              rm   compute_graph_partition_maps&Scheduler.compute_graph_partition_maps  s;    (11E1E'F%
'F)#DI'F 	" %
 (11I1I1K'L&
'L)#DI'L 	# &
 "$'0'<#L''
 M!--$$%>%B%B4%HI .  N!..%%&@&D&DT]]_&UV / GG""))! !",,	! (=%
&
s   E'"E-c                  ^     SS jm    SS jn[        5       R                  " S U 5       6 nUR                  " U4S jUR                  5        5       6   U" U5      n[        5       nU HG  n[        R
                  R                  R                  U5      nUR                  UR                  5        MI     [        [        U[        R                  " S5      S95      $ )	a9  
Returns all symbol inputs which are required to be in scope to successfully
perform codegen for this graph partition, including:
- free symbols used in partition nodes
- free symbols in partition input/node shapes, strides, and offsets. This is needed
  for recording cudagraphs for tensors with dynamic shapes.
c                    [        U [        R                  5      (       a
  [        5       $ [        U [        R                  5      (       a  [        U 5      $ [        S[        U 5       35      e)z?
Gets symbols used in input node shapes, strides, and offsets.
zUnsupported input node type: )r   r+   r  r   r  r(  r  r   r   s    rm   get_input_node_symbolsKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sT     $ 2 233!|#D")),,)$// *,I$t**VWWrp   c                &    [        S U  5       5      $ )z
Filters a set of symbols that are required for codegen. Skip symbols
that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
and SymT.R0_INDEX.
c              3     #    U  HV  n[        U[        R                  [        R                  [        R                  [        R
                  45      (       d  MR  Uv   MX     g 7fr|   )r$   r%   SIZEFLOATr
  r
  r   r  s     rm   r   VScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sI       A!		

))++	  s   AA 	A r   )symbolss    rm   filter_symbolsCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         rp   c              3  8   #    U  H  n[        U5      v   M     g 7fr|   r,  r=  s     rm   r   >Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     Iyt,T22yr  c              3  8   >#    U  H  u  pT" U5      v   M     g 7fr|   rz   )r   r  r   r6
  s      rm   r   rB
    s     N:Mwq$T**:Ms   r   r  )r   z+ir.IRNode | sympy.Expr | ir.TorchBindObjectr   OrderedSet[sympy.Symbol])r>
  rD
  r   rD
  )r   r  r  rh  rZ   r   r   r   r"   r  r:  
attrgetter)	rl   	partitionr(
  r?
  candidate_symbolsr  r  symplified_sr6
  s	           @rm   !get_graph_partition_symbol_inputs+Scheduler.get_graph_partition_symbol_inputs  s    	X=	X%	X 	-	%	, 7Al6H6HIyI7
 	  N+:K:K:MN	
 ++<=(2"A77++44Q7LJJ|001 #
 &(*=*=f*EFGGrp   c           
       ^ ^ / n[        [        R                  R                  5       5      nT R	                  5       nSUU 4S jjm[        [        U5      [        U5      5       GHj  u  pg[        5       nU H,  n	UR                  U	R                  R                  5       5        M.     UR                  U5      n
[        R                  R                  U V	s/ s H  oR                  PM     sn	5      n[        UR                  UR                   -   Vs/ s H&  n[#        U[$        5      (       a  M  UR&                  PM(     sn5      U-
  n[        U 4S jU 5       5      n[        5       nU H  n	UR                  U	R(                  5        M      X-
   Vs/ s H  nX;   d  M
  UPM     nnUR                  U5        U Vs0 s H  nX;   d  M
  XU   _M     nnU Vs0 s H  nX;   d  M
  XU;   _M     nnU Vs/ s H  nX;   d  M
  X;  d  M  UPM     nnU
R                  U5        [        U 4S jU
 5       5      n
U
 Vs/ s H  nT" U5      (       a  M  X_   PM     nnU Vs/ s H$  o[        R                  R*                  ;   d  M"  UPM&     nnT R-                  UU5      n[/        UUUUUU5      nUR1                  U5        UR3                  XJ-
  5      nGMm     USSS2   $ s  sn	f s  snf s  snf s  snf s  snf s  snf s  snf s  snf )z
Gets signature for each graph partition, including input nodes, output nodes, and
whether deallocating an input within graph partition.
c                   > TR                   R                  U S5      nUc  g[        UR                  R                  [
        5      (       a,  TR                  R                  U S5      =n(       a  T" U5      $ gg)z
Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
Buffers with NoneLayout are not allocated so graph partition should not
take them as inputs or outputs.
NFT)r[  r  r   r   r@  r?   r  )rZ  r   r  is_unallocated_bufferrl   s      rm   rM
  FScheduler.get_graph_partition_signature.<locals>.is_unallocated_buffer  sk     ""&&x6C{#((//:66 !% 7 7 ; ;Hd KK9K0;;rp   c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fr|   r  r  r   r   rl   s     rm   r   :Scheduler.get_graph_partition_signature.<locals>.<genexpr>N  ,      /1D ''++D771   (+c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fr|   rP
  rQ
  s     rm   r   rR
  x  rS
  rT
  Nr  )rZ  r"  r   r{   )r   rZ   r   r  r#
  rG  r   r  r  r  r  r*   r  r  r   r   r  r   r6   r   r  ro  rI
  r<   r   r  )rl   
partitionsskip_cudagraphsr+
  unmet_output_namesrJ  rF
  r'
  output_namesr   returned_output_namesr   r  partition_input_namesr  r   extra_input_namesr(
  input_deallocationextra_output_namesr)
  r*
  symbol_inputspartition_signaturerM
  s   `                       @rm   get_graph_partition_signature'Scheduler.get_graph_partition_signature  sZ    
'(@(@(BC--/	 	, *-Z (?";*
%I -7LL!##D$8$8$=$=$?@ " %1$=$=>P$Q! '11<<.78id!!i8K  "-!2!2[5G5G!G!GA)!W5 !G  " %/ /1/ %!
 5?L !$++DOO< " 2@!@D' @  !
 "(():; 21D' )4((1   2"1D' 32221  " 2"1D' ,0,L 1  " "(();<$. /1/ %! 21D,T2 #"1   "7!6!''BSBS:S!6   !BB;M #:"# 12!6!<!<":"K*
R $B$y 9*!
""s`   K
K
,K
	K$K$9	K)	K)	K."	K.1	K3>K3K37K8K8!K=?K=c                   UR                   R                  5        VVs0 s H'  u  p#U[        R                  R                  ;  d  M%  X#_M)     nnnUR
                  R                  5        VVs0 s H'  u  p%U[        R                  R                  ;  d  M%  X%_M)     nnnUR                   Vs/ s H3  nUR                  5       [        R                  R                  ;  d  M1  UPM5     nnUR                   Vs/ s H%  nU[        R                  R                  ;  d  M#  UPM'     n	n[        UR                  UUUUR                  U	5      $ s  snnf s  snnf s  snf s  snf )z
Updates the partition signature by removing buffers specified in
V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
)r(
  rh  rZ   r   r  r]
  r)
  maybe_get_namer*
  r<   r_
  r'
  )
rl   r/
  r   r  r(
  r  r]
  r   r)
  r*
  s
             rm   .clean_removed_buffer_from_partition_signatures8Scheduler.clean_removed_buffer_from_partition_signatures  sR    !* 5 5 ; ; =
 =177222 DL = 	 
 '99??A
A	177222 DIA 	 
 "..
.""$AGG,C,CC . 	 
 "00
0177222 0 	 

 '##$$
 	
)






s/   $EE,$EE+0EE5"EEc                  ^ ^^	^
^^^ SSK m	[        5       m/ m/ m[        U5       VVs0 s H  u  p#X2_M	     snnmSUU	UUU 4S jjm
SU
U4S jjnU H8  n[        UR                  R
                  5      TU'   TU   S:X  d  M0  T
" U5        M:     / nSnU[        U5      :  a  T(       d  T(       a  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  US-  nU[        U5      :  a  T(       a  M  T(       a  M  U[        U5      :  a  [        S5      eU$ s  snnf )ad  
Reorder nodes to minimize the number of partitions via a bfs
topological sort. This is the optimal reordering such that the
number of partitions cannot be reduced further. This may be
sub-optimal for other metrics such as peak memory. This does not
change relative orders of two cudagraphable nodes, nor the
relative order of two non_cudagraphable nodes.
r   Nc                   > TU    U 4nTR                  U 5      (       a  TR                  TU5        g TR                  TU5        g r|   )r
  heappush)r   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrl   s     rm   insert_pending_nodesHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  sA    ,T2D9O$$T**6H2ODrp   c                   > U R                   R                   H.  nTU   S:  d   eTU==   S-  ss'   TU   S:X  d  M&  T" U5        M0     g r  )r  
succ_nodes)r   	succ_nodero
  node_to_indegrees     rm   update_indegreeCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  sO    !]]55	'	2Q666 +q0+#I.!3(3	 6rp   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r   r_   r   rv  )	rl
  rF  r=  r   r  
pred_nodesheappopr   rq  )rl   r>  r>	  r   ru
  r  	num_itersr  rk
  rl
  ro
  rt
  rm
  rn
  s   `       @@@@@@rm    reorder_for_minimizing_partition*Scheduler.reorder_for_minimizing_partition  s_    	9=CEGI4=e4DE4Dys4DE	E 	E	4 	4 D%()A)A%BT"%*$T* 
 -/	#e*$#':)--(?@%% *)
 &--(;<%% &%
 NI #e*$##':': s5z!  ] Fs   E(c           	     R   SSK JnJn  [        [        R
                  R                  5       5      nU" UU R                  U R                  [        [        R
                  R                  R                  5       5      U5      u  pVU R                  U5      nU" XvU5      u  pXS-  :  a  U$ U$ )z`
Reorder nodes to minimize the number of partitions if this only slightly
increase peak memory.
r   )estimate_peak_memoryprepare_planning_infor	  )r  r~
  r
  r   rZ   r   r  r[  r1  r  r  r{
  )
rl   r>  r~
  r
  r  default_peak_memoryrH  reordered_nodesreorder_peak_memoryr  s
             rm   r  0Scheduler.maybe_reorder_for_minimizing_partition  s     	H"177#;#;#=>:O##qww++0023;
7 ??F!5"

 s!::""rp   c                4   / n/ n/ nSS jnU H  nU R                  U5      SLnU(       a,  [        UR                  5      S:X  a  UR                  U5        MI  U(       a   U" U5      (       a  UR                  U5        Mp  UR                  U5        M     X#-   U-   $ )z
Reorder a node if it should be partitioned and has simple dependency:
1. move a partitioned node to the front if it has no dependency
2. move a partitioned node to the back if it is only used by OutputNode
3. otherwise do not reorder
c                    U R                  5        H8  nUR                   H%  n[        UR                  [        5      (       a  M$      g   M:     grb  )r  r0  r   r   rc  )r   r   rd  s      rm   only_output_userPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user.  s<    '')99C%chh
;;$ % * rp   Nr   r   )r
  r   r  r   )rl   r>  frontmiddlebackr
  r   r
  s           rm   r  6Scheduler.reorder_for_partition_with_simple_dependency   s     *,*,(*	 D#44T:$FC(?(?$@A$ET"!&6t&<&<D!d#  ~$$rp   c                   / nSn/ n/ nU R                    HY  nU R                  U5      SLnU(       a)  X&:w  a$  UR                  U5        UR                  U5        / nUnUR                  U5        M[     U(       a"  UR                  U5        UR                  U5        [        R                  R
                  nUS:  a^  [        [        X5      5       HF  u  nu  pU
(       a  M  [        S U	 5       5      nX:  d  M*  SXH'   [        R                  SUUU5        MH     U R                  XS9nU R                  U5        U R                  X5        X4$ )zz
Given a list of BaseSchedulerNodes, split into a list of
graph partitions and compute partition input/output signatures.
TNr   c              3  T   #    U  H  n[        U[        5      (       a  M  S v   M      g7fr   N)r   r  rK  s     rm   r   ,Scheduler.graph_partition.<locals>.<genexpr>`  s$      '!*A)!-CD !*s   (	(zFPartition %d has %d kernels, below minimum size %d, skipping cudagraph)rV
  rW
  )r>  r
  r   r(   r   cudagraph_min_partition_sizer=  rG  r   cudagraphs_logr  ra
  r2
  _log_graph_partitions)rl   rV
  r'
  cur_partitionrW
  r   node_should_partitionmin_sizer:  rF
  skipkernel_countr+
  s                rm   r  Scheduler.graph_partition@  sU    +-
')JJD$($9$9$$?t$K!!H!!-0&&~6 "2N  &  m,"">2 ====a<(1#j2R(S$$It#& '!*' $L
 $.-1*&,,d($	 )T" 77! 8 

 	))*5""::%%rp   c                   [         R                  [        R                  5      (       d  g [	        S [
        R                  R                   5       5      nU(       d  g [        S U 5       5      n[        U5      U-
  n[         R                  S[        U5      UU5        [        [        X5      5       H  u  nu  px[         R                  SU[        U5      UR                  (       a  SOS[        UR                  5      [        UR                  5      5        UR                  (       d  Mw  U H  n	U R!                  U	5        M     M     g )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr|   )rU   )r   rc  s     rm   r   2Scheduler._log_graph_partitions.<locals>.<genexpr>  s     O:NVF^^:Nr  c              3  J   #    U  H  oR                   (       a  M  S v   M     g7fr
  )r'
  r<
  s     rm   r   r
    s     !PZ?O?O!!Zs   #	#zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)r
  r  r  r  r   rZ   r   device_typesr   r   r  r=  rG  r'
  r(
  r)
  _log_non_cudagraphable_node)
rl   rV
  r+
  has_gpu_devicecudagraphable_countnon_cudagraphable_countr:  rF
  r/
  r   s
             rm   r
  Scheduler._log_graph_partitionsw  s   
 **7==99 O!'':N:NOO!!PZ!PP"%j/4G"GQ
O#		
 *33z3N)O%A%	  EI'0'?'?#_I))*I**+ '''%D44T: & *Prp   c                   U R                  U5      nU(       d  gUR                  5       nUR                  b  UR                  R                  5       OSnSU 3/n[	        UR                  5      R
                  nUR                  SU 35        UbF  UR                   SSR                  S UR                   5       5       S3nUR                  SU 35        [        R                  S	USR                  U5      5        Uba  UR                  R                  S
S5      nU(       a=  UR                  5       R                  S5       H  n	[        R                  SU	5        M     ggg)z)Log details for a non-cudagraphable node.Nzreason=zir=r  r  c              3  8   #    U  H  n[        U5      v   M     g 7fr|   )r"  )r   r`  s     rm   r   8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>  s     2P<a3q66<r  r  zfx=z
    %s: %sr  r  z         %s)r
  r3  r   r  r   r~   r   r  r*  rO  r
  r  r  r  stripsplit)
rl   r   r  rN  r  partsir_typefx_strr  lines
             rm   r
  %Scheduler._log_non_cudagraphable_node  s,   &&t,MMO	151F$))++-D6(#$tyy/**s7)_%'q2P7<<2P)P(QQRSFLL3vh(\9dii6FG !,,**=$?K'--/55d;D"((= <  rp   c                    [        S5         [        R                  R                  R                  (       a  U R                  5       OU R                  U R                  5       sS S S 5        $ ! , (       d  f       g = f)NzScheduler.codegen)r   r  r  r(   r  _codegen_partitions_codegenr>  rk   s    rm   r  Scheduler.codegen  sO    -. ??))99 ((*]]4::. /..s   AA++
A9c                l   SSK Jn  [        R                  R                  n[        U R                  5      n[        R                  R                  5          [        R                  R                  SSU 3UUS9  U R                  U5        [        [        R                  R                  U5      (       d   eU R                  U5      nU[        R                  R                  l        [        R                  R                  R                  5         [        R                  R                  n[        R                  R                  R                  [        R                  R                   5      u  pxSSS5        [        R                  R                  R#                  WW5        [        R                  R                  R%                  XR5        [        R                  R                  R&                  R)                  UR*                   V	s/ s H  oR-                  5       PM     sn	5        g! , (       d  f       N= fs  sn	f )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r  r
  rZ   r   rU  r<  rn  set_current_wrapper_codeinit_wrapper_coder
  r   re
  r
  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  r)
  r3  )
rl   rF
  r/
  r
  r
  graph_partition_id
graph_namepartition_coder  r   s
             rm   _codegen_partition_wrapper$Scheduler._codegen_partition_wrapper  s    	Bgg22!$"?"?@WW--/GG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQQKKIVI8AAGG  5GG  --/J ! 4 4 = =agg>R>R SN/ 02 	
88^T	334FR	&&--)2)?)?@)?]]_)?@	
9 0/: As   DH ?H1 
H.c                P   ^ ^^ [         R                  SUU U4S jj5       nU" 5       $ )Nc               3    >#    TR                  T T5        TR                  (       a  [        TR                  R                  5      (       a[  TR                  R                  c   S5       e[
        R                  R                  R                  TR                  R                  5         S v   TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        g ! TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        f = f7f)Ndevice should have an index)
%update_graph_partition_default_deviceru  rK   r   r   rZ   r   rU  codegen_device_guard_entercodegen_device_guard_exit)rV
  rl   r+
  s   rm   ctx1Scheduler.use_default_device_context.<locals>.ctx  s    66z:N**/@++000 0 2288D 1D $$??//553..3D//444 4 GG((BBD.2+	 ..3D//444 4 GG((BBD.2+s    B#E9'D +A%E9A&E66E9)r   zIterator[None])
contextlibcontextmanager)rl   rV
  r+
  r
  s   ``` rm   use_default_device_context$Scheduler.use_default_device_context  s+     
	"	"	3 	3 
#	3* urp   c                N   [        U5      S:X  a  US   R                  (       d  g SS jn      SS jnS n[        X5       H   u  pgUR                  (       a  M  U" U5      n  O   Uc  g [        X5       H'  u  pgUR                  (       d  M  U" Xe5      (       a  M'    g    XPl        g )Nr   r   c                6    U S   R                  5       nUc   eU$ r   r  )rF
  partition_devices     rm   get_cudagraph_partition_deviceWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s'    (|668#///##rp   c                D    U  H  nUR                  5       nX1:w  d  M    g   grb  r  )rF
  target_devicer   rc  s       rm   all_on_target_deviceMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s(     "**  " rp   )rF
  r`   r   r  )rF
  r`   r
  r  r   r{   )r   r'
  rG  ru  )rl   rV
  r+
  r
  r
  cudagraph_partition_devicerF
  r/
  s           rm   r
  /Scheduler.update_graph_partition_default_device  s     z?a
1(D(D 	$
	$	5A		 &*"$'
$? I+++-KI-V* %@ &-$'
$? I'''0D1 1 	 %@ 'A#rp   c                   U R                  5       u  p[        U5      S:  a  [        S   S==   [        U5      -  ss'   U R                  X5         [	        X5       H\  u  p4[        U5      S:  d   S[        U5       35       eUR
                  (       a  U R                  U5        MK  U R                  X45        M^     SSS5        [        U R                  5      n[        R                  R                  R                  U5        US:  as  [        R                  R                  c   eU[        [        R                  R                  5      :X  d.   SU S[        [        R                  R                  5       35       egg! , (       d  f       N= f)	z
Split nodes into partitions and codegen each partition into separate functions.
This allows further applying different optimizations (e.g., cudagraph) to
each function.
r   r  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  r   r   r
  rG  r'
  r
  r
  r<  rn  rZ   r   rU  set_all_partition_namesr&
  )rl   rV
  r+
  rF
  r/
  num_partitionss         rm   r
  Scheduler._codegen_partitions0  sO    "&!5!5!7
z?QZ !78C
OK8,,ZD(+J(C$	9~* KCPYNK[\* ++MM),33II )D E d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@  EDs   A,E<<
F
c                   [         R                  (       a  SS Kn[        R                  " 5       n[        5       n[        U5       H  nUR                  S:X  a0  UR                  UR                  R                  R                  :X  a    OTUR                  UR                  4nXd;  d"   SUR                   SUR                   S35       eUR                  U5        M     U R                  U l        U R                   b   eU R                  (       aG  [         R"                  R$                  (       a(  [&        R(                  R*                  R-                  5         [&        R(                  R*                  R/                  5         U GH  n[0        R3                  [4        R6                  5      (       a4   [0        R9                  SUR;                  5       UR=                  5       5        U RA                  U5        [         RB                  (       aD  [&        R(                  R*                  RE                  S URF                  RH                   5       5        URK                  5       =n(       Ga  XR                  :w  d*  URM                  5       (       d  URO                  5       (       a  U RQ                  5         XR                  :w  Ga>  U R                  (       ai  [S        U R                  RT                  5      (       aE  U RV                  b  U RY                  5         [&        R(                  R*                  R[                  5         Xl        [S        URT                  5      (       a  UR\                  c   S	5       eS
n	U R_                  5       (       a:  [        U R`                  Rc                  5       5      n
U
(       a  [e        U
5      S
-   OS
n	[&        R(                  R*                  Rg                  UR\                  U	U Rh                  5        U R_                  5       (       a  U R                  b  U Rk                  U5        [&        R(                  R*                  Rm                  S URF                  RH                   5       5        Xpl7        U Rp                  Rs                  URt                  5        URO                  5       (       aN  URw                  [y        UR{                  5       5      5      u  pnU R}                  U5      R                  XU5        GOUURM                  5       (       a  U R                  U5        GO-UR                  5       (       a  [        R                  " [        U5      nU R}                  U5      nS
SKEJFn  S
SKGJHn  S
SKIJJn  [        UUUU45      (       a  UnO[        S[U        U 5      < 35      eUR                  U5        O[        U[        5      (       a!  U R}                  U5      R                  U5        Oc[        U[        [        45      (       a!  U R}                  U5      R                  U5        O'[        U[        5      (       d   eUR                  5         [         R"                  R                  (       a  U R}                  U5      R                  5         U R                  Rs                  UR                  5       5        U R                  Rs                  UR                  5       5        [        U[        5      (       dW  URK                  5       nUbD  URT                  S:w  a4  U R}                  U5      R                  5       (       a  U RQ                  5         [        S UR{                  5        5       5      (       a	  Xpl        GM  S U l        GM     U R                  U R                  :w  a[  U R                  c   e[S        U R                  RT                  5      (       a(  [&        R(                  R*                  R[                  5         S U l        U RQ                  5         g ! [>         a(    [0        R9                  SUR;                  5       5         GNgf = f)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   %Scheduler._codegen.<locals>.<genexpr>  s      D(>HH(>r  r
  r   c              3  8   #    U  H  oR                   v   M     g 7fr|   r  r  s     rm   r   r
    s      C$:S$:r  )CUDACombinedSchedulingr  )XPUCombinedSchedulingztype(self)=r  c              3  B   #    U  H  n[        U[        5      v   M     g 7fr|   )r   r   rK  s     rm   r   r
    s     J9IA:a//9IrR  )]r(   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  ru  r  rr  r   autotune_at_compile_timerZ   r   rU  write_get_raw_stream_headerregister_alignment_check_inputsr  r  r  r  r  r3  r  r  r 
  size_assertscodegen_deferred_input_assertsr   r   r   ru  rr  r	  rK   r   current_stream_idxgenerate_stream_ctx_exitr
  r   r  r~  r   r  r
  r  generate_stream_ctx_switching!codegen_deferred_alignment_copiesrs  r  r  r  rC  r  r   r  codegen_templater	  rx  r  r  rC   codegen.cuda_combined_schedulingr
  r  r  #codegen.xpu.xpu_combined_schedulingr
  r   r  codegen_combo_kernelr  codegen_mix_order_reductionr   r   codegen_noder  r  debug_sync_kernelcodegen_syncr  rE  r  r   ready_to_flushr   )rl   r>  r  stackrb  framer  r   rc  num_streamsunique_streamsr@  rA  rB  backend_r
  r  r
  r  s                      rm   r
  Scheduler._codegenP  s8   44.++-E7A|D!% JJ"22%--*E*E*N*NN~~u||4 ,U^^,<Aell^ LJ J
  ) #99!!))) &&6==+Q+QGG  <<> 	
<<>D..
IIO224 t$ ""$$CC D(,(8(8(>(>D  **v*111~~''''))JJL000**/@++000 0  22> 99;,,FFH*0'(55%||7V9VV7&'7799-78K8K8R8R8T-UN;IN 3a 7q ( ,,GG"LL' ;; ++--$2E2E2Q2248 GG  BB C$($4$4$:$:C  !%%%,,T__=!!484W4W)*51   (99!X !!((.""{{#=tD++F3T8V#%;=RS  'G(KDJ=)9::,,T2D"9::  (DDTJD#5}"EFF  (55d;!$(>???? }}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;;*&v-((0??AAJJLJ9IJJJ%)"%)"E H $"="== &&222 !4!4!9!9:: $$>>@!

O ! IIPs   *3`.aac                    US   R                  5       nU [        R                  l        X0l        Uc   eU R                  U5      nUR                  X5      $ )r  r   )r   rZ   r   r+  r  r  benchmark_combo_kernel)rl   r  node_benchmark_resultsrc  r  s        rm   r   Scheduler.benchmark_combo_kernel  sU     1((* $!!!""6*--iPPrp   c                \  ^ UnUS   R                  5       m[        U4S jU 5       5      (       d   S5       e[        R                  (       d  gSSKJn  S/ pT0 n[        U5       H  u  pxUR                  5       n	U R                  U	5      (       a  [        R                  S5         U R                  U	5      u  pX4Xh'   [        R                  " U
5      (       a  [        R                  SU5          g	 XJ-  nUR                  U5        M      U R                  X&5      u  pnX-
  S:  =(       d    US:  n[        R!                  ["        R$                  5      (       aS  XM:  d  U(       a$  [        R                  S['        XM-  S 5      5        O#[        R                  S[)        XM-  S 5      5        X-
  U:  =(       d    U$ ! U a0  nS
[        U5      ;   a  [        R                  S5         SnA  ge SnAff = f! U a/  nS
[        U5      ;   a  [        R                  S5         SnAge SnAff = f)r  r   c              3  H   >#    U  H  oR                  5       T:H  v   M     g 7fr|   r  )r   r   rc  s     rm   r   4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>  s     K?4??$.?r	  z<All nodes in a combo kernel group must be on the same deviceTr  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr-  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r   r(   r  r5  r  r=  r   r  r  r  r  r  r/  r"  r   r  r  r  rE   rF   )rl   r>  subkernel_nodesr  r  
path1_listr  r:  rg  r  r"  r  r  r  	ms2_clone_path2_listsmall_kernelrc  s                    @rm   r  !Scheduler.speedup_by_combo_kernel  s      #..0K?KKK 	
J	
K ,,;rZ!#!/2HA)I ##I..  R55i@13
&-::b>>$$U ! " ICd#9 3<	*.*E*E+'CK ,9c	""7==11yL  E#)C2
   I	#0
 $44Q $ *c!f4$$]      	&#a&0  Y 	s=   %A	F=G6 =G3$G.-G..G36H+<$H&%H&&H+c                r    U R                   U   nUR                  c   eUR                  R                  5       $ r|   )r[  r   
get_layout)rl   rZ  r   s      rm   get_buffer_layoutScheduler.get_buffer_layout_  s5    x(xx###xx""$$rp   c                   U R                    H  nUR                  5       (       d  M  UR                  R                   H  n[        R
                  R                  R                  UR                  5      nU(       d  M?  [        U5      S:X  d  MP  [        UR                  [        [        45      (       a  Mw  UR                  5       / :X  d  M  [        R
                  R                  R!                  UR                  5        M     M     g r`  )r>  rU   r   r   rZ   r   r  r  r   r;   r   r@  r?   r>   r  zero_dim_cpu_tensor_listr  )rl   r   r  r  s       rm   rt  $Scheduler.update_zero_dim_cpu_tensord  s    JJD{{}} ,,22DWW3377		BF+F3u< *"MMJ8I+J! ! #OO-388<<TYYG 3 rp   c                J    U R                   b  U R                   R                  $ g)z:CUDA Stream index that current scheduler node assigned to.N)r  r  rk   s    rm   r
  Scheduler.current_stream_idxs  s%     ##/++666rp   c                8    U R                   =nb  [        U5      $ g)z9CUDA Stream name that current scheduler node assigned to.N)r
  r!   )rl   r  s     rm   current_stream_nameScheduler.current_stream_name{  s#     111J>":..rp   c                    [        U[        5      (       a   eU R                  U   n[        R                  R
                  R                  US9U l        g)z6Code-gen to enter the Stream context assigned to node.)r  N)r   r  r~  rZ   r   rU  codegen_cuda_stream_enterr  )rl   r   node_streams      rm   generate_stream_ctx_enter#Scheduler.generate_stream_ctx_enter  sL    d$:;;;;))$/#$77#7#7#Q#Q" $R $
 rp   c                    U R                   c   e[        R                  R                  R	                  5         SU l         g)z1Code-gen to exit from the current Stream context.N)r  rZ   r   rU  codegen_cuda_stream_exitrk   s    rm   r
  "Scheduler.generate_stream_ctx_exit  s2    ''333	557#' rp   c                <   XR                   ;   d   e[        U[        5      (       a  SOU R                   U   nU R                  U:X  a  gU R                  b  Uc  gU R                  c  Ub  U R	                  U5        gU R                  5         U R	                  U5        g)aM  Generate stream entering and exiting to properly run node in a multi-stream scenario.

Stream context switching is only generated if ``node``'s assigned stream is different from
the previous node's stream. NopKernelSchedulerNodes have stream=None and inherit the
enclosing stream context (or do nothing if no context is active yet).
N)r~  r   r  r
  r(  r
  )rl   r   r  s      rm   r
  'Scheduler.generate_stream_ctx_switching  s     ***** $ 677 $$T* 	
 ""f, $$0V^ $$,1C**40 ))+**40rp   )r  rn  r  r  rk  r  r  r  r  rs  ru  r  r  r  r[  rZ  r1  rJ  r~  r>  rj  r  ri  rr  r  rx  r  )r>  zlist[ir.Operation]r   rv  )r   z!dict[str, SchedulerDonatedBuffer]ru  rw  )rZ  r"  r   r$  )rZ  r"  r   r_   r   r{   ry  )rc  rz  r   rv  )r  r"  r   rv  )r   rE  r   r_   )rj  
str | Noner   r{   r  )rg  r_   r   rS  )r   r  r>  r  r   tuple[float, str]r|   r>  r  r  r{   r  rP  r   r"  )r  r   rc  r  r   r1  )r  ir.MultiTemplateBufferr   r{   )
r  ir.OperationBufferr  r3  r:  r$  r   r   r   rv  )r  r  r   r{   )r>  r  r  rP  r   z&tuple[LambdaFuture | None, ModuleType])r   r_   r   r_   r   rd   )r   r_   r   r_   )r   r_   r   r_   ra  OrderedSet[BaseSchedulerNode]r   r_   )r   r_   r   r_   rP  r}   ra  r5  )rT  ,dict[BaseSchedulerNode, list[PendingFusion]]ra  r5  r   rv  )
rf  1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]rc  &dict[BaseSchedulerNode, PendingFusion]rg  r6  ra  r5  r  r{   )ra  r5  rc  r8  )rr  r7  rs  r7  )r>  rS  r  r{   r   rS  )rV  rP  r   rv  r  )r>  rS  r  r{   r   r7  r!  )r   r_   r   r_   r  r$  r   r{   )r   r_   r   r_   r  z!tuple[str, ...] | OrderedSet[str]r   r"  r%  r   )r(	  r_   rA  r_   rW  r  r   r{   )r   r_   r   r_   r   z,tuple[int, SchedulerNode, sympy.Expr] | None)FT)
r   r_   r   r_   r^	  r{   r  r{   r   r{   )r	  r6   r   r_   r   r_   r   r{   )r  r3   r~	  r4   r   r{   )r  r3   r~	  r5   r	  r  r   r{   rN  )r   r3   r  r{   r   r$  )TFT)r   r_   r   r_   r  r{   r	  r{   r  r{   r   zint | tuple[int, int, bool])rr  r7  r   r7  )r>  r   r   r   )r	  r_   r   rv  )rc  r  r   BaseScheduling)rc  rz  r   r9  rw
  )r   r"  r  r  r   r{   )r   r_   r   r/  )r   rD
  )r   6dict[str, ir.IRNode | ir.TorchBindObject | sympy.Expr])r+
  list[GraphPartitionSignature]r   rv  )rF
  r`   r(
  r:  r   rD
  )rV
  list[PartitionType]rW
  z
list[bool]r   r;  )r/
  r<   r   r<   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])rV
  r<  r+
  r;  r   rv  )rF
  r`   r/
  r<   r   rv  )rV
  r<  r+
  r;  r   z'contextlib.AbstractContextManager[None]r  r  r   z%tuple[float, float, list[str | None]])r>  rS  r   r{   )rZ  r"  r   z	ir.LayoutrO  )r   r/  )qr~   r   r   r   r&  r  rI  rv  r  r  r  r  propertyr  setterr  r  rq  r  r]   r  r{  r[  rz  r  rt  r}  r|  r}  r   r  r  r  r  r  r  r  r  r  r  rD  r  rK  rQ  r\  rj  ro  rx  r  r  r2  r}  rO  r  r  r  r  r	  r	  r  r-	  rD	  r  r\	  r  rr	  rs	  r	  r  r	  r	  r  r  r  r	  r	  r	  r	  r  r 
  r  r
  rI   r
  r
  r#
  r2
  rI
  ra
  re
  r{
  r  r  r  r
  r
  r  r
  r
  r
  r
  r
  r  r  r  rt  r
  r#  r(  r
  r
  r   r  r  s   @rm   r*  r*    s;   
o9b	#6
p(S
Q & & ( (7#,"HMP^KZ+#Z ,	 6	S*4#&/<$6!F	808	8, %)	*  "	
 
&
> 
>*6
>	
>)0)	)VVDp'8&'8 +'8 	'8
 '8 
'8R
 OS0AK	/ P&P/@P	Pd>  ! 3	
 
0  ! '	
 3"D2$PD2 3D2 
	D2LO?PO? @O?  L	O?
 3O? O?bS2S @S0$K$ $U$(C&C C 
!	CJ3.j?6 &6  6  
;	6 p,&,/@,	,\7&7/@7	7r.2&.2/@.2MP.2	.2`$&$/@$	$6< < !< <	<
 
<|M&M/@M	M^[
&[
/@[
	[
z ;(; ); 	;
 
;z`&`/@`	5`L "*.zM zM !zM 	zM
 $(zM 
zMx;&;/@;	;z3)3)(93)BS3)	3)r%T '7J	; !.3*.mK mK !mK 	mK
 (,mK $(mK 
%mK^d d !d 
	dLR
&R
/@R
	R
h6 Q6	:6@4@4	4	8*4
) 
&'*%5$

+:
	
=~ ! !F%	"	? '1' 
'RBH BH LBH 
"	BHHK -K @JK 	&K Z"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@5&	B5&n";'"; 2"; 
	";H>0)
 )
 +)
 
	)
V-;X	06-A--A;X-A	-A^@obQ4Q	.QN5`%
H    
(1 1rp   c                  r  ^  \ rS rSrSU 4S jjrSS jrSS jr      SS jr      SS jr      SS jr	      SS jr
    SS	 jr        SS
 jr S       SS jjrS S jrS!S jrSS jrS"S jrSS jr    S#S jrS$S jr      S%S jr    S&S jr S     S'S jjrSrU =r$ )(r9  i  c                .   > [         TU ]  5         Xl        g r|   )r  r  r+  )rl   r+  r  s     rm   r  BaseScheduling.__init__  s    "rp   c                \    U R                   (       a  U R                   R                  5         g g r|   )r+  r	  rk   s    rm   free_buffers_in_scheduler(BaseScheduling.free_buffers_in_scheduler  s    >>NN'') rp   c                    [        5       $ )z0Return a set of .codegen.common.BackendFeature()r   rb  s     rm   get_backend_features#BaseScheduling.get_backend_features  s
    |rp   c                    [         e)z?
Check whether node1 and node2 can be vertically fused or not.
r  r  s      rm   r\	   BaseScheduling.can_fuse_vertical  
     "!rp   c                    [         e)zA
Check whether node1 and node2 can be horizontally fused or not.
r  r  s      rm   r]	  "BaseScheduling.can_fuse_horizontal  rK  rp   c                   UR                  5       n[        U[        R                  5      (       d  gUR	                  5       (       d  g[        UR
                  [        R                  5      (       a  [        UR
                  R                  5      S:H  =(       ap    [        UR
                  R                  S   [        R                  5      =(       a8    UR
                  R                  S   R                  5       UR                  5       :H  $ g)aF  
A Multi-Output Template (referenced in #144012) is a template node
with MultiOutputLayout, and its output buffers are instances of MultiOutput.
In this context, we verify whether node1 represents the Multi-Output Template
and node2 corresponds to one of its outputs. If so, we further check if
backend supports this fusion.

Fr   r   )r/  r   r+   r  rV   r   r=   r   rb  r  r3  )rl   r   r   r,	  s       rm   rT	  .BaseScheduling.can_fuse_multi_outputs_template  s     ..0,(9(9::5577ejj"..11EJJ%%&!+ Ouzz003RYY?OJJ%%a(113|7L7L7NN rp   c                   UR                  5       (       d  UR                  5       (       a  [        R                  X5      $ [        R	                  X5      (       a  [        X5      $ [        U[
        5      (       a  UR                  U5      $ [        U[        5      (       aU  [        U[        5      (       a@  [        UR                  [        R                  5      (       d   e[        R                  X5      $ [        R                  X5      $ )z
Fuse two nodes
)rx  rC  rt   r   r  r  r   r"  r  r   r   r+   r  r%  r/  r   r  s      rm   rt   BaseScheduling.fuse  s     !1!1!3!3-225@@77EE*588677??5))899j=?
 ?
 ejj"*D*DEEEE7EEeSS%**588rp   c                    [         e)zK
Process the iteration sizes in case a transformation needs to be applied.
r  )rl   r  s     rm   r  BaseScheduling.group_fn  rK  rp   c                    [         e)z
Given a template node, generate a kernel.

This function is only available for triton now. If the third-party backend behaves as a sub-class
of TritonScheduling, it can override it or reuse it.
r  )rl   rA  epilogue_nodesrS	  s       rm   r
  BaseScheduling.codegen_template  s
     "!rp   c                    [         ez4
Generate a kernel given a list of pre-fused nodes.
r  )rl   r>  r  r  s       rm   r  .BaseScheduling.generate_kernel_code_from_nodes  s
     "!rp   c                    [         erX  r  r  s     rm   r  BaseScheduling.codegen_node  
     "!rp   c                    [         er|   r  r  s     rm   r  *BaseScheduling.codegen_mix_order_reduction!  r  rp   c                    [         e)zd
Generate synchronization code for the kernel. This method depends on the hardware characteristics.
r  rk   s    rm   r  BaseScheduling.codegen_sync$  r\  rp   c                    g)z}
Check whether the backend is requesting the scheduler to flush the generated kernel.
If not supported, please return False.
Frz   rk   s    rm   r  BaseScheduling.ready_to_flush*  s    
 rp   c                    [         e)zM
Flush the generated kernel and python wrapper code to the source code file.
r  rk   s    rm   r	  BaseScheduling.flush1  r\  rp   c                    [         e)r  r  rJ  s     rm   r  $BaseScheduling.benchmark_fused_nodes7  
     "!rp   c                    [         e)zi
Benchmark a compiled module and return the execution time
in milliseconds on randomly generated inputs.
r  )rl   r  s     rm   r  )BaseScheduling.benchmark_codegened_module@  s
    
 "!rp   c                    g)zt
Return an unsigned integer which represents the priority of this fusion pair.
The smaller is with higher priority.
r   rz   r  s      rm   r	  'BaseScheduling.get_fusion_pair_priorityG  s     rp   c                    [         e)z
Benchmark the list of nodes to combine and return the execution time
and memory copy time in milliseconds on randomly generated inputs.
r  )rl   r  r  s      rm   r  %BaseScheduling.benchmark_combo_kernelP  rg  rp   c                    U(       a9  SSK Jn  U" UU5      n[        R                  R                  R                  X$5        g g )Nr   )'set_kernel_post_grad_provenance_tracing)r  ro  rZ   r   rU  write_provenance_debug_handle)rl   node_scheduler  ro  debug_handles        rm   codegen_commentBaseScheduling.codegen_commentY  s<    
 UBL GG  >> rp   r  )r+  zScheduler | Noneru  )rc  r  r   zOrderedSet[BackendFeature]r!  r  )r  r  r   z"tuple[tuple[sympy.Expr, ...], ...])rA  r_   rU  r  rS	  r  r   r/  r|   r2  )r   z"FusedSchedulerNode | SchedulerNoder   rv  )r   r  r   rv  rw  r0  )r  r   r   r1  r%  r=  )rq  r  r  r/  r   rv  )r~   r   r   r   r  rD  rG  r\	  r]	  rT	  rt   r  r
  r  r  r  r  r  r	  r  r  r	  r  rs  r   r  r  s   @rm   r9  r9    s   #*"&"/@"	""&"/@"	"&/@	49&9/@9	9("3"	+""(" 4" 4	"
 
"$ %)		"*	" 	" "		"
 
	""""""0"	""&/@	"4"	." #'2   
	 rp   r9  )r   z$torch._inductor.codecache.LocalCache)rg  r_   r   r"  )rg  r_   r   zCallable[[Any], Any] | None)rg  r_   r   r  )r  r   r   r"  )r   r_   r1  rK  r[  r  r   rv  )r|  )FusedSchedulerNode | GroupedSchedulerNoder   rv  )r|  ru  r+  r*  r   rS  r   rv  )rz   )r  zlist[list[int]]r  rx  r  rv  r   z	list[int])r  r3  rb  r4  r   rv  rQ  )r  r$  r  r$  r  r$  r  r$  r  rC   r   ztuple[int, int])r  r  r  r  r  r$  r  r$  r  r$  r  r$  r  rC   r   r{   rw  )r   z	ir.IRNoder   rD
  )r   r_   r   rD
  )rA  r_   r   r{   )r   r_   r   r_   )
__future__r   r  r
  r{  r  r	  r
  r  r  r:  r  r  r  r
  r  r   r   concurrent.futuresr   r   r   r	   r
   r   r   typing_extensionsr   torch.utils._ordered_setr   r+   r   r   collections.abcr   r   r   typesr   torch._inductor.codegen.wrapperr   r  r   r   r  torch._inductor.async_compiletorch.utils._pytreer~  _pytreere  torch._dynamo.utilsr   r    torch._inductor.autotune_processr   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r    torch._inductor.stream_utilsr!   %torch.fx.experimental.symbolic_shapesr"   torch.utils._sympy.symbolr#   r$   r%   torch.utils._tritonr&   r  r'   r(   r)   r*   r,   analyze_preserves_zero_maskr-   codegen.commonr.   r/   r0   comm_analysisr1   r2   r3   r4   r5   r6   excr7   r8   fx_utilsr9   r:   r;   r<   r=   r>   r?   r  r@   r  rA   rB   runtime.hintsrC   rD   runtime.runtime_utilsrE   rF   r   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   virtualizedrZ   	getLoggerr~   r  _logginggetArtifactLoggerr  r&  r  r
  r  r`   r   ra   rb   	dataclassrd   r   r   r)  r~  r_   r   r  r  rr  r  r  rB  rc  r0  r  r  r   r  r  r   r  r%  rC  r  r  r  r  r  r  r-  rm  rl  r"  r(  r-  r8  r=  r?  rB  rD  rF  r*  r9  rz   rp   rm   <module>r     s   "          	     , 3 B B ' / ) << J5   $ $ $ 6 E ? 7 M 8 > O O * D D D M M ; : 2 $    J : 7 &    (  !^^--hA
NN44XO  >>;;$  11(LI 34y 4T]t_ D D D* ( ( (Y Yx h8 h8 h8V 1_ 1 1H1 H1V 2 2,' #L T"
 
 #
*  *K
*K4*K ,*K 
	*KZ"* 1 "*J5. 5S*% S*l
:	$: $ 
	8k** k*\[G0 [G|:-+= :-zC:!3 C:Lb, bP #%+#++  + 	+\0%01C0	08
1((( ( 	(
 #( (6		  	
   # 
@ 
 
 
> %??, 4$
&""P@nJ1 nJ1bUx xrp   