
    3j*[                      % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJrJrJrJrJrJ r J!r!  S SK"J"r"  S SKJ#r#  S SK	J$r$  S SK%J&r&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0  S SK1J2r2J3r3J4r4  S S	KJ5r5  S SK6r6S SK7r7S SK8J9s  J:r;  S S
K<J=r=  S SK>J?r?  S SK@JArA  S SKBJCrC  S SKDJErE  S SK8JFrFJGrG  SS/rHS SKIJJrJ  S SKKJLrLJMrMJNrNJOrO  \-(       ah  S SKJPrPJQrQJRrR  S SKSJTrT  S SK7JUrUJVrVJWrW  S SKXJYrY  S SKZJ[r[  S SK\J]r]  S SK^J_r_  SSK`Jara  SSKbJcrc  SSKdJere  SSKfJgrg  SS KhJiriJjrjJkrkJlrlJmrmJnrn  SS!KoJprp  SS"KqJrrrJsrs  / S#Qrt\0" S$5      ru\R                  GSLS% j5       rwS S&KxJyry  S S'KzJ{r{  S S(K|J}r}  S S)K~Jr  S S*KJr  S S+KJr  S S,KJrJrJrJrJr  S S-KJrJr  S S.KJrJr  SS/KJr  SS0KJr  \R                  S1:H  r\GR,                  " \5      r\0" S25      r\\6GR6                  \6GR6                  4   r\7GR:                  \-  \7R                  -  S-  r\(       a  S3O\GR@                  " S4S55      rS6S7S8\ 3S9.rS:rS:rS:rS;r\E" \7GRN                  \7GRP                  \7GRR                  \7GRT                  \7GRV                  \7GRX                  \7GRZ                  \7GR\                  \7GR^                  \7GR`                  \7GRb                  \7GRd                  \7GRf                  \7GRh                  \7GRj                  /5      rS<\S='   S>r\\S-
  -  S :X  a  \S?:  d   S@5       eGSMSA jrGSNSB jr " SC SD\6GRv                  5      r\GRz                  " SESF9 " SG SH5      5       rGSOGSPSI jjr   GSQ         GSRSK jjr   GSQ         GSRSL jjr\R                  GSSSM j5       rGSTSN jrGSUSO jrGSVSP jrGSWSQ jrGSXSR jrGSYSS jr    GSZST jrGS[SU jrGS\SV jr    GS]SW jrGS^SX jrSY 4     GS_SZ jjr        GS`S\ jrGSaGSbS] jjr  GSc         GSdS^ jjr     GSe             GSfS_ jjrGSgS` jrGShSa jrGSiSb jrGSjSc jrGSkSd jr\3" Se5      r\0" SfSESg9r\\(\&\4   \4   r " Sh Si\,\)\\4   5      rGSlSj jr    GSlSk jr    GSmSl jr    GSnSm jr      GSoSn jr      GSpSo jr GSq     GSrSp jjr      GSsSq jrGStSr jrGSuSs jrGSvSt jrGSwSu jrGSxSv jrGSySw jrGSzSx jrGS{Sy jrGS|Sz jr\" / S{Q5      r    GS}S| jrGS~S} jrGSS~ jrS SKrGSS jr/ rS[\S'   GSS jrGSS jr\GR                  GSS j5       r\GR                     GS       GSS jj5       r\r\r\rSJS.GSS jjrSJS.       GSS jjr\RF                  " S?5      GSS j5       r " S S\+5      r\GRz                   " S S5      5       Gr  " S S5      Gr " S SG\5      Gr\GR                  GSS j5       Gr " S S5      Gr " S SG\5      Gr\R                  GSGSS jj5       Gr\RF                  GSS j5       Gr\RF                  GSSS j5       GrGSS jGr	 GSq       GSS jjGr
GSGSS jjGr      GSS jGrGSS jGrGSS jGrSJSJSES.         GSS jjGrSSJS.       GSS jjGr GS     GSS jjGrSJS.       GSS jjGrSJS.       GSS jjGr        GSS jGr\RF                  " SS9GSSS j5       Gr\RF                  " SS9GSSS j5       GrS Gr\RF                  " SS9GSSS j5       Gr                  GSS jGrGSS jGr  GS                 GSS jjGrGSS jGr\\6GR6                  -  GrS\S'   \R                   GS         GSS jj5       Gr\R                  GSS j5       Gr\R                  GSS j5       Gr \R                  GSS j5       Gr!\R                  GSS j5       Gr"GSS jGr#GSS jGr$GSS jGr%GSS jGr&GSS jGr'        GSS jGr(    GS               GSS jjGr)GSSS jGr* " S S5      Gr+        GSS jGr,        GSS jGr-GSS jGr.GSS jGr/GSS jGr0        GSS jGr1        GSS jGr2\GR                        GSS j5       Gr3 GSq     GSS jjGr4GSS jGr5GSS jGr6GSS jGr7GSS jGr8GSS jGr9GSS jGr:\GR                  GSS j5       Gr;GSS jGr<\R                  GSS j5       Gr=\R                  GSS j5       Gr>\R                  GSS j5       Gr?GSS jGr@GSS jGrAGSS jGrBGSS jGrCGSSS jGrDGSSS jGrEGSS jGrFGS|S jGrG " S S\GR                  5      GrI          GSS jGrJGSS jGrK    GSS jGrL GSq     GSS jjGrMGSS jGrN GSq     GSS jjGrOGSS jGrP      GSS jGrQ        GSS jGrRS 4           GSS jjGrSS 4           GSS jjGrTGSS jGrUGSS jGrVGSS jGrW\GRz                   " S S5      5       GrX\GR                  GSS j5       GrYGSS jGrZGSS jGr[GSSS jGr\GSS jGr]GSS jGr^              GSS jGr_GSS jGr`GSS jGraGSS jGrbGSS jGrc        GSS jGrdGSS jGre        GSS jGrfGSGS  jGrg GSq       GSGS jjGrh      GSGS jGriGSGS jGrj      GSGS jGrkGSSGS jGrlGSGS jGrmGSGSGS	GS
GSGSGSGS.GrnG\nGR                  5        V Vs0 s H  u  pX_M	     snn Grp\GR                  " GS5      GrrGSGS jGrsGSGS jGrtGSGS jGruGSGS jGrv\R                  GSGS j5       Grw\GRz                   " GS GS5      5       Grx0 GryGS\GS'           GSGS jGrz\E" 5       Gr{GS\GS'   GSGS jGr|GSqGSGS jjGr}GSGS jGr~\0" GS5      Gr\0" GS5      Gr " GS  GS!\ G\G\4   5      Gr\2" SEGS"9GSqSESF.GSGS# jjj5       GrGSGS$ jGr " GS% GS&\GR                  5      Gr\R                  GSGS' j5       GrGSSGS( jGrGSGS) jGrGSGS* jGrGSGS+ jGrGSGS, jGrGSLGS- jGrGSGS. jGrGSSGS/ jGrGSGS0 jGrGS1GrGSGS2 jGrGSGS3 jGrGSGS4 jGr  GS         GSGS5 jjGrGSGS6 jGrGSGS7 jGrGSSGS8 jGrGSGS9 jGrGSGS: jGr\GRz                  " SESF9 " GS; GS<5      5       Gr\GS=\&4   Gr\G\G\/G\4   Gr " GS> GS?5      GrG\" 5       GrGSGS@ jGrGSGSA jGrGSGSB jGrGSGSC jGrGSGSD jGr\E" / GSEQ5      GrGSGSF jGr\#GSGSG j5       GrGSGSH jGr              GSGSI jGr      GSGSJ jGr GS       GSGSK jjGrgs  snn f (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)	lru_cache)StringIO)AnycastConcatenateGenericLiteral
NamedTupleProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVar)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table_sympy_subs)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)Path)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node)ScalingType   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     [          V s/ s H*  n [        [        U 5      R                  5       (       d  M(  U PM,     nn [	        U5      S::  d   e[	        U5      S:X  a  SnU$ UR                  5       nU$ s  sn f )Nr7   r   rH   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      O/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/_inductor/utils.pyget_gpu_typerX   j   sh    &KY'%*;*H*H*J!YJKz?aZA-vHO 4>>>3CHO Ls
   'A2A2)get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32_TspvTORCHINDUCTOR_XPU_KERNEL_FORMATzebinz.cubinz.hsaco.)rH   hiprJ         zOrderedSet[torch.dtype]_TMA_SUPPORTED_DTYPES@      zmust be power of 2c                *    U [         -   S-
  [         * -  $ )z/Round up to the nearest multiple of ALIGN_BYTESr7   )ALIGN_BYTES)nbytess    rW   _alignrz      s    [ 1$44    c                   [        U [        R                  [        R                  45      (       a#  [	        [        [        U R                  5      5      $ [        U [        5      =(       d"    [        R                  " U [        5      [        :H  $ )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrx   )vs    rW   r   r      sT    !eii+,,3{AFF+,,aK599Q#<#KKr{   c                  4    \ rS rSrSrSrSr\SS j5       rSr	g)	r      z<Symbolically round up to the nearest multiple of ALIGN_BYTESr7   Tc                    [        U[        [        R                  45      (       a  [	        [        U5      5      $ [        U5      (       a  U$ g N)r}   intr~   Integerrz   r   )clsvalues     rW   eval
align.eval   s<    ec5==122#e*%%uL r{    N)r   
sympy.Exprreturnzsympy.Expr | None)
__name__
__module____qualname____firstlineno____doc__nargs
is_integerclassmethodr   __static_attributes__r   r{   rW   r   r      s!    FEJ r{   r   T)frozenc                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
rg)GraphPartitionMap   zH
Mapping from the partition info (e.g., input/output) to the graph info
r   idzlist[int | None]input_index_mappingoutput_index_mapping	list[str]constant_namesr   Nr   r   r   r   r   __annotations__r   r   r{   rW   r   r      s$    
 	G *)** r{   r   c           
        U " 5         [         R                  R                  5         [         R                  " [	        S5      [         R
                  SS9n[         R                  R                  SS9n[         R                  R                  SS9nUR                  5         [        S5       H  nUR                  5         U " 5         M     UR                  5         [         R                  R                  5         UR                  U5      S-  n[        S[	        X-  5      5      n[        S[	        X'-  5      5      n	[        U5       H
  nU " 5         M     [        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[         R                  R                  [         R                  R                  R                  /S9 n
[         R                  R                  5         [        U	5       Hp  nUR                  5         XK   R                  5         [         R                  R                   R                  S	5         U " 5         S
S
S
5        X[   R                  5         Mr     [         R                  R                  5         [         R"                  " [%        XE5       VVs/ s H  u  pUR                  U5      PM     snn5      nS
S
S
5        [         R&                  " W5      R)                  5       n[*        R-                  S5        [*        R-                  W
R/                  5       R1                  SSS95        [3        U
R5                  5        Vs/ s HI  nUR6                  [8        R                  :X  d  M#  [:        R<                  " SUR>                  5      c  MG  UPMK     sn5      nU(       a#  U[@        R&                  " S U 5       5      S-  -  n[*        R-                  SU5        U$ s  snf s  snf ! , (       d  f       GN= fs  snnf ! , (       d  f       GNK= fs  snf ):  
Returns benchmark results by examining torch profiler events.
This could be more accurate as it doesn't count CPU side overhead.
However, this also requires manually excluding irrelevant event, e.g.
vectorized_elementwise_kernel which is used to fill L2 cache,
various CUDA events, etc, so could also be fragile.
    ArH   dtypedeviceTenable_timing   r7   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  8   #    U  H  oR                   v   M     g 7fr   device_time_total.0events     rW   	<genexpr>fp8_bench.<locals>.<genexpr>(  s     Q33        @@profiling results: %s ms)!rP   rH   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestabler\   eventsdevice_typer[   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rW   	fp8_benchr      s>    D	JJKKJu}}VLE **"""6K

  t 4I1X
  	JJ**959K 1c&./0H1c#+,-H 8_
  BGxQA5::##$#7KQ?DXO!!!!5IO			NN++00
 
  
 


 xAKKMN!!#&&7 8L! ! 	

 +.{+FG+F41Q^^A+FG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
#!!Z__4  HH0%**=	 #	
	O OOQQQ	

 II(#.JO RO 87
 H
 
*	
sP   'P'P!;A8P>3P&;AP>P82P>"Q) QQ&
P50P>>
QFc                4    SSK Jn  U" [        5      " XX#5      $ )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   is_vetted_benchmarkingr   s        rW   do_bench_using_profilingr   0  s     " U*+DE
C r{   c           
        U(       d  SSK Jn  U" 5         [        5       nUR                  5       n[	        U5      nU " 5         UR                  5         [        R                  " [        S5      [        R                  US9nUR                  SS9n	UR                  SS9n
U	R                  5         [        S5       H  nUR                  5         U " 5         M     U
R                  5         UR                  5         U	R                  U
5      S-  n[        S[        X-  5      5      n[        S[        X,-  5      5      n[        U5       H
  nU " 5         M     UR                  5         [        R                  R!                  [#        [        R                  R$                  U5      /S	9 n[        U5       H  nUR                  5         U " 5         M     UR                  5         S
S
S
5        [&        R)                  S5        [&        R)                  WR+                  5       R-                  SSS95        [/        UR1                  5        Vs/ s H7  nUR2                  [#        [4        U5      :X  d  M#  UR6                  S:w  d  M5  UPM9     sn5      n[/        U Vs/ s H  nSUR6                  ;  d  M  UPM     sn5      n[9        U5      S:X  a  [;        SU S[9        U5       SU 35      eUR=                  5         UR+                  5       n[&        R)                  S5        [&        R)                  UR-                  SS95        [?        S U 5       5      S-  U-  n[&        R)                  SU5        U$ ! , (       d  f       GN= fs  snf s  snf )r   r   )may_ban_benchmarkingr   r   Tr   r   r7   r   Nr   r   r   r   zContext SyncFillFunctorzDFailed to capture any events after filtering cache clearing events. z	 events: z, repeats: zprofiling time breakdown)r   c              3  8   #    U  H  oR                   v   M     g 7fr   r   r   s     rW   r   ,_do_bench_using_profiling.<locals>.<genexpr>  s     A=%%%=r   r   r   ) r   r   rX   upperrY   r   rP   r   r   r   r   r   r   r   r   r   r   rO   r   r   r   r   r   r\   r   r   r[   r   rR   RuntimeError_build_treesum)r   r   r   r   r   r   device_type_upperdevice_interfacer   r   r   r   r   r   r   r   r   r   actual_eventsr   s                       rW   r   r   H  s    "M.K#))+/<D  "KKJuyyME #((t(<K &&T&:I1X
    "**959K 1c&./0H1c#+,-H 8_
    "			ENN335FG
 
  
 
xAKKMD	 ! 	$$&
 IIlIIann$$-EQS$TU 	
#  GJ8I$JJ  

n, #	
O +OO5}EJJ/NOOM =QRm9S%9$:+hZQ
 	
 !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J[
 
$	
 	Ps*    :M%?"M7%M77M7M<%M<%
M4c                     SSK Jn   [        R                  R	                  SS5        U S L=(       a%    [        [        [        R                  SS 5      S5      $ ! [         a     g[         a  nS[        U5      ;   d   e S nAgS nAff = f)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rP   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrO   opsImportErrorr  str)r  r   s     rW   has_torchvision_roi_alignr    s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 
B$	B-BBc                t   U c   [         R                  " S5      R                  $ [        U [        5      (       a  [         R                  " U 5      n U R
                  S;  aY  U R                  cL  [        U R
                  5      n[         R                  " U R
                  UR                  R                  5       S9$ U $ )Ng        )cpumeta)index)
rP   r   r   r}   r  typer  rY   Workercurrent_devicer   r  s     rW   decode_devicer    s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMr{   c                ~    [         R                  " [        R                  U [        R
                  R                  5      $ r   )	functoolsreduceoperatormulr~   SOne)its    rW   sympy_productr$    s#    HLL"eggkk::r{   c           	         [        U 5      [        U5      :X  d   e[        R                  " [        S [	        X5       5       5      5      $ )Nc              3  .   #    U  H  u  pX-  v   M     g 7fr   r   )r   abs      rW   r   sympy_dot.<locals>.<genexpr>  s     >odaAEos   )rR   r~   expandr  r   )seq1seq2s     rW   	sympy_dotr-    s6    t9D	!!!<<>c$o>>??r{   c                b    U  Vs0 s H  n[        U5      U_M     snR                  5       $ s  snf r   )r   values)r#  rT   s     rW   uniquer0    s+     !bBqE1Hb!((**!s   ,c           
        [        U [        R                  5      (       d  [        U[        R                  5      (       a4  [        [        R                  " U 5      [        R                  " U5      5      $ [        U [
        5      (       a  [        U[
        5      (       d$   U  S[        U 5       SU S[        U5       35       e[        X5      $ )Nz: , )r}   r~   Exprr_   sympifyr   r  runtime_ceildiv)numberdenoms     rW   rj   rj     s    &%**%%E5::)F)Fu}}V,emmE.BCC fc""z%'='= ("T&\N"UG2d5k];= 6))r{   c                x   U c  g[        U 5      R                  S5      S   n0 SS_SS_SS	_S
S_SS_SS_SS	_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_S#S$SS%S&S'S(.EnUR                  [        UR	                  5       5       Vs0 s H  o3U_M     sn5        [        U [         5      (       a  U $ S)X!    3$ s  snf )*Nz*i8rp   r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e4m3fnuzfp8e4b8float8_e5m2fnuzfp8e5b16float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16i32i64u16u32u64)int32int64uint8uint16uint32uint64*)r  splitupdatelistr/  r}   )key	dtype_strtysr   s       rW   _type_ofri    s]   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	9 	: 	$ 	D 	6 	F  	6!" 	6#$ 	%& 	'( 3C8 JJd3::<0101012S#&&3@a/?,@@ 2s   B7c                Z    U  Vs/ s H  n[         R                  " U5      PM     sn$ s  snf )z
Gets the shape and stride of a tensor. For non-symbolic tensors, this is
trivial. But for symbolic tensors, we need to map from SymIntNode into
sympy.Expr.
)r~   r4  lstr   s     rW   convert_shape_to_inductorrm    s%     '**cEMM!c***s    (c                p    [        U [        R                  5      (       a  U R                  R                  $ U $ )z
Convert SymInt to sympy.Expr, leave int as is.

Unlike sympy.sympify() which converts int to sympy.Integer,
this function preserves int as int and only converts SymInt to Expr.
)r}   rP   r2   nodeexprvals    rW   convert_symint_to_exprrs    s(     #u||$$xx}}Jr{   c                    SSK Jn  [        U [        5      (       a  U $ [        U [        R
                  5      (       a  [        U 5      $ UR                  R                  R                  R                  U SS9$ )zD
Like convert_shape_to_symint, but operates on a single expression.
r7   VN)hint)
virtualizedrv  r}   r   r~   r   graphsizevars	shape_envcreate_symintnode)r   rv  s     rW   convert_to_symintr}    sk      a 	

 !U]]++ F	 !!++==ad=Kr{   c                D    U  Vs/ s H  n[        U5      PM     sn$ s  snf )zn
Takes a list of shapes from Inductor and converts them into symints (or just
ints if all shapes are static).
)r}  rk  s     rW   convert_shape_to_symintr  *  s"     +..#Qa #...s   c                N    [        S U R                  R                   5       5      $ )z%
Does this op overload have aliasing
c              3  <   #    U  H  oR                   S Lv   M     g 7fr   )
alias_infor   r'  s     rW   r   is_view.<locals>.<genexpr>8  s     F1EA||4'1Es   )any_schema	argumentsops    rW   is_viewr  4  s     F1E1EFFFr{   c                    gNFr   )r   s    rW   <lambda>r  =  s    r{   c                  ^ U R                   S:w  a  g[        U R                  [        R                  R
                  5      (       d  U R                  [        R                  L d  g[        [        R                  R
                  U R                  5      nU[        R                  L d  [        U5      (       a  [        U4S jU R                   5       5      $ [        R                  R                  UR                  ;   =(       d    T" U5      $ )z
Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

Uses in views ops will follow the views uses
call_functionFc              3  <   >#    U  H  n[        UT5      v   M     g 7fr   )is_pointwise_use)r   uis_pointwise_fns     rW   r   #is_pointwise_use.<locals>.<genexpr>N  s     KA#A77s   )r  r}   targetrP   _ops
OpOverloadr  getitemr   r  r   usersTag	pointwisetags)user  r  s    ` rW   r  r  ;  s     vv 3::uzz4455xGWGW9W%**''4F!!!WV__KKKK99&++-H1HHr{   	list[Any]c           	       ^^ [         R                  R                  5       m/ mSUU4S jjnTR                  " U /[	        [         R
                  X1U45      Q76 n[        U R                  R                  5      S:X  a3  [        U R                  R                  S   R                  5      S:X  a  U4nTR                  U5        [         R                  R                  0 T5      nUT4$ )Nc                `   > TR                  U 5        TR                  S[        T5       35      $ )Narg)appendplaceholderrR   )r  g
graph_argss    rW   add_tensor_arg)gen_gm_and_inputs.<locals>.add_tensor_argY  s,    #}}s3z?"3455r{   r7   r   Tensor)r  torch.Tensorr   r5   )rP   fxGraphr  r#   r  rR   r  returnsr  r  outputr4   )r  r   kwargsr  ro  gmr  r  s         @@rW   gen_gm_and_inputsr  S  s     	A%'J6 6 ??u||^F^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>r{   c                t    U S:X  a  g [        U 5      nUR                  5       (       a  UR                  5         g g Nr  )rY   rQ   r   r  s     rW   r   r   k  s7    /7$$&&$$& 'r{   c                    [        U5        [        R                  " S5        [        R                  " 5       n[        U5       H  nU " U6 n[        U5        M     [        R                  " 5       nWc   eXt-
  $ )Ni9  )r   rP   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rW   timedr  s  sk     	d				B5\'F  
			B7Nr{   c                    [         R                  " [        U5       Vs/ s H  n[        XX%5      PM     sn5      n[         R                  " U5      U-  n[        X-  S 5        UR                  5       $ s  snf )Nz.6f)rP   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rW   print_performancer    se     ll>CFmLmue	4mLG << 5(D	T_S!#99;	 	Ms   A3c                F   ^ [        X5      " 5       m[        XU4S j5        g)zKReplace obj.method() with a new method that returns a precomputed constant.c                    > T $ r   r   )r  s   rW   r  #precompute_method.<locals>.<lambda>  s    r{   N)rO   setattr)objmethodr  s     @rW   precompute_methodr    s    S!#FC(r{   c                ,    U H  n[        X5        M     g)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rW   precompute_methodsr    s    #& r{   c                8    [        X:  5      [        X:  5      -
  $ r   )r   r'  r(  s     rW   cmpr    s    qu:AE
""r{   c                    [        U [        5      (       a  U /U-  $ [        U 5      S:X  a  [        U 5      " U S   /5      U-  $ U $ )Nr7   r   )r}   r   rR   r  )rT   sizes     rW   pad_listliker    sD    !SsTz
1v{Aw!v%%Hr{   c                @    [        U 5      S:X  a  / $ SS jn[        XS9$ )Nr   c                    [        U [        5      (       a  U $ SSKJn  [        X5      (       d   eU R	                  5       $ )Nr7   )rF   )r}   r  	schedulerrF   get_name)elemrF   s     rW   	sort_functuple_sorted.<locals>.sort_func  s4    dC  K0$2222}}r{   rf  )r  rl   r   r  )rR   sorted)rT   r  s     rW   tuple_sortedr    s$    
1v{	 !##r{   PRV)	covariantc                  2    \ rS rSr\SS j5       rSS jrSrg)CachedMethodi  c                    g r   r   )r   s    rW   clear_cacheCachedMethod.clear_cache  s    ),r{   c                    g r   r   selfr   r  s      rW   __call__CachedMethod.__call__  s    r{   r   N)r   r   r   None)r   P.argsr  P.kwargsr   r  )r   r   r   r   staticmethodr  r  r   r   r{   rW   r  r    s    , ,Dr{   r  c           	        ^ U R                   nSU S3mSU 0n[        SU ST ST S3R                  5       U5        [        R                  " U 5      " X! S3   5      nS
U4S	 jjnXCl        U$ )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                B   > [        U T5      (       a  [        U T5        g g r   r  delattrr  rf  s    rW   r  "cache_on_self.<locals>.clear_cache  s    4D# r{   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  rf  s        @rW   cache_on_selfr     s    ;;DtfF
C *CF  E "' (+e ,			 FH oob!#n&=">?G &Nr{   c                    [        U 5      $ )zU
Variant of cache_on_self for properties. The only difference is the type signature.
)r   )r   s    rW   cache_property_on_selfr    s     r{   c                    ^      SU 4S jjnU$ )Nc           	        >^ ST SU R                    S3mSU 0n[        ST ST ST S3R                  5       U5        [        R                  " U 5      " US	   5      nSU4S
 jjnX2l        U$ )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                B   > [        U T5      (       a  [        U T5        g g r   r  r  s    rW   r  <cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!!c" "r{   r  r  )r   r  r  r  rf  
class_names       @rW   r  'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, #CL1	# (r{   )r   FN_TYPE[P, RV]r   r
  r   )r  r  s   ` rW   cache_on_self_and_argsr    s     
$$	$L Nr{   c           
        SSK Jn  [        U [        5      (       ay  [        R
                  " [        R                  U  Vs/ s H?  n[        US5      (       d  M  UR                  (       d  M)  UR                  R                  PMA     sn[        5       5      $ [        XR                  5      (       a  U R                  $ [        5       $ s  snf )Nr7   irro  ) r  r}   re  r  r  r  or_r  ro  originsr!   r?   )node_scheduler  ro  s      rW   aggregate_originsr     s     -&&LL * *D4( "-1YY "		!!) L	
 		
 
M??	3	3$$$|s   C
C
+C
c                `   [        U 5      nUS:X  ag  S nU Vs/ s HA  nUR                  S:X  d  M  SUR                  ;   d  M'  UR                  S   c  M9  U" U5      PMC     nn[        [	        U5      5      nGOUS:X  a  / nU H  nUR                  S:X  d  M  S nSnSUR                  ;   a  UR                  S   S   nO$SUR                  ;   a  UR                  S   S   nS	nU(       d  Mi  [        US
   [        5      (       a  UR                  US
   U-   5        M  UR                  US
   R                  U-   5        M     [        [	        U5      5      nO:US:X  a.  U Vs/ s H   oDR                  S:X  d  M  UR                  PM"     nnO[        eSR                  S/U-   5      $ s  snf s  snf )Noriginal_atenc                .   U R                   S   nSn[        U[        R                  R                  5      (       a  UR
                  R                  nU$ [        U[        R                  R                  5      (       a  [        UR                  5       5      nU$ )Nr  r  )
r  r}   rP   r  r  _overloadpacketr   HigherOrderOperatorr  r   )originr  rf  s      rW   get_origin_meta_str2get_fused_kernel_name.<locals>.get_origin_meta_str=  su    "KK8MC-)>)>??#33<< J M5::+I+IJJ-,,./Jr{   r  rP   r  source_fn_stackr   fwd_source_fn_stackbackwardr7   inductor_noder   fused)r  r  r  r  r!   r}   r  r  r   r   NotImplementedErrorjoin)r  descriptive_namesall_originsr  r  sources	source_fnsuffixs           rW   get_fused_kernel_namer(  6  s    $M2KO+	 &
%yyO+ (  6;;. ( O,	 ('% 	 
 G,-	g	%!FyyO+ 	$3 &,= >r BI*fkk9 &,A B2 FI'F ilC00NN9Q<&#89NN9Q<#8#86#AB "" G,-	o	-&1
&1FYY/5QKFKKk 	 
 "!88WI'((G
<
s"   F&F&F&F&!F+8F+c                b  ^^ ^!^" [        U 5      nU Vs/ s H  o3R                  S:X  d  M  UPM     nn[        R                  " [        5      n[        R                  " [        5      nSm!U(       a  [        S U 5       5      n[        U5      S:X  ac  US   R                  m![        T!S5      (       d0  [        T!R                  5       VV	s0 s H  u  pX_M	     n
nn	U
T!l        UR                  U!4S jS9  U GHo  nS	UR                  ;   a  UR                  S	   b  UR                  S	   nSn[        U[        R                   R"                  5      (       a  [%        UR&                  5      nOB[        U[        R                   R(                  5      (       a  [%        UR+                  5       5      nU(       a  Xm   R-                  UR*                  5        S
UR                  ;   a<  UR                  S
   S   R*                  nX]   R-                  UR*                  5        GM&  UR                  R/                  S5      S:X  d  GMH  X[R*                     R-                  UR*                  5        GMr     T!b  SOSnTR0                   SU SSR3                  UR5                  5       5       SSR3                  UR5                  5       5       S3nTR0                   S3/n[7        UR9                  5       5       HA  u  nnUR-                  TR0                   SU SSR3                  [7        U5      5       35        MC     T!Gb  SSKJm   UR-                  TR0                   S35        [        5       n/ n[        U T R>                  5      (       Gd  SSK J!n        S*U 4S jjnS+S jm"S,U"4S jjnU  GH  n	[        U	S5      (       a  U	RD                  c  M$  [        U	RD                  S5      (       a  U	RD                  RF                  b  U	RD                  RF                   H  nUR*                  U;   a  M  URI                  UR*                  5        UR                  RK                  UR*                  5      nUc  MZ  U" UUR*                  5      u  nnUR-                  TR0                   SU S U" U5       S!U S35        M     [        U	RD                  S"5      (       d  GM+  U	RD                  RL                  c  GME  U	RD                  RL                   HW  nUR                  RK                  UR*                  5      nUc  M-  U" UUR*                  5      u  nnUR-                  S#U-   5        MY     GM     U H  nURO                  S$S%9nUbJ  [        RP                  RR                  (       a+  URU                  U4S& jURW                  5        5       5        M_  UR-                  TR0                   SU 35        M     UR-                  TR0                   S'S(R3                  U5       35        US)R3                  U5      4$ s  snf s  sn	nf )-a  
Retrieves metadata information for a kernel.
Args:
    node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
        Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
    wrapper (PythonWrapperCodegen):
        An instance of PythonWrapperCodegen, used to define the code comment format.
Returns:
    tuple[str, str]:
        A tuple containing two strings:
            - The first string represents the kernel's metadata.
            - The second string represent the kernel's detailed metadata.
r  Nc              3  8   #    U  H  oR                   v   M     g 7fr   )ry  )r   ns     rW   r   &get_kernel_metadata.<locals>.<genexpr>  s     "CNq77Nr   r7   r   )_inductor_kernel_metadata_node_to_idx_mapc                "   > TR                   U    $ r   )r-  )r+  single_graphs    rW   r  %get_kernel_metadata.<locals>.<lambda>  s    lTTUVWr{   r  r  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r2  z], Original ATen: []z" Source node to ATen node mapping:   z => r  z Graph fragment:ru  c                R  > [        U TR                  5      (       aF  [        U R                  TR                  5      (       a!  U R                  R                  R                  nOU R                  nUc  UnOUR
                  n U R                  5       nX44$ ! [         a    S n X44$ f = fr   )r}   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr!  )bufferrw_namer<  r   layoutr  s        rW   get_buffer_info,get_kernel_metadata.<locals>.get_buffer_info  s     fbll33
KK9 9 #)++"2"2">">K"("4"4K&"D&++D"#..0F |# + "!F|#"s   B B&%B&c           	     j    SSR                  U  Vs/ s H  n[        U5      PM     sn5       S3$ s  snf )N[r2  r6  )r"  r  )shaperT   s     rW   stringify_shape,get_kernel_metadata.<locals>.stringify_shape  s1    499e%<ec!fe%<=>a@@%<s   0
c                   > U c  gT" U R                   5       nT" U R                  5       nU R                   nS[        U R                      U U U S3$ )Nr  ")r  strider   r    r   )r@  shape_annotationstride_annotationdevice_annotationrF  s       rW   stringfy_layout,get_kernel_metadata.<locals>.stringfy_layout  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?r{   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadatac              3  F   >#    U  H  nTR                    S U 3v   M     g7f)r7  N)comment)r   liner  s     rW   r   r,    s(      ) ; 's4&1 ;s   !z
   return ,
)r>  z-ir.TensorBox | ir.Buffer | ir.TorchBindObjectr?  r  r   ztuple[str, ir.Layout | None])rE  zIterable[int]r   r  )r@  zir.Layout | Noner   r  ),r  r  collectionsdefaultdictre  r!   rR   ry  r  	enumeratenodesr-  sortr  r}   rP   r  r  r  r  r  r   r  getrV  r"  keysr  itemsr  r  r?   rx  rv  rP  rQ  addtry_get_bufferrR  format_nodeversionrq   extend
splitlines)#r  r  r$  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr+  node_to_idx_mapro  r  rf  sort_strmetadatadetailed_metadataoriginal_noder]  	all_reads
all_writesrv  rA  rN  rr>  
input_namer@  woutput_namer   formatted_noder  r/  rF  s#    `                              @@@rW   get_kernel_metadatary  m  sG   $ $M2K+6W;)):Vf;NW ,,T2N$006
 L""CN"CC}")!,22L<)TUU8A,BTBT8U"V8Ufc168U"VIXFW    dii'DIIo,F,R IIo6MC-)>)>??-778M5::+I+IJJ-,,./"'..tyy9$))#))K(+00C&&tyy1YY]],->99%,,TYY7   *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= >u  s=/diiu6N5OP	
 !?   GOO#44D!EF%/\	 "
-99&$E$PS$-$(A
 #q-00AMM4I1=='22q}}7J7J7V]]0066Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ 1 AMM844,,8]]11!"!7!7!?!>$)8)HQ"))#*;< 2- #< #D!--d-KN)emm.?.? "(( ) . 9 9 ;) 
 "((GOO+<C?O)PQ # 	  GOO#4Jsxx
?S>T!UVTYY0111Y X #Ws   X&X&	X+c                   [        U 5      n [        U 5      nU (       ak  U R                  5       nUR                   HB  nU(       a  U" U5      (       a  M  XB;  d  M   UR	                  U5        U R                  U5        MD     U (       a  Mk  U$ )zJReturns the set of nodes whose values depend on those within initial_queue)re  r!   rS   r  rb  r  )initial_queueskip_filterdominated_setro  users        rW   dominated_nodesr    sx    
 'M}-M
  "JJD{400(!!$'$$T*  - r{   c                Z  ^^	 SSK Jm  SUU	4S jjm	[        U5      u  p#U Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[        U 5      u  pcU Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[	        [
        R                  " / UQUQ76 5      $ s  snf s  snf )Nr7   r  c                l  > [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      =(       a=    [        U TR
                  TR                  TR                  TR                  45      (       + $ r   )	r}   r9  r:  r;  r@   ComputedBufferInputsKernelInputBufferTemplateBuffer)r+  r  is_unrealized_nodes    rW   r  *gather_origins.<locals>.is_unrealized_node(  s    a&&%aff--a''%aff--!RYY' 

!!!!	1
 -
 	
r{   )r+  r@   r   r9  )r  r  r"   r  r!   	itertoolschain)
r   r  kwargs_flattenr   rr  kwargs_originsargs_flattenargs_originsr  r  s
           @@rW   gather_originsr  #  s     
 
" %V,N-;W^c?QRU?Vkckk^NW"4(OL+7S<C;Mc;RKCKK<LSiooE|EnEFF XSs   B#B#B(0B(c                X   ^^^^ SS jmSUU4S jjmSUU4S jjmSU4S jjmT" U 5      $ )z
Normal sympy str is very slow, this is a lot faster.  The result are
somewhat worse, as it doesn't do as much simplification.  So don't
use this for final codegen.
c                    [        U [        R                  5      =(       a1    [        U R                  5      S:H  =(       a    U R                  S   S:H  $ )N   r   r   )r}   r~   MulrR   r   )rp  s    rW   is_neg_leadsympy_str.<locals>.is_neg_leadG  s:    tUYY'VC		Na,?VDIIaLTVDV	
r{   c                v  > [        U [        R                  5      (       a  [        U R                  5      S:X  aT  T" U R                  S   5      (       a:  T" U R                  S   5       ST" U R                  S   R                  S   5       3$ SR                  [        TU R                  5      5      $ T" U 5      $ )Nr  r7   r   z - z + )r}   r~   r   rR   r   r"  r   )rp  r  sympy_str_muls    rW   sympy_str_add sympy_str.<locals>.sympy_str_addL  s    dEII&& 499~"{499Q<'@'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&r{   c                   > [        U [        R                  5      (       aJ  T" U 5      (       a  ST" U R                  S   5       3$ SR	                  [        TU R                  5      5      $ T" U 5      $ )N-r7   z * )r}   r~   r  r   r"  r   )rp  r  sympy_str_atoms    rW   r   sympy_str.<locals>.sympy_str_mulW  sa    dEII&&4   >$))A,7899zz#ndii"@AA!$''r{   c                  > [        U [        R                  5      (       a  U R                  $ [        U [        R                  [        R
                  45      (       a  ST" U 5       S3$ [        U [        [        [        [        45      (       aC  U R                  R                   SSR                  [        [        U R                  5      5       S3$ [!        U 5      $ )N()r2  )r}   r~   Symbolr   r   r  rc   r`   ra   rb   funcr   r"  r   	sympy_strr   r  )rp  r  s    rW   r  !sympy_str.<locals>.sympy_str_atomb  s    dELL))99uyy%))455}T*+1--(HMNNii(()499SDII5N+O*PPQRRt9r{   )rp  r   r   r9  rp  r   r   r  r   )rp  r  r  r  r  s    @@@@rW   r  r  @  s.    

	' 	'	( 	( r{   c                    SSK Jn  [        R                  (       a9  [	        UR
                  SS 5      =n(       a  UR                  S:w  a  [        U 5      $ [        R                  " 5       $ )Nr7   ru  current_node
index_expr)
rx  rv  ri   compute_all_boundsrO   interpreterr  rf   rg   unknown)r  rv  fx_nodes      rW   get_bounds_index_exprr  o  sN     	!!~tDDWDNNl*5!!""$$r{   c                    U S   S:H  $ )Nr   rt  r   )prefixs    rW   prefix_is_reductionr  }  s    !9r{   c                D    U [         R                  :w  d   e[        XSSS9$ )1
Used to generate an integer-nonnegative symbol.
Tintegernonnegative)re   SIZErd   )r  rl  s     rW   sympy_index_symbol_with_prefixr    s'     TYY vDdCCr{   c                b    U =(       d    [         R                  =(       a    [         R                  $ r   )ri   debug_index_assertsassert_indirect_indexing)checks    rW   generate_assertr    s    /V//TV5T5TTr{   c                D    U S   S:w  d   e[         R                  " U SSS9$ )r  r   r   Tr  )r~   r  r   s    rW   sympy_index_symbolr    s)     7c>> <<d==r{   c                    [        X5      $ )z
When the passed replacement symbol v is a string, it is converted to a symbol with name v that
have the same replaced expression integer and nonnegative properties.
r&   )rp  replacementss     rW   
sympy_subsr    s    
 t**r{   c                    [        U [        R                  5      =(       d-    [        U [        R                  5      =(       a    U R                  $ r   )r}   rP   r2   r  _has_symbolic_sizes_strides)r'  s    rW   is_symbolicr    s3    a& 1ell#E(E(Er{   c                 &    [        S U  5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   )r  r  s     rW   r   "any_is_symbolic.<locals>.<genexpr>  s     ,t!{1~~tr   r  )r   s    rW   any_is_symbolicr    s    ,t,,,r{   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarc                    SSK Jn  U R                  R                   HH  n[	        U5      (       a  Us  $ UR
                  R                  S5      =nc  M7  U" U5      (       d  MF  Us  $    g )Nr   )r)   rr  )%torch.fx.experimental.symbolic_shapesr)   ry  r]  is_cudagraph_unsafe_fx_noder  r_  )r  r)   ro  rr  s       rW   %get_first_incompatible_cudagraph_noder    sW     L&t,,K99==''C49Ns9S9SK  r{   c                    [        [        [        U R                  R                  5      5      5      nUR
                  S:X  d   eU$ )z$Get the output node from an FX graphr  )nextiterreversedry  r]  r  )r  	last_nodes     rW   output_noder    s6    T(288>>234I<<8###r{   c                    U R                   R                  SS9n[        S U 5       5      n[        U 5      R                  S   n[        U[        5      (       a  UOU4n[        S U 5       5      nX%-  $ )Nr  r  c              3     #    U  HX  n[        UR                  R                  S 5      [        R                  5      (       d  M=  UR                  S    R
                  v   MZ     g7frr  N)r}   r  r_  rP   r  r   )r   ro  s     rW   r   "get_all_devices.<locals>.<genexpr>  sC      9%DdiimmE*ELL9 	 		%%s   <A" A"r   c              3    #    U  H  n[        U[        R                  R                  5      (       d  M.  [        UR                  R                  S 5      [        R                  5      (       d  Mh  UR                  S    R                  v   M     g7fr  )r}   rP   r  r5   r  r_  r  r   )r   r  s     rW   r   r    s[      7Cc588==) 	 sxx||E*ELL9 	s   -B6B- B)ry  
find_nodesr!   r  r   r}   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rW   get_all_devicesr    s~    ++}+=.8 9%9 /M "o""1%G$We44w7*H,6 77 -K &&r{   c                    [        [        R                  R                  5       5       GH5  n U R	                  S5      (       d  M  [        R                  U    nUR
                   H  nUR	                  S5      (       d  M  [        X5      n[        U[        R                  R                  R                  R                  5      (       d  Me  UR                   Hp  n[        U[        R                  R                  R                  R                  5      (       d  MB  UR                  R                   R"                  R%                  5         Mr     M     [        R                  U 	 GM8     S[        R                  ;   aR  [        R                  S   n['        UR(                  R*                  R,                  5      ?UR(                  R*                  ?[0        R2                  " 5         g )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)re  sysmodulesr`  
startswith__dict__rO   r}   rP   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rW   unload_xpu_triton_pydsr    sJ   CKK,,./%%&NOOKK$I##I.. .EOO33EEVV  #)"8"8%"!OO33EEYY 
 #MM--1199; #9 $ KK$# 0( #++-kk12""(()2JJ#JJLr{   _registered_cachesc                    [        U S5      (       a  [        U R                  5      (       d  [        U  S35      e[        R                  U 5        U $ )z\
Use this decorator to register any caches that should be cache_clear'd
with fresh_cache().
cache_clearz# does not have a cache_clear method)r  callabler  AttributeErrorr  r  r  s    rW   clear_on_fresh_cacher    sE    
 3&&hs.G.Gu$GHIIc"Jr{   c                 >    [          H  n U R                  5         M     g)z
Clear all registered caches.
N)r  r  r  s    rW   clear_cachesr    s     " "r{   c              #  `  #    [         R                  R                  U 5      n U[         R                  U '   Sv   Uc!  [         R                  R                  U S5        gU[         R                  U '   g! Uc!  [         R                  R                  U S5        f U[         R                  U '   f = f7f)a  Thread-safe env var set/restore using atomic C-level lookups.

We avoid mock.patch.dict(os.environ, ...) because it internally calls
os.environ.copy(), which iterates all env var keys then fetches values in
separate steps. That approach is not atomic and can race with background threads
(e.g. Triton async compilation) modifying the environment, causing KeyError,
so we use os.environ.get() for individual keys which is an atomic C-level lookup.
N)osenvironr_  rS   )rf  r   olds      rW   _set_envr
  $  sy      **..
C"

3;JJNN3%!BJJsO ;JJNN3%!BJJsOs    B.A2 8B.29B++B.c              #  `  ^#    [        5         SSKJn  U" [        R                  " US95      m [        ST5         [        R                  ST5        U" [        R                  R                  TS5      5      n[        SU5         Sv   [        U [        5      (       a  [        U 5      S:X  d   S	5       e[        R                  R                  U5      (       a{  [        R                  " U5      nU R!                  U Vs0 s HH  nS
U;  d  M  U[        R                  R#                  [        R                  R                  XF5      5      _MJ     sn5        SSS5        SSS5        U(       a^  [%        5       (       a-  [&        R(                  R+                  5       (       a
  [-        5         [.        R0                  " T[%        5       U4S jS9  [        5         gs  snf ! , (       d  f       N= f! , (       d  f       N= f! [2         a    [        R5                  ST5        e f = f! [        5         f = f7f)z
Contextmanager that provides a clean tmp cachedir for pt2 caches.

Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
generated with this cache instance.
r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                .   > [         R                  STUS9$ )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  pathr  inductor_cache_dirs      rW   r  fresh_cache.<locals>.<lambda>g  s    S[[@&% 6A 6r{   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr
  r   r   r  r  r"  r}   dictrR   existslistdirrd  getsize
is_windowsrP   rJ   rQ   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rW   fresh_cacher)  9  s     ND1(2B2Bs2KL'/1CDII35GH7/:  ,.>?mT22}-2W4WW2ww~~&677 "

+; <%,, */).A#*!#3 !V277??277<<@P3T#U U). @ E$ ||		 6 6 8 8&(MM" )l  	5 @? EDD  >@RS 	ss   +H.G: A	G)A9G=
GAGGG)A-G: H.G
G&	"G))
G73G: :"HH H++H.)reversec                   U R                   n[        [        U 5      5      n[        [	        X2SS95      nU(       d  [        [        U5      5      $ U$ )NTrf  r*  )__getitem__r   rR   re  r  r  )seqr*  gettera_rsort_idxs        rW   argsortr2  z  sC    __F
C/C F3D9:HHX&''Or{   c          	     F  ^  SU 4S jjn[        U5       VVs/ s H>  u  pEU[        U[        R                  5      (       a  UR                  R
                  OU4PM@     nnn[        U[        R                  " U5      US9nU VVs/ s H  u  pGUPM	     nnnU$ s  snnf s  snnf )Nc                ~   > U u  p#Uu  pESU4S jjnU" X5:  5      (       a  gU" X5:  5      (       a  gX$:  a  gX$:  a  gg)Nc                R   > [        U [        5      (       a  U $ TR                  U SS9$ )NT)size_oblivious)r}   r9  evaluate_expr)rp  r{  s    rW   evaluate*argsort_sym.<locals>.cmp.<locals>.evaluate  s+    $%%**4*EEr{   r   r7   r   )rp  z bool | torch.SymInt | sympy.Exprr   r9  r   )r'  r(  a_idxa_valb_idxb_valr8  r{  s          rW   r  argsort_sym.<locals>.cmp  sN    	F
 EM""EM""
 ==r{   r,  )r'  tuple[int, sympy.Expr]r(  r?  r   r   )	r\  r}   rP   r2   ro  rp  r  r  
cmp_to_key)	r{  r.  r*  r  rl  r   exprsr   r  s	   `        rW   argsort_symrB    s    4  n$FC 
Z5<<88affkka@$ 
  5i2237IE %&fccF&M
 's   ABBc                r    U [         R                  :X  a  g[         R                  " SU S9R                  5       $ )Nrv   r   r   )rP   ra  r   element_sizerD  s    rW   get_dtype_sizerF    s-     ;;r'4466r{   c                       \ rS rSr% S\S'   Srg)LineContexti  r   contextr   Nr   r   r   r   r   r   r   r{   rW   rH  rH    s    Lr{   rH  c                  *    \ rS rSr% S\S'   S\S'   Srg)ValueWithLineMapi  r  r   zlist[tuple[int, LineContext]]line_mapr   NrJ  r   r{   rW   rL  rL    s    J++r{   rL  c                     \ rS rSrSrSSS jjr\R                  SS j5       rSS jr	SS jr
SS jrS rSS	 jrSS
 jrSS jrSS jrS S jrS!S jrS"S#S jjrS"S$S jjrS"S$S jjrS%S&S jjrS'S jrSS jrS(S jrS)S jrSrg)*IndentedBufferi     c                    / U l         Xl        g r   )_lines_indent)r  initial_indents     rW   __init__IndentedBuffer.__init__  s    BD%r{   c              #  \   #    U R                   n Xl         S v   X l         g ! X l         f = f7fr   )tabwidth)r  rX  prevs      rW   set_tabwidthIndentedBuffer.set_tabwidth  s%     }}	!$M MDMs   ,
! ,),c                   [        5       nSn/ nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O5[        U[        5      (       a  UR                  X$R                  45        MX  Un[        U[        5      (       d   eUR                  U5        UR                  S5        USUR                  S5      -   -  nM     [        UR                  5       U5      $ )Nr7   rY  )r   rR  r}   DeferredLineBaserH  r  rI  r  writecountrL  getvalue)r  bufr   linemaplirW  s         rW   getvaluewithlinemap"IndentedBuffer.getvaluewithlinemap  s    j13++B".//t<  B,,::/dC((((IIdOIIdOTZZ%%%A   88r{   c                6    U R                  5       R                  $ r   )rd  r   r  s    rW   r`  IndentedBuffer.getvalue  s    '')///r{   c                   [        5       nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O[        U[        5      (       a  M<  Un[        U[
        5      (       d   eUR                  S5      (       a  UR                  US S 5        M  UR                  U5        UR                  S5        M     UR                  5       $ )N\r   rY  )	r   rR  r}   r]  rH  r  endswithr^  r`  )r  ra  rc  rW  s       rW   getrawvalueIndentedBuffer.getrawvalue  s    j++B".//t<  B,,dC((((}}T""		$s)$		$		$   ||~r{   c                    U R                   $ r   rR  rg  s    rW   get_lines_refIndentedBuffer.get_lines_ref  s    {{r{   c                8    U R                   R                  5         g r   )rR  clearrg  s    rW   rs  IndentedBuffer.clear  s    r{   c                ,    [        U R                  5      $ r   )r9  rR  rg  s    rW   __bool__IndentedBuffer.__bool__  s    DKK  r{   c                :    SU R                   U R                  -  -  $ )Nr5  )rS  rX  rg  s    rW   r  IndentedBuffer.prefix  s    dllT]]233r{   c                &    U R                  S5        g )NrY  	writelinerg  s    rW   newlineIndentedBuffer.newline  s    tr{   c                   [        U[        5      (       a  U R                  R                  U5        g [        U[        5      (       a9  U R                  R                  UR                  U R                  5       5      5        g UR                  5       (       a.  U R                  R                  U R                  5        U 35        g U R                  R                  S5        g Nr  )r}   rH  rR  r  r]  with_prefixr  stripr  rW  s     rW   r|  IndentedBuffer.writeline  s    dK((KKt$.//KKt//>?ZZ\\KK$++-78KKr"r{   c                8    U H  nU R                  U5        M     g r   r{  )r  linesrW  s      rW   
writelinesIndentedBuffer.writelines  s    DNN4  r{   c                L   ^ ^ [         R                  SUU 4S jj5       nU" 5       $ )Nc               3     >#    T=R                   T -  sl          S v   T=R                   T -  sl         g ! T=R                   T -  sl         f = f7fr   rS  )offsetr  s   rW   r  "IndentedBuffer.indent.<locals>.ctx  s8     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  r  r  s   `` rW   indentIndentedBuffer.indent  s$    		"	"	' 
#	' ur{   c                .    U =R                   U-  sl         g r   r  r  r  s     rW   	do_indentIndentedBuffer.do_indent'      r{   c                .    U =R                   U-  sl         g r   r  r  s     rW   do_unindentIndentedBuffer.do_unindent*  r  r{   c           	        [        U[        5      (       a  [        S5      nUR                   HR  n[        U[        5      (       a  M  U(       d  M#  [        U[        U5      [        UR                  5       5      -
  5      nMT     [        R                  " U5      (       a  SnUR                   HV  n[        U[        5      (       a  U R                  R                  U5        M5  [        R                  X[        U5      S  5        MX     g [        R                  " U5      nU(       a  UR                  5       nU(       d  g UR                  5       nUR!                  S5       H  nU R                  U5        M     g )Ninfr   rY  )r}   rO  floatrR  rH  minrR   r  mathisinfr  r|  r   textwrapdedentrstriprc  )r  
other_coder  r  rW  r   s         rW   spliceIndentedBuffer.splice-  s   j.115\F"))!$44 TS5G)GHF * zz&!!"))dK00KK&&t,",,TF3FG	 * "4J'..0
#**,J%%d+q! ,r{   c                    [        U R                  S9nU R                   Vs/ s H
  o1" U5      PM     snUl        U$ s  snf N)rT  )rO  rS  rR  )r  r  r   rW  s       rW   r   IndentedBuffer.mapE  s8    DLL9-1[[9[Td4j[9

 :s   =c                @    [        U 5       SU R                  5        S3$ )Nr  r  )r  r`  rg  s    rW   __repr__IndentedBuffer.__repr__J  s     t*Qt}}/q11r{   c                    U R                   UR                   :X  d   e[        U R                   S9nUR                  U R                  5        UR                  UR                  5        U$ r  )rS  rO  r  rR  )r  otherr   s      rW   __add__IndentedBuffer.__add__M  sK    ||u}},,,DLL9t{{#u||$
r{   c                    XR                   ;   $ r   ro  )r  new_lines     rW   containsIndentedBuffer.containsU  s    ;;&&r{   )rS  rR  rX  Nr   )rT  r   r   r  )rX  r   r   r  )r   rL  r   r  r   r  r   r9  )rW  z$LineContext | DeferredLineBase | strr   r  )r  z.Sequence[LineContext | DeferredLineBase | str]r   r  r   )r  r   r   'contextlib.AbstractContextManager[None])r  r   r   r  F)r  zIndentedBuffer | strr  r9  r   r  )r  zCallable[[Any], Any]r   rO  )r  r   r   rO  )r  z$DeferredLineBase | LineContext | strr   r9  )r   r   r   r   rX  rU  r  r  rZ  rd  r`  rl  rp  rs  rv  r  r}  r|  r  r  r  r  r  r   r  r  r  r   r   r{   rW   rO  rO    s    H& ! !9(0(!4#!	"0
2'r{   rO  c                  6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )FakeIndentedBufferiY  c                "   > [         TU ]  5         g r   )superrU  )r  	__class__s    rW   rU  FakeIndentedBuffer.__init__Z  s    r{   c                V    US:X  a  [         R                  X5      $ [        SU S35      e)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r  )r  r   s     rW   r  #FakeIndentedBuffer.__getattribute__]  s9    ;**466!$ (= =
 	
r{   r   r  )r   r  r   r   )r   r   r   r   rU  r  r   __classcell__r  s   @rW   r  r  Y  s    
 
r{   r  c               #     #    [         R                  [         R                  p S v   Xs[         l        [         l        g ! Xs[         l        [         l        f = f7fr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rW   restore_stdout_stderrr  h  s9     %(ZZN@!/
CJ
CJs    A> AAAc                  h    \ rS rSrSrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrSS
 jrSrg)r]  iq  z.A line that can be 'unwritten' at a later timec                >    UR                  5       (       d  SnXl        g r  )r  rW  r  s     rW   rU  DeferredLineBase.__init__t  s    zz||D	r{   c                    [         e)zJReturns either self.line or None to indicate the line has been 'unwritten'r!  rg  s    rW   r  DeferredLineBase.__call__y      !!r{   c                    [         e)z3Returns a new deferred line with the same conditionr  r  s     rW   	_new_lineDeferredLineBase._new_line}  r  r{   c                @    U R                  U U R                   35      $ r   r  rW  )r  r  s     rW   r  DeferredLineBase.with_prefix  s    ~~455r{   c                T    U R                  U R                  R                  5       5      $ r   )r  rW  r  rg  s    rW   r  DeferredLineBase.lstrip  s    ~~dii..011r{   c                >    U R                  U R                  U   5      $ r   r  )r  r  s     rW   r-  DeferredLineBase.__getitem__  s    ~~dii.//r{   c                ,    [        U R                  5      $ r   )r9  rW  rg  s    rW   rv  DeferredLineBase.__bool__  s    DIIr{   c                ,    [        U R                  5      $ r   )rR   rW  rg  s    rW   __len__DeferredLineBase.__len__  s    499~r{   )rW  N)rW  r  r   
str | None)rW  r  r   r   )r  r  r   r   )r   r   )r  zint | slicer   r   r  r   r   )r   r   r   r   r   rU  r  r  r  r  r-  rv  r  r   r   r{   rW   r]  r]  q  s-    8
""620r{   r]  c                  D   ^  \ rS rSrSrSU 4S jjrSS jrS	S jrSrU =r	$ )
DelayReplaceLinei  z6At end of codegen call `line.replace(key, value_fn())`c                <   > [         TU ]  U5        Xl        X l        g r   )r  rU  rf  value_fn)r  rf  r  rW  r  s       rW   rU  DelayReplaceLine.__init__  s     r{   c                j    U R                   R                  U R                  U R                  5       5      $ r   )rW  replacerf  r  rg  s    rW   r  DelayReplaceLine.__call__  s#    yy  4==?;;r{   c                D    [        U R                  U R                  U5      $ r   )r  rf  r  r  s     rW   r  DelayReplaceLine._new_line  s    $-->>r{   )rf  r  )rf  r  r  zCallable[[], str]rW  r  r  )rW  r  r   r  )
r   r   r   r   r   rU  r  r  r   r  r  s   @rW   r  r    s    @!
<? ?r{   r  c                   [        U [        R                  5      (       a  U nO[        R                  " [        5       U 5      n[        R
                  " U5      n[        R                  R                  (       aF  UR                  c   eUR                  S:  d  UR                  S:X  a  [        R                  S5        ggUR                  S:X  a  SOSnUR                  nXC:  a  [        R                  S	X4S
.S9  gg)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrJ   rr   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r}   rP   r   rX   r   createre  rq   majorr   r  r  multi_processor_count)index_or_devicer   propr  r  s        rW   
is_big_gpur    s    /5<<00 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I:%> 	 	
 r{   c                     [         R                  R                  5       (       a(  [         R                  R                  5       R                  $ [         R
                  R                  S5      R                  $ )NrH   )rP   rJ   rQ   get_device_propertiesgpu_subslice_countrH   r  r   r{   rW   get_max_num_smsr    sI    yyyy..0CCC::++F3IIIr{   c                     [         R                  R                  5       (       d  g[         R                  R                  [         R                  R	                  5       5      n U R
                  S:H  $ )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rP   rH   rQ   r  r  r  )device_propertiess    rW   
using_b200r    sM     ::""$$

889R9R9TU""b((r{   c                     [         R                  R                  5       (       a
  [        5       $ [         R                  R                  5       n [        5       U b  U -
  $ S-
  $ )zFHandle experimental carveout if set otherwise return hardware SM countr   )rP   rJ   rQ   r  r  _get_sm_carveout_experimental)carveouts    rW   get_num_smsr    sM     yy  xx557HH,@HHaHHr{   c                    SSK JnJn  Uc
  [        5       nUR	                  S5      nX -  [
        -  nU" UUUUR                  " 5       S9$ )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r7   )r8   WorkspaceZeroModeF)r_  	zero_moder   
outer_name)codegen.commonr8   r
  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr8   r
  r  r  s          rW   get_tma_workspace_argr    sU     @"}!++E2I-0CCD++-	 r{   c                    [         R                  R                  (       d  gS[         R                  R	                  S5      R
                  ;   a  U S::  a  gg)Nr   gfx942rr   r7   r  )rP   re  rq   rH   r  gcnArchName)block_ks    rW   get_default_kpackr    s<    ==5::33A6BBBwRT}r{   c                   U R                   U;  a!  [        R                  SU R                   U5        [        U R                  R
                  5      =(       a+    U R                   U;   =(       a    [        U R                  5      $ )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r@  allowed_layout_dtypess     rW   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%r{   c                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf NrX  )r   ri   max_autotune_gemm_backendsrc  r  backendrT   s     rW   _use_autotune_backendr"    P    ==?!<<BBDJJ3OOa	O      Ac                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf r  )r   ri   max_autotune_conv_backendsrc  r  r   s     rW   _use_conv_autotune_backendr'  	  r#  r$  )enable_int32enable_float8check_max_autotunec                  SSK JnJn  [        R                  [        R
                  [        R                  /nU(       a>  [        R                  [        R
                  [        R                  [        R                  /nU(       a/  UR                  [        R                  [        R                  /5        [        U R                  R                  5      =(       a    [        X5      =(       d/    U R                  R                  S:H  =(       a    U R                  U;   =(       ak    [         R"                  =(       d    [         R$                  =(       d    U(       + =(       a/    ['        S5      =(       a    U" U R                  UR(                  5      $ )Nr7   )BackendFeaturehas_backend_featurer  TRITON)r  r,  r-  rP   r   rM  rO  r\  rf  rC  rD  r  r   r  r  r   ri   max_autotunemax_autotune_gemmr"  TRITON_TEMPLATES)r@  r(  r)  r*  r,  r-  layout_dtypess          rW   use_triton_templater3    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOr{   output_layout
add_guardsc                  ^^^^^^	 SSK Jn  SSKJm  SU4S jjmSUU4S jjnSUUU	4S jjm        SUUU4S jjm        SU4S	 jjm	U" 5       =(       a$    [	        U4S
 jU 5       5      =(       a    U" U 5      $ )u.  
Return True iff *all* supplied tensors satisfy the CUDA TMA constraints
that Triton relies on today.
* https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

A tensor is accepted when:
  * 1 ≤ rank ≤ 5 (cuTensorMapEncodeTiled)
  * dtype in _TMA_SUPPORTED_DTYPES (CUtensorMapDataType enum)
  * Base pointer 16-byte aligned
  * Exactly one contiguous ("inner") dim with stride 1
  * All "outer" dims have 16-byte aligned strides
  * Inner dim size × itemsize is a multiple of 16
  * For 1-byte dtypes (e.g. FP8), inner dim ≥ 32
r   )has_triton_tma_devicer7   ru  c                X   > TR                   R                  R                  U [        5      $ r   )ry  rz  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesrv  s    rW   _alignedcan_use_tma.<locals>._alignedA  s     ww<<ZWWr{   c                   > U c  gU R                   nU R                  nU R                  nT" U R                  5      (       d  gT" XU5      $ )NTF)r  rJ  r   r  )r@  sizesstridesr   r=  _is_tma_compatibles       rW   _is_tma_compatible_layout.can_use_tma.<locals>._is_tma_compatible_layoutD  sG    >-- &&!%%88r{   c                   > U R                  5       nU R                  5       nU R                  5       nU R                  5       TR                  R
                  ;   a  gU R                  5       =nb  UR                  S:X  a	  T" XU5      $ T" XU5      $ )NFrJ   )get_size
get_stride	get_dtyper  ry  unaligned_buffers
get_devicer  )r  r@  rA  r   m_devicerv  rB  _is_tma_compatible_xpus        rW   _is_tma_compatible_matrix.can_use_tma.<locals>._is_tma_compatible_matrixQ  sw    

,,. ::<177444&H38N)%%@@!%%88r{   c                  > [        U 5      nUR                  nUS:  d  US:  a  gU[        ;  a  gT(       aK  TR                  R                  R                  U 5      nTR                  R                  R                  U5      nOjU  Vs/ s H(  nTR                  R                  R                  U5      PM*     nnU Vs/ s H(  nTR                  R                  R                  U5      PM*     nn[        U5       V	Vs/ s H4  u  pTR                  R                  R                  US5      (       d  M2  U	PM6     n
n	n[        U
5      S:w  a  gU
S   n[        U5       H  u  pX:X  a  M  T" X-  5      (       a  M    g   X[   nT" X-  5      (       d  gUS:X  a,  TR                  R                  R                  US5      (       d  ggs  snf s  snf s  snn	f )Nr7   r   Fr       T)
rR   itemsizert   ry  rz  guard_int_seq!replace_backed_symbols_with_hintsr\  statically_known_equalsstatically_known_geq)r@  rA  r   rankrQ  sizes_i	strides_ir   str   r  	inner_idx	inner_dimrv  r=  r6  s                rW   rB  'can_use_tma.<locals>._is_tma_compatible_  s   
 5z>>!8tax--gg&&44U;G((66w?I PUOT!  BB1Eu   RYQX2  BB2FQX   #9-
-ww77A> - 	 

 u:?!H	 y)EA~BM**	 * &		,-- q=!1!1!F!FyRT!U!UC

s   /G</G;1G0Gc                j  > US   nTR                   R                  R                  U5      nTR                   R                  R                  US5      (       d  gSnU  HT  nTR                   R                  R                  U5      nTR                   R                  R	                  Xu5      (       d  MT    g   g)Nr   r7   Fl    T)ry  rz  rS  rT  statically_known_gt)	r@  rA  r   last_stridelast_stride_hint
MAX_UINT32r  	size_hintrv  s	           rW   rL  +can_use_tma.<locals>._is_tma_compatible_xpu  s     bk77++MM
 ww778H!LL 
D((JJ4PIww33IJJ 
 r{   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r  rM  s     rW   r   can_use_tma.<locals>.<genexpr>  s     ?h)!,,h   )r<  int | sympy.Exprr   r9  )r@  Layout | Noner   r9  )r  r@   r   r9  )r@  Sequence[sympy.Expr]rA  zSequence[_IntLike]r   torch.dtyper   r9  )torch.utils._tritonr8  rx  rv  r   )
r5  r6  matricesr8  rC  rv  r=  rB  rM  rL  s
    `   @@@@@rW   can_use_tmarm  ,  s    " :X9 99 93#3#3 3 
	3 3j##  
	. 	 	5?h??	5%m4r{   c                &   [         R                  " [         R                  5      R                  n/ nU  HV  n[	        U[
        [        R                  45      (       a	  XB:  a    gM1  UR                  [        R                  " XB5      5        MX     U(       d  gSSK
Jn  [        U5      S:X  a  US   O[        R                  " U6 nU(       a%  UR                  R                  R!                  U5      $ UR                  R                  R#                  U5      $ )NFTr7   ru  r   )rP   iinfor\  r   r}   r   r~   r   r  Lerx  rv  rR   Andry  rz  guard_or_falsestatically_known_true)r@  r6  	int32_max
conditionsr  rv  	conditions          rW   _descriptor_shape_fits_in_int32rw    s     EKK(,,IJdS%--011   ehht78  !$ZA!5
1599j;QI  	
''	2 WW33I>r{   r6  c                  ^ [         R                  R                  (       d  g[        S U 5       5      (       d  g[        U4S jU 5       5      (       d  g[         R                  R                  (       a  [        U R                  TS9(       d  g[        R                  R                  b  g[         R                  R                  (       a  U OS n[        X#TS.6$ )NFc              3  Z   #    U  H!  n[        UR                  5       5      S :H  v   M#     g7f)r  N)rR   rF  )r   r  s     rW   r   *use_triton_tma_template.<locals>.<genexpr>  s      8x!s1::< A%xs   )+c              3  T   >#    U  H  n[        UR                  5       TS 9v   M     g7f)rx  N)rw  rF  )r   r  r6  s     rW   r   r{    s%      A 	(

Ls   %(rx  Tr4  )ri   r  enable_persistent_tma_matmulr   enable_template_tma_storerw  r  rP   re  rq   rm  )r5  r6  rl  r@  s    `  rW   use_triton_tma_templater    s     ==558x888    }}..7Vz8  }}$$mmEE]4F:NNr{   c                f    [        X US.6(       d  gSSKJn  SSKJn  U" 5       =(       a    U" 5       $ )Nr4  Fr   )%has_triton_tensor_descriptor_host_tmar7   is_datacenter_blackwell_arch)r  rk  r  codegen.cuda.cuda_envr  )r5  r6  rl  r  r  s        rW   !use_triton_blackwell_tma_templater    s5     #	:  IC 12U7S7UUr{   c                     X;   =(       a    X;   $ r   r   )scale_option_ascale_option_bscaling_typess      rW   use_triton_scaling_templater    s    
 *N~/NNr{   )maxsizec                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)zCheck if CuTeDSL is importable; cache the result for reuse.

Call ensure_cute_available.cache_clear() after installing CuTeDSL
in the same interpreter to retry the import.
cutlassNF	importlibutil	find_specr  r   r{   rW   ensure_cute_availabler    s3    ~~''	2$>>     # 
00c                      [         R                  R                  S5      SLn U (       a
  [	        5         U $ ! [         a     gf = f)zCheck if NVIDIA Universal GEMM (cutlass_api) is importable; cache the result for reuse.

Call ensure_nv_universal_gemm_available.cache_clear() after installing cutlass_api
in the same interpreter to retry the import.
cutlass_apiNF)r  r  r  r  _ensure_fp4_dtype_registered)	availables    rW   "ensure_nv_universal_gemm_availabler    sE    NN,,];4G	 $&	  s   !6 
AAc                    ^^ SSK n  U R                  R                  [        R                  5        g! [
        [        4 a4    SSKmU R                  R                  mUU4S jnXR                  l         gf = f)a  Patch cutlass_api to handle torch.float4_e2m1fn_x2 -> cutlass.Float4E2M1FN.

NOTE: cutlass_api doesn't natively map this dtype. We patch the lookup function
in-place so all callers (including TensorWrapper) pick up the change.
Remove once cutlass_api adds native FP4 support.
r   Nc                T   > U [         R                  :X  a  TR                  $ T" U 5      $ r   )rP   rK  Float4E2M1FN)r   _origr  s    rW   _patched._ensure_fp4_dtype_registered.<locals>._patched'  s'    ...+++<r{   )cutlass_api.utilsr  cutlass_type_from_torch_typerP   rK  KeyErrorr  r  )r  r  r  r  s     @@rW   r  r    sf     B66u7M7MNn% 
B!!>>	 
 :B6
Bs   )2 AA65A6c                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)a3  Check if nvMatmulHeuristics is importable; cache the result for reuse.

nvMatmulHeuristics provides performance model-based kernel selection
for NVIDIA GEMM operations.

Call ensure_nvmatmul_heuristics_available.cache_clear() after installing
nvMatmulHeuristics in the same interpreter to retry the import.
nvMatmulHeuristicsNFr  r   r{   rW   $ensure_nvmatmul_heuristics_availabler  /  s4    ~~''(<=TII r  c                   [        5       (       d  g[        S5      (       d  gSSKJn  [	        UR
                  R                  5      (       d  gU" 5       (       d  g[        R                  /n	[        X)5      (       d  g[        R                  (       d  [        R                  (       d  g[        XUS9(       d  g[        S X4 5       5      (       a  gU(       a  U(       a  gUc  gUc  Ub  gg)a  
Returns True if we can use the blackwell kernel for grouped mm.
Required conditions:
    1. CuTeDSL backend is enabled
    2. CuTeDSL is available
    3. We are on a blackwell arch
    4. The dtype is bf16
    5. Max autotune or max autotune gemm is enabled
    6. A, B, and the output are 16B aligned
    7. We are not using dynamic shapes
    8. A is 2d
    9. B is 3d
    10. Offsets are provided
    11. Bias and Scale are not provided
FCUTEDSLr7   r  )r5  c              3  8   #    U  H  n[        U5      v   M     g 7fr   )
is_dynamicr   rT   s     rW   r   3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>q  s     
1.Q:a==.r   T)r  r"  r  r  r  r   r  rP   rM  r  ri   r/  r0  rm  r  )
mat_amat_br@  a_is_2db_is_2doffsbiasscale_resultr  r2  s
             rW    use_blackwell_cutedsl_grouped_mmr  ?  s    2 !"" ++C&--$$%%'))^^$M 776#;#; u6:

15.
111g|<3r{   c                   SSK Jn  UR                  R                  (       a2  UR                  R                  (       d  [
        R                  " S5        gUR                  R                  R                  X-  U-  SS9nUS::  d  U[        R                  R                  :  a  gSSKJn  [        R                  R                   (       a  g[        R"                  [        R$                  [        R&                  /n[)        X5      =(       a9    [        R*                  =(       d    [        R,                  =(       a    [/        S	5      nU(       a;  U" 5       (       d/  [0        R3                  S
[        R                  R4                  5        gU$ )Nr7   ru  zYCUTLASS backend is not supported with non-AOT cpp_wrapper mode. Skipping CUTLASS backend.Fr   fallbackr   )try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cutlass.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rx  rv  ry  cpp_wrapperaot_modewarningswarnrz  optimization_hintri   r  cutlass_backend_min_gemm_sizecodegen.cutlass.utilsr  rP   re  rq   r   rM  r\  r  r/  r0  r"  r   r  cutlass_dir)	r@  r  r+  krv  	gemm_sizer  r2  r   s	            rW   use_cutlass_templater    s    	ww177#3#3(	
   221519r2JIA~V^^%Q%QQ9 }} ]]ENNEKK@Mf4 	-  <F$<$<	-!),  !##KK4 **	 Jr{   _IntLikec                :  ^
^ SSK Jm  [        5       (       d  g[        5       (       d  g[	        S5      (       d  gSSKJm
  T
R                  (       a  gU R                  R                  S:w  d  [        R                  R                  (       a  g[        R                  (       d  [        R                  (       d  gXU/nUb  UR!                  U5        [#        U4S jU 5       5      (       a  gXE/n	Ub  U	R!                  U5        [#        U
4S	 jU	 5       5      (       a  gg
)a  
Return True if we can use the NVIDIA Universal GEMM Template.

Required conditions:
    1. NVGEMM backend is enabled
    2. cutlass_api is available
    3. We are on a NVIDIA GPU
    4. Max autotune or max autotune gemm is enabled
    5. Not in AOT Inductor mode (requires runtime JIT compilation)
    6. Base pointers are 16-byte aligned
    7. Shape dimensions are not unbacked symbols

Note:
    - Shape and stride constraints are handled internally by
      cutlass_api.get_kernels() which filters incompatible kernels.
    - GroupedGemm currently only supports TN layout (column-major B).
      Any other layout will act as a noop and fall back to ATen.
    - Dynamic shapes are supported as long as they have hints
      (from example inputs).
r   )has_free_unbacked_symbolsFNVGEMMr7   ru  rH   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   dimr  s     rW   r   1use_nv_universal_gemm_template.<locals>.<genexpr>  s     
C]c$S))]rf  c              3  p   >#    U  H+  oR                  5       TR                  R                  ;   v   M-     g 7fr   )r  ry  rI  )r   trv  s     rW   r   r    s&     
O>N::<177444>Ns   36T)r  r  r  r  r"  rx  rv  aot_compilationr   r  rP   re  rq   ri   r/  r0  r  r  )r@  r  r+  r  r  r  r  r  dims_to_checktensors_to_checkrv  r  s             @@rW   use_nv_universal_gemm_templater    s    < P ""-// **}}V#u}}'8'86#;#;
 1IM}Q

C]
CCC ~%

O>N
OOOr{   c                    [         R                  R                  R                  5       nUS:X  a  gU R                  5       UR	                  S5       Vs/ s H  o"R                  5       PM     sn;   $ s  snf )z8Check if CUTLASS should be used for the given operation.ALLTrX  )ri   r  cutlass_enabled_opsr   rc  r  )op_nameenabled_opsrT   s      rW   _use_cutlass_for_opr    sY    ..44::<Ke==?+2C2CC2HI2HQwwy2HIIIIs   A0r   c           
        SSK Jn  [        R                  R                  U-  nUR
                  R                  R                  [        R                  " [        R                  " X%U -  5      [        R                  " X%U-  5      5      5      =(       aa    UR
                  R                  (       + =(       a?    UR
                  R                  (       + =(       a    [        R                  R                  S:  $ )Nr   ru  )torch._inductor.virtualizedrv  ri   r  decompose_k_thresholdry  rz  rs  r~   rq  Ger  r  num_decompose_k_splits)r  r+  r  threshold_multiplerv  r  s         rW   use_decompose_k_choicer  	  s     ."MM??BTT 	
..IIA56A56	
 	5    	5 ###	5 MM0014
r{   c           
        [         R                  R                  nSSKJn  [        [        R                  R                  5      =(       a    UR                  R                  R                  [        R                  " [        R                  " X#U -  5      [        R                  " X#U-  5      5      5      =(       a=    UR                  R                  (       + =(       a    UR                  R                   (       + $ )z
Check if we should use the contiguous subgraph transform.
This transform makes the second matrix contiguous before the matmul.
r   ru  )ri   rocmcontiguous_thresholdr  rv  r9  rP   re  rq   ry  rz  rs  r~   rq  r  r  r  )r  r+  r  r  rv  s        rW   use_contiguousr  	  s     ";;;; . 	U]] 	$GG22II145145
	$    	$ ###
r{   c                6   [         R                  R                  n/ SQn[        U[        R
                  5      (       a  UR                  (       d  U$ US:X  a  / $ [        U [        R
                  5      (       a  U R                  (       a0  [        U[        R
                  5      (       a  UR                  (       d  SnO[        X -  X!-  5      nSn[        R                  " U5      nU Vs/ s H  nX::  d  M
  X:  d  M  UPM     nn/ / / pn	U H`  nX,-  nUS:  a  M  XS-
  -  S:X  a  US:  a  U	R                  U5        M3  US-  S:X  a  U
R                  U5        MO  UR                  U5        Mb     [         R                  S:X  a  X-   U-   $ X-   U-   nUS U $ s  snf )	N)rr   rP  ru   rs      r   r  r  rs   r7   rP  
EXHAUSTIVE)ri   r  r  r}   r~   r3  	is_numberr  divisorsr  max_autotune_gemm_search_space)r  r+  r  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitr  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rW   get_k_splitsr  0	  s    ]]99N .!UZZ  	1		1ejj!!!++1ejj!!!++!&!&)K~~a H  G! 	&-&< 	   =?B> 3; AI!#$$Q'RZ1_%%a( !!!$ " ,,< 5FF#8>IK''=s   (	F5F<Fc                T    [         R                  R                  U 5      R                  $ r   )rP   rH   r  r  r   s    rW   _rocm_native_device_arch_namer  i	  s    ::++F3???r{   c                      SS K n SSKJnJn  SSKJn  [        R                  R                  U R                  5      nXAX#4$ ! [         a    SS jnSS jn " S S5      nS n N&f = f)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     / $ r   r   r   r{   rW   r  *try_import_ck_lib.<locals>.gen_ops_library	      Ir{   c                     / $ r   r   r   r{   rW   r  .try_import_ck_lib.<locals>.gen_ops_preselected	  r  r{   c                      \ rS rSrSrg)*try_import_ck_lib.<locals>.CKGemmOperationi	  r   N)r   r   r   r   r   r   r{   rW   r  r   	  s    r{   r  )r   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r  r  dirname__file__r  )r  r  r  r  package_dirnames        rW   try_import_ck_libr  n	  sh    	
	
 ''//+*>*>? -@QQ  			 	 s   ;A  A$#A$c                J   [         R                  (       d  [         R                  (       d  g[        R                  R
                  (       d  gU R                  R                  S:w  a  g[        U R                  5      n[         R                  R                   Vs0 s H  o"R                  S5      S   U_M     sn=(       d    UR                  S5      S   U0nUR                  5       [         R                  R                  -   Vs/ s H  nX2   PM	     nnU(       d  gU R                  [        R                  [        R                   [        R"                  4;  a  g[%        5       u  n    nU(       d  [&        R)                  S5        gU[         R                  l        gs  snf s  snf )NFrH   :r   z,Please pip install Composable Kernel packageT)ri   r/  r0  rP   re  rq   r   r  r  r  archrc  r`  ck_supported_archr   r   rM  rO  r  r   r  ck_dir)r@  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rW   use_ck_templater  	  s@   6#;#;==}}V# 0>K39;;3C3CD3Cawws|A)3CD #q!;IO
 !%%'&++*G*GG!GA 	G  ! %||EMM5>>5==II"3"51aBC+FKK+ E!s   FF c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr7   ru  CKr   r  r   rx  rv  r"  r  ry  rz  r  r@  r  r+  r  rv  s        rW   use_ck_gemm_templater  	  sP     	d# 	KF#	KGG..quqy2.FJr{   c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr7   ru  CKTILEr   r  r   r  r  s        rW   use_ck_tile_gemm_templater  	  sP     	h' 	KF#	KGG..quqy2.FJr{   c                <    [        S5      =(       a    [        U 5      $ )Nr  )r'  r  r@  s    rW   use_ck_conv_templater  	  s    %d+G0GGr{   c                    [         R                  =(       d    [         R                  =(       a    U R                  R                  S:H  $ r  )ri   r/  r0  r   r  r  s    rW   _use_template_for_cpur  	  s2    7v77&
--


%&r{   c                   SSK Jn  [        UR                  U5      (       d   eUR                  R                  nUR                  R
                  n[        U 5      =(       al    UR                  5       [        R                  :H  =(       aD    [        U5      S:H  =(       a/    [        U5      S:H  =(       a    US   US   :H  =(       a    US   S:H  n[        XUSS9=(       a#    UR                  R                  5       =(       d    U$ )Nr7   )rA      r  F)require_constant_mat2)r  rA   r}   r@  r  rJ  r  rH  rP   rO  rR   use_cpp_gemm_templateis_contiguous)r@  mat1mat2rA   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rW   use_cpp_bmm_templater)  	  s     dkk6****
   I++$$Kf% 	"NN-	"^q 	" "	" ^y|+		"
 ^q  " !t5Q !!#D'Dr{   c                   SSK Jn  SSKJn  SSKJn	  SSKJn
  [        U 5      (       a  [        S5      (       d  g[        R                  R                  (       d  gUR                  5       [        R                  [        R                   4;   n[        R"                  [        R$                  [        R&                  [        R                  [        R                   /nU
" UUU(       a  U R(                  OS UUS9u  ppp[+        X45      (       a  g[-        X'R.                  5      (       a  UR1                  5       nU	" UR                  5       5      u  nnU" S	UUUUR                  5       UR                  5       U[3        5       U(       + US
9
nSS jnU R(                  U;   =(       aT    US L=(       aI    U" U5      =(       a:    [-        X'R4                  5      =(       a    UR7                  5       =(       d    U(       + $ )Nr7   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    U R                  5         U R                  5       S   S:H  $ )Nr   r7   )freeze_layoutrG  rT   s    rW   is_last_dim_stride12use_cpp_gemm_template.<locals>.is_last_dim_stride1
  s"    	||~b!Q&&r{   )rT   r@   r   r9  )r  r  codegen.cpp_micro_gemmr+  codegen.cpp_utilsr,  kernel.mm_commonr-  r  r"  ri   cppweight_prepackrH  rP   r^  rS  rO  rM  halfr   has_free_symbolsr}   BaseViewunwrap_viewparallel_num_threadsr;  is_module_buffer)r@  r$  r%  r0  r!  is_woq_int4r8  r  r+  r,  r-  	int8_gemmr2  r  r+  r  r5  r   r2  r<  s                       rW   r"  r"  	  s    9M) ((0Ee0L0L::$$ U[[%**$==I]]ENNEJJUZZXM")"+&,,'#A!T $$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C t]]+	C ""$A,A(Ar{   c                 ~    [         R                  =(       d    [         R                  (       + =(       d    [        S5      $ )NATEN)ri   r/  r0  r"  r   r{   rW   use_aten_gemm_kernelsrM  (
  s-    7v77 '	v	&'r{   c                  b    \ rS rSr% \R
                  " S5      rS\S'   S
S jrS
S jr	SS jr
Srg	)DebugDirManageri.
  r   r  prev_debug_namec                @    [        [        R                  5      U l        g r   )r  rO  counterr   rg  s    rW   rU  DebugDirManager.__init__2
  s    ../r{   c                    [         R                  R                  R                  U l        U R                   SU R
                   3U l        U R                  [         R                  R                  l        g )N_tmp_)rP   _dynamori   debug_dir_rootrP  r   new_namerg  s    rW   	__enter__DebugDirManager.__enter__5
  sM    $}}33BB//0dggY?.2mm+r{   c                    [         R                  " U R                  5        U R                  [        R
                  R                  l        g r   )r!  r"  rX  rP  rP   rV  ri   rW  )r  r   s     rW   __exit__DebugDirManager.__exit__:
  s*    dmm$.2.B.B+r{   )r   rX  rP  Nr  )r   r   r   r  )r   r   r   r   r  r_  rR  r   rU  rY  r\  r   r   r{   rW   rO  rO  .
  s&    ooa G0<
Cr{   rO  c                  ^ SSK Jn  [        5       mSU4S jjn[        R                  R                  USU5         [        R                  R                  5         U " U0 UD6nS S S 5        W[        T5      4$ ! , (       d  f       N= f)Nr7   r<   c                (   > TR                  U 5        g r   )rb  codesource_codess    rW   save_output_code*run_and_get_code.<locals>.save_output_codeH
  s    r{   rc  ra  r  r   r  )
ry  r=   r!   r   patchr  rP   rV  resetre  )r   r   r  r=   rc  r  rb  s         @rW   run_and_get_coderh  ?
  so    
 %$.LL 
		=*<>N	OT$V$ 
P 4%%% 
P	Os   'A77
Bc                   UR                  SS5      n[        U /UQ70 UD6u  pE/ nU H  n[        R                  (       aS  [        R                  R
                  SLa6  UR                  [        R                  " SU[        R                  5      5        O5UR                  [        R                  " SU[        R                  5      5        U(       d  M  U Vs/ s H  oSS PM	     nnM     XF4$ s  snf )Nremove_quoteFTzR"TRITON\((.*?)\)TRITON"z	'''.*?'''r   )
rS   rh  ri   r  r  autotune_at_compile_timerf  r   findallDOTALL)	r   r   r  rj  r  rb  kernelsra  r  s	            rW   run_and_get_kernelsrp  Q
  s     ::ne4L+B@@@FG&--"H"HPT"T NN2::&A4STNN2::lD"))DE<29:'a|'G:G  ? ;s   C,c                *   ^  SU 4S jjn[        U5      $ )Nc                 R   > T" 5       n U R                  5       R                  5         U $ r   )r  r  )r  r   s    rW   run_with_backward1run_fw_bw_and_get_code.<locals>.run_with_backwarde
  s!    

r{   )r   r   )rh  )r   rs  s   ` rW   run_fw_bw_and_get_coderu  d
  s    
 -..r{   c                t  ^^ SSK Jn  / mSU4S jjmS	U4S jjn[        R                  R	                  USU5         [        R                  R	                  UST5         [
        R                  R                  5         U " U0 UD6nSSS5        SSS5        T$ ! , (       d  f       N= f! , (       d  f       T$ = f)
zLGet the inductor-generated code, but skip any actual compilation or running.r7   r<   c                (   > TR                  U 5        g r   r  r`  s    rW   rc  "get_code.<locals>.save_output_codes
  s    D!r{   c                   >  " S S5      nU R                   (       a  U R                  5       OU R                  5       u  p#T" UR                  5        U(       a  T" UR                  5        U" 5       $ )Nc                  ,    \ rS rSrSrSS jrSS jrSrg)	@get_code.<locals>.patched_compile_to_module.<locals>.DummyModuleiw
  z4This is empty to replace the generated triton modulec                    g r   r   rg  s    rW   rU  Iget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__z
  s    r{   c                    g r   r   r  s      rW   callEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call}
  s    r{   r   Nr  r   r   r  r   r   r  )r   r   r   r   r   rU  r  r   r   r{   rW   DummyModuler|  w
  s    Fr{   r  )r  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coderc  s       rW   patched_compile_to_module+get_code.<locals>.patched_compile_to_modulev
  s[    	 	 04/?/?D))+T\\^ 	" 	++,[../}r{   compile_to_modulerc  Nre  )r  r=   r   r   )ry  r=   r   rf  r  rP   rV  rg  )r   r   r  r=   r  r   rc  rb  s         @@rW   get_coder  m
  s    $ L", 	

.0I	
 	

-);=MN	 	O	
  	ON	
 	
 s#   "B('BB(
B%	!B((
B7c                    [        U /UQ70 UD6nS[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ Nr7   r  z%expected one or two code outputs got r   )r  rR   )r   r   r  rb  s       rW   get_triton_coder  
  sQ    B000LL!&Q& 
/L0A/BC& ?r{   c                    [        U /UQ70 UD6u  p4S[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ r  )rh  rR   )r   r   r  r   rb  s        rW   run_and_get_triton_coder  
  sU     'r;D;F;OAL!&Q& 
/L0A/BC& ?r{   c                   ^^^ SSK Jm  SSKJn  UR                  m/ mSUUU4S jjn[
        R                  R                  USU5         U " U0 UD6nS S S 5        UT4$ ! , (       d  f       WT4$ = f)Nr   r<   rD   c                 h   > T" U 0 UD6  U S   n[        UT5      (       d   eTR                  U5        g )Nr  )r}   r  )r   r  ry  r=   graph_lowerings	real_inits      rW   	fake_init-run_and_get_graph_lowering.<locals>.fake_init
  s:    4"6"Q%////u%r{   rU  r  )torch._inductor.graphr=   torch._inductor.output_coderE   rU  r   rf  r  )	r   r   r  rE   r  r  r=   r  r  s	         @@@rW   run_and_get_graph_loweringr  
  sv     4;((IO& & 
		?J		BT$V$ 
C ?"" 
C	B ?""s   		A
A/c              #     #    SSK Jn  UR                  U    n [        R                  " X5      UR                  U '   Sv   X2R                  U '   g! X2R                  U '   f = f7f)zs
Override the lowering of aten_op with override_fn.
The first argument of override_fn is the original lowering fn.
r   )loweringN)torch._inductorr  	loweringsr  partial)aten_opoverride_fnr  orig_fns       rW   override_loweringr  
  sY      )  )G.&/&7&7&M7#&-7#g7#s   A"'A  A"AA"c                   ^ ^^ SSK Jn  UR                  mSUUU 4S jjn[        R                  R
                  R                  USU5      $ )zf
Add hook functions to be called at the beginning and end of Scheduler.__init__.
Used for unit tests.
r   )	Schedulerc                F   > T" X5        T" X5      nT(       a  T" X5        U$ r   r   )r  r]  outr  post_fnpre_fns      rW   r  (add_scheduler_init_hook.<locals>.wrapper
  s%    y i'I%
r{   rU  )r  r   r]  r   r   r   )torch._inductor.schedulerr  rU  unittestr   rf  r  )r  r  r  r  r  s   ``  @rW   add_scheduler_init_hookr  
  s>     4  G  ==%%iWEEr{   c                    [         R                  (       a  [        R                  U 5        g[        R	                  U 5        g)z
Warnings that will be actionable for PyTorch developers, but not
end users.  Allows us to easily disable them in stable releases but
keep them on for nightly builds.
N)ri   developer_warningsr   r  info)msgs    rW   developer_warningr  
  s$       Cr{   c                     [         R                  R                  S5      n U S-   [        [         R                  5      :  aV  [        [         R                  U S-      5      S:  a3  [         R                  U S-      S   S:w  a  [         R                  U S-      $ [         R                   H)  nUR                  S5      (       d  M  U[        S5      S s  $    g! [         a     NJf = f)a  
An experimental API used only when config.benchmark_kernel is true.

The benchmark name is only available at codegen time. So we can not
directly call it in benchmark_all_kernels which is run after codegen.

The function assumes the argument after --only is the benchmark name.
It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
scripts, this function may return None.

There are 2 flavors of --only argument we need handle:
1. --only model_name
2. --only=model_name
z--onlyr7   r   r  z--only=N)r  argvr  rR   
ValueErrorr  )rl  r  s     rW   get_benchmark_namer  
  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx>>)$$s9~'((    s   BC 
C"!C"c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7fr7   Nr   r  s     rW   r   is_ones.<locals>.<genexpr>       %u!Avu   r   ra  s    rW   is_onesr        %u%%%r{   c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7f)r   Nr   r  s     rW   r   is_zeros.<locals>.<genexpr>  r  r  r  r  s    rW   is_zerosr    r  r{   c                &    [        S U  5       5      $ )Nc              3     #    U  HI  n[        U[        R                  5      (       d  M$  UR                  [        R                  " S 5      :H  v   MK     g7f)r  N)r}   rP   r  r   )r   r   s     rW   r    is_cpu_device.<locals>.<genexpr>   s9      DdELL) 	+u||E**s
   #A*Ar  )inputss    rW   is_cpu_devicer    s       r{   c                    [        U [        R                  5      (       d   S5       eU R                  (       a  [        R
                  $ [        R                  $ )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r}   r~   r3  r   rP   r]  rQ  rq  s    rW   get_sympy_Expr_dtyper  '  s@    c5::&& B& ~~{{}}r{   c              /     #    U (       a.  [         R                  R                  " U0 UD6 nUv   S S S 5        g S v   g ! , (       d  f       g = f7fr   )rP   r   r   )should_profiler   r  r   s       rW   maybe_profiler  1  s;     ^^##T4V4G 54 	 54s   (A=A
AAc                 p    [         R                  R                  n U S:  a  [        R                  " 5       n U $ Nr7   )ri   rA  threadsrP   get_num_threads)r  s    rW   rG  rG  :  s+    jj  G{'')Nr{   c                     SSK Jn   U " 5       nUR                  S[        R                  R
                  (       a  S5      $ S5      $ )Nr7   )get_backend_options
num_stagesr  r   )runtime.triton_helpersr  r_  rP   re  rq   )r  optionss     rW   get_backend_num_stagesr  A  s2    ;!#G;;|%--*;*;QCCCCr{   c                N   [        U [        R                  R                  R                  R
                  S:H  S9nUb  U$ SSKJnJn  [        R                  R                  5       =(       a!    [        R                  R                  5       S:  nU [        R                  [        R                  [        R                  4;   d   e[        R                  " U5      R                   R#                  S5      (       a  SSKJn  U" 5       nU [        R                  [        R                  4;   a  U(       a  U" X5      $ [        R                  R                  R                  R
                  S:X  a  U" [        R                  U5      $ U" [        R                  U5      $ U [        R                  [        R                  4;   a  U(       a  U" U 5      $ [        R                  R                  R                  R
                  S:X  a  U" [        R                  5      $ U" [        R                  5      $ )z
We don't want to throw errors in this function. First check to see if the device is in device_info.py,
then fall back to the inaccurate triton estimation.
tf32)is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rv   r   
clock_rate)max_clock_rate)r   rP   backendsrH   matmulfp32_precisiontriton.testingr  r  rQ   get_device_capabilityr   rM  rO  inspect	signature
parametersr_  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clocks          rW   get_device_tflopsr  I  s    u~~**11@@FJG M**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\JJ8!#U]]ENN33,U==>>%%44>,U]]HEE&u}}h??U]]ENN33,U33>>%%44>,U]];;&u}}55r{   c                     SSK Jn   U " 5       $ )Nr   get_dram_gbps)r  r  r  s    rW   get_gpu_dram_gbpsr  t  s    ,?r{   c                 x    SSK Jn   U R                  R                  R	                  S5      R                  SS5      $ )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r_  r  s    rW   get_gpu_shared_memoryr  {  s.    %==44Q7;;<LaPPr{   c                     [         R                  R                  5       (       aT  [         R                  R                  5       R                  n [         R                  R                  5       R
                  nX-  $ Sn SnX-  $ )NrP  i   )rP   rH   rQ   r  	warp_sizemax_threads_per_block)r  r  s     rW   get_max_numwarpsr    sg    zz  JJ446@@	 %

 @ @ B X X
 !-- 	 $ --r{   c                $    U R                  S5      $ )Nwelford)r  reduction_types    rW   is_welford_reductionr    s    $$Y//r{   c                4    [        U 5      (       a  gU S:X  a  gg)Nr   online_softmax_reducer  r7   )r  r  s    rW   reduction_num_outputsr     s    N++	2	2r{   c                 2    [         R                  " 5       S:H  $ )NLinux)platformsystemr   r{   rW   is_linuxr    s    ??''r{   c                 (    [         R                  S:H  $ )Nrk   )r  r  r   r{   rW   r   r     s    <<7""r{   c                &    [        S U  5       5      $ )Nc              3     #    U  H7  n[        U[        R                  5      =(       a    UR                  (       + v   M9     g 7fr   )r}   r~   r3  r  r  s     rW   r   #has_free_symbols.<locals>.<genexpr>  s)     Jcz!UZZ(<_<cs   ?Ar  )itrs    rW   rD  rD    s    JcJJJr{   c            	        SSK Jn  U  H  n[        X!R                  UR                  UR
                  UR                  UR                  45      (       aR  [        UR                  5       =(       d    S5      (       d'  [        UR                  5       =(       d    S5      (       a    gM  [        X!R                  5      (       d  M  [        S[        U5       35      e   g)Nr7   r  r   Tzunexpected type for is_dynamic F)r  r  r}   r9  r;  rE  r  r>   rD  maybe_get_sizemaybe_get_strider@   	TypeErrorr  )r   r  r  s      rW   r  r    s    bmmR[[":K:KRYYW
 
   0 0 2 8b99=M""$*> > > Ayy))=d1gYGHH  r{   c                      \ rS rSrSrSrSrg)Placeholderi  KERNEL_NAMEDESCRIPTIVE_NAMEr   N)r   r   r   r   r  r  r   r   r{   rW   r  r    s      K *r{   r  c                v   SSK Jn  [        R                  " SSS9 n[        R
                  " 5       n[        R
                  " 5       n[        U[        U5      S9R                  " U6   [        SUR                   3US9  [        UR                  US9  [        R                  " 5       n[        X5         U " UR                  5        S S S 5        [        R                  " 5       U-
  n	U" UR                  5        UR                  R                  5         UR                  5         [        S	UR                   3US9  [        UR                  US9  UR!                  5       UR!                  5       :H  n
["        R%                  S
UUR&                  U
U	5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr7   )stable_topological_sortrv  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior   r^   rZ   	propagater  ry  r   nowr]   lint	recompiler`  r   r  r   )r  r  inpr  r  r(  	before_ioafter_io
start_timetime_elapsedr  s              rW   pass_execution_and_saver%    sE    9		$	$
 
KKM	;;=R#3C#89CCSI	"(($1-bhhY'\\^
#B,N -||~
2)


#!,bhhX& H$5$5$77hFF	
+
 
 -,
 
s%   BF*2FCF*
F'	#F**
F8c                f    SSK Jn  [        XR                  5      =(       a    U R	                  5       $ )z:
Check if input buffer is a multi-outputs template buffer
r7   r  )r  r  r}   r  is_multi_outputs_template	input_bufr  s     rW   r'  r'    s-      	9//0 	2//1r{   c                    SSK Jn  [        XR                  5      =(       a7    [	        U R
                  5      S:H  =(       a    [        U R
                  S   5      $ )zD
Check if input buffer is a output of multi-outputs template buffer
r7   r  r   )r  r  r}   MultiOutputrR   r  r'  r(  s     rW   #is_output_of_multi_outputs_templater,    sJ      	9nn- 	;	  !Q&	;%i&6&6q&9:r{   c                   U c  gSSK Jn  [        XR                  5      =(       a:    [        XR                  5      (       + =(       a    US L =(       d    U R
                  UL =(       Gd_    [        U 5      UR                  L =(       Ga@    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       d    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       df    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  $ )NFr7   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r}   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr  rP   r  torchrecr.  defaultr/  r0  ro  r  r  s      rW   is_collectiver8    sM    | 	4--. 	3400	34Z14++r1  	T
b''' 	
 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/r{   c                <    SSK Jn  [        U 5      UR                  L $ Nr7   r  )r  r  r  r2  )ro  r  s     rW   is_waitr;  (  s    :''r{   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      =(       a    US L =(       d    U" U 5      $ )Nr   GroupedSchedulerNodec              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_collectiver  s     rW   r   &contains_collective.<locals>.<genexpr>5  s     @<a&q))<r   )r  r>  r}   r  snodesr8  ro  )snode	filter_fnr>  s      rW   r@  r@  .  sJ     ?%..@5<<@@@$P)t*;*Oy?OPr{   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      $ )Nr   r=  c              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_waitr  s     rW   r    contains_wait.<locals>.<genexpr>>  s     :\=##\r   )r  r>  r}   r  rB  r;  ro  )rC  r>  s     rW   rG  rG  :  s4    >%..:U\\:::uzz""r{   c                    SSK Jn  [        U[        R                  R
                  5      (       a  U/n[        XR                  5      =(       a    U R                  U;   $ r:  )r  r  r}   rP   r  r  r4  r3  r7  s      rW   is_fallback_oprJ  C  sF     "ejj++,,Td--.I43C3Cr3IIr{   c                @    X!U    R                   R                  5          $ r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rW   buf_name_to_fused_snoderP  N  s!     (3??HHJKKr{   c                    gr  r   rC  s    rW   r  r  Y      ur{   c           	         U" U 5      (       a  g UR                  U 5        U R                   H-  n[        UR                  X#5      nXa;   a  M   [	        UUUUUS9  M/     g )Ncriteria_cb)rb  unmet_dependenciesrP  r   find_recursive_deps_of_node)rC  collected_node_setrN  rO  rV  depdefining_op_for_deps          rW   rX  rX  T  sf     55!''5HHk
 4##	
 (r{   c                    gr  r   rR  s    rW   r  r  r  rS  r{   c           
        U" U 5      (       a  g UR                  U 5        U R                  5        H  nUR                   H  nUR                  c   eUR                  R	                  5       S:X  a  M2  UR                  R	                  5       U;  a  MR  X6R                  R	                  5          nXq;   a  Mu  [        UUUUUS9  M     M     g )NOUTPUTrU  )rb  get_outputsr  ro  r  find_recursive_users_of_node)rC  rY  rN  rO  rV  or~  user_ops           rW   r`  r`  m  s     55! GGD99(((yy!!#x/yy!!#+==(););)=>G,(""'  !r{   c                j    [         R                  R                  R                  (       a  SOSnX-
  U-
  $ )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rP   
_functorchri   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rW   num_fw_fixed_argumentsri    s3     $$::   69SSSr{   c                   SS jnSn/ nU R                   R                   H8  nUR                  S:X  d  M  U" U5      (       a  UR                  U5        US-  nM:     U[	        [        [        U5      5      5      :X  d   e[        U5      $ )z6
Infers which inputs are static for a backwards graph
c                    SU R                   ;  =(       a;    SU R                   ;  =(       a%    SU R                   ;  =(       a    SU R                   ;  $ )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r;  s    rW   is_saved_tensor'count_tangents.<locals>.is_saved_tensor  sH    aff$ .!&&(.!/.  qvv-		
r{   r   r  r7   )rT   r5   r   r9  )ry  r]  r  r  re  r   rR   )fx_grp  	arg_countstatic_arg_idxsr+  s        rW   count_tangentsru    s    

 IOZZ44= q!!&&y1NI	  d5_)=#>????r{   c                    / n[        U R                  R                  5       HK  u  p#UR                  S:w  a    U$ UR                  R                  S5      (       d  M:  UR                  U5        MM     U$ )z
Returns indices of backward graph inputs that are always at fixed
addresses: primals (parameters/buffers/user inputs saved for backward).
Excludes saved activations which may not be at fixed addresses when
the forward is partitioned for CUDA graphs.
r  primals_)r\  ry  r]  r  r   r  r  )rr  static_idxsrl  r+  s       rW   get_static_bw_input_idxsry    sh     KDJJ,,-44=   66Z((s#	 .
 r{   c                  >    \ rS rSr% S\S'   SS jr\S	S j5       rSrg)
	BoxedBooli  r9  r   c                    U R                   $ r   )r   rg  s    rW   rv  BoxedBool.__bool__  s    zzr{   c                @    [        U [        5      (       a	  SU l        U $ gr  )r}   r{  r   r  s    rW   disableBoxedBool.disable  s    c9%%CIJr{   r   Nr  )r  r   r   zBoxedBool | bool)	r   r   r   r   r   rv  r  r  r   r   r{   rW   r{  r{    s     K  r{   r{  c              #     ^ ^#    SSK Jn  UR                  m   S             SU U4S jjjn[        R                  R                  USU5         S v   S S S 5        g ! , (       d  f       g = f7f)Nr7   r9   c                :   > TR                  U5        T" XX#XE5      $ r   rx  )r  kernel_namer  ro  gpucpp_definitionkernel_listorig_define_kernels         rW   define_kernel.collect_defined_kernels.<locals>.define_kernel  s'     	;'!{c
 	
r{   r  )NTN)r  r:   r  r  r  r  ro  r  r  r9  r  r  r   r   )codegen.wrapperr:   r  r   rf  r  )r  r:   r  r  s   `  @rW   collect_defined_kernelsr    s     5-;;  $%)
"

 
 	

 
 #
 

 
 
		/-	P 
Q	P	Ps   AA2A!	A2!
A/+A2c                    U S-   $ )N__original__r   r  s    rW    get_cloned_parameter_buffer_namer    s    .  r{   c                    U [         ;   $ r   )rN   r  s    rW   r  r    s    Yr{   c                 :    [         R                  R                  SL$ )z,Check if we're running on ROCm/HIP platform.N)rP   re  rq   r   r{   rW   is_rocmr    s    ==D((r{   c                0    U S:g  =(       a    [        U 5      $ )NrI   )r  r  s    rW   device_need_guardr    s    U?-vf~-r{   c                h   U [         R                  :X  aD  [         R                  R                  5       (       a!  [         R                  R	                  5       S:  $ U [         R                  :X  a$  [         R
                  R                  5       (       a  gU [         R                  [         R                  4;   $ )N)r  r   T)rP   rM  rH   rQ   r  rJ   r]  r9  rD  s    rW   ,needs_fallback_due_to_atomic_add_limitationsr    sq    5::#:#:#<#<zz//1F::	%..	 UYY%;%;%=%=ejj111r{   c                   U R                   [        R                  R                  R                  [        R                  R                  R
                  4;   a  Uc  gU R                   [        R                  R                  R                  :X  a  SOSnUS U4;  =(       Gd&    U=(       a    [        U5      =(       a    [        U5      =(       d    U R                   [        R                  R                  R                  :H  =(       ap    US:H  =(       ad    U=(       a[    US:H  =(       aO    [        R                  R                  =(       a.    [        R                  R                  =(       d    [        5       S:g  =(       dJ    X:H  =(       a#    U[        R                  [        R                  4;   =(       d    [        R                   " 5       $ )NFrb  r  r  r7   )overloadpacketrP   r  atenscatter_reduce_scatter_reducescatter_r  r  ri   rA  fallback_scatter_reduce_sumdynamic_threadsrG  r9  r]  $are_deterministic_algorithms_enabled)r3  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rW   use_scatter_fallbackr    s]    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 'SJ5::u{{:S,S	8 557!r{   c                   SSK JnJn  SSKJn  [        S[        U 5       S35        [        U 5       GH.  u  pE[        SUS S35        XRL a  [        S	5        M'  XQL a  [        S
5        M8  [        XS5      (       a  UR                  5       n[        U(       a  SOS S35        U(       a;  UR                  c   e[        SUR                  R                  R                   35        [        S5        UR                  R                   H  n[        U5        M     [        S5        UR                  R                   H  n[        U5        M     GM  [!        S[#        U5       35      e   g)z
An API that can be used in pdb to dump a node_schedule.
Right mainly dump the read/write dependencies but can add more as needed.
r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr5  3r	  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  r  rR   r\  r}   is_reductionro  r:  reduction_hintrP  rQ  rR  r  r  )r  r  r  r  rl  ro  is_redrZ  s           rW   dump_node_scheduler  %  s&   
 O7	M 236
:;}-	#al"$%%%&,,&&(FfU$/?@yy,,,01N1N0OPQ*''--c
 .+''..c
 / !9$t*FGG+ .r{   c                z    SSK Jn  U" U R                  5       [        U R                  5      -  [
        -  S:H  5      $ )Nr   )rs  )r  rs  storage_offsetrF  r   GPU_ALIGN_BYTES)r   rs  s     rW   tensor_is_alignedr  F  s:     L 				 >&,,#?	??RVWW r{   c                    [        U R                  R                  5      (       d  g[        R                  =(       d    [        U 5      $ r  )r  r   r  ri   assume_aligned_inputsr  )example_inputs    rW   should_assume_input_alignedr  T  s5     -&&++,,''K+<]+KKr{   c                 X   [         R                  R                  R                  5       n U (       d  [        R
                  " 5       $ U R                  (       a  U R                  R                  (       d  [        R
                  " 5       $ U R                  R                  nUR                  5       $ r   )	rP   _guardsTracingContexttry_getr  nullcontextr  r{  suppress_guards)tracing_contextr{  s     rW   #maybe_get_suppress_shape_guards_ctxr  ]  sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&r{   c                "   [         R                  R                  R                  [        SS5         [
        R                  R                  5         SS KnSS K	nUR                  " 5       nUR                  " U5      nSSKJn  UR                  U5        UR                  nUR!                  UR"                  5        U " U0 UD6n	UR%                  5       n
UR!                  U5        UR'                  U5        S S S 5        X4$ ! , (       d  f       W	W
4$ = f)Nr   Tr   )output_code_log)r  r   rf  r  ri   rP   rV  rg  r  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr`  removeHandler)r   r   r  r  r  log_capture_stringchr  
prev_levelr  r   s              rW   run_and_get_cpp_coder  m  s     
			#	#FGT	:[[]""#56=""2&$**
  /T$V$'')  ,%%b) 
;  9! 
;	:  19s   CC==
Dc                :   [        U 5      nUb  UR                  $ U  H  n[        U[        R                  5      (       a  UR
                  R                  s  $ [        U[        R                  5      (       d  M[  UR                  5        H<  n[        U[        R                  5      (       d  M$  UR
                  R                  s  s  $    UR                  5        H<  n[        U[        R                  5      (       d  M$  UR
                  R                  s  s  $    M     g r   )	rZ   r{  r}   rP   r2   ro  r  r  rJ  )r  r  inputr  rJ  s        rW   shape_env_from_inputsr    s     (I """ eU\\**::''' eU\\**

dELL1199... %  ,,.fell33!;;000 )  r{   c                B   ^ ^^ [        T5      S:X  a  T $ SUU U4S jjnU$ )Nr   c                   > [        U TT5      u  pT" U 5      n[        U5      (       a  [        R                  " X5        U$ r   )copy_misaligned_inputsrR   rP   _foreach_copy_)
new_inputsold_tensorsnew_tensorsr  inputs_to_checkr  mutated_input_idxss       rW   r  )align_inputs_from_check_idxs.<locals>.run  sD    #9);$
  J {  :
r{   )r  list[InputType]r   r   )rR   )r  r  r  r  s   ``` rW   align_inputs_from_check_idxsr    s(    
 ?q   Jr{   c                X   SU R                  5       ;   a  SnO;[        S [        U R                  5       U R                  5       5       5       5      S-   n[        R
                  " X4S5      R                  5       n[        R
                  " X R                  5       U R                  5       5      $ )Nr   c              3  6   #    U  H  u  pUS -
  U-  v   M     g7fr  r   )r   rE  rJ  s      rW   r   )clone_preserve_strides.<locals>.<genexpr>  s     T:Sf$:Ss   r7   r   )r  r  r   rJ  rP   
as_stridedclone)rT   needed_sizer>  s      rW   clone_preserve_stridesr    s    AFFH} T#affh
:STTWXX 	 a6<<>FFFFHahhj99r{   c                T   / n/ nUSLnU H  nX   n[        U[        R                  5      (       d   S[        U5       35       eUR	                  5       [
        -  (       d  MW  [        U5      X'   U(       d  Mm  Xb;   d  Mt  UR                  U5        UR                  X   5        M     X44$ )z
Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
cloned tensor which is in `return_pair_idxs`.
Nz Expected tensors only, but got: )r}   rP   r  r  data_ptr	ALIGNMENTr  r  )r  check_inputs_idxsreturn_pair_idxsr  r  ret_pair_definedr   _inps           rW   r  r    s     ')K&(K (t3}$-- 	
.tDzl;	
- ==?Y&&248JMA$9""4("":=1  ##r{   c                    / nU HV  nX   n[        U[        R                  5      (       d  M(  UR                  5       [        -  S:X  d  ME  UR                  U5        MX     [        U5      [        U5      :w  a  U$ U$ )zO
We require all inputs to be aligned, so introduce a copy for any
that aren't.
r   )r}   rP   r  r  r  r  rR   )r  static_input_idxsaligned_static_input_idxsrl  r  s        rW   remove_unaligned_input_idxsr    sp     !# eU\\**0@90LQR/R%,,S1 ! $%->)??((r{   c                   SSK Jn  [        R                  " [        R                  5      R
                  nUR                  R                  R                  nUR                  R                  R                  R                  n[        R                  (       a&  UR                  R                  R                  X5        gUR                  R                  R                  X:*  5      (       a  gUR                  (       a.  UR                  R                  R                  U S:  5      (       a  gU" U 5      =(       a    U" U 5      U:*  $ )zCheck if an expression fits within 32-bit integer range.

NOTE: This function intentionally does not install guards. Callers are
responsible for guarding (e.g. via check_leq) when they decide to use
32-bit indexing based on this result.
r7   ru  Tg@xDF)rx  rv  rP   ro  r\  r   ry  rz  guarding_hint_or_throwr{  has_guarding_hintri   assume_32bit_indexing	check_leqrs  r  )r   rv  int_maxr  r  s        rW   expr_fits_within_32bitr    s     kk%++&**GWW--DD((22DD##	""1. 	ww--al;; 	 7711!d(;;  QH$:1$=$HHr{   c                6  ^^^ [         R                  R                  R                  5       nUb  UR                  b  [        UR                  5      S:X  d   e[        U 5      mUR                  c   eUR                   H  nUc  UR                  R                  S 5        M#  Sm[         R                  R                  R                  5       =n(       a  UR                  mSUU4S jjmUR                  R                  [        U4S jU 5       5      5        M     g g g )Nr   Fc                r   > Tc  [        U 5      $ T(       a  TR                  U 5      $ TR                  U 5      $ r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callr{  s    rW   map_expr4set_tracing_context_output_strides.<locals>.map_expr5  s7     ("1v((<<Q??$55a88r{   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r   r  s     rW   r   5set_tracing_context_output_strides.<locals>.<genexpr>=  s     5u!(1++urf  )r   r   r   z)float | int | SymInt | SymFloat | SymBool)
rP   r  r  r  output_stridesrR   r  r  r  r  )r  compiled_graphrI  rA  r  r  r  r{  s        @@@rW   "set_tracing_context_output_stridesr  $  s     mm**224Gw55A7))*a///).9	,,888#22E}&&--d3$)!--66>>@@3@(+(=(=%9 9 &&--5u55 3	  Br{   c                 4   [         R                  b  [         R                  $ [         R                  " 5       (       d  g[        R                  R                  5       (       a  g SSKJn   U [        R                  R                  S5      :  $ ! [         a     gf = f)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
ri   fx_graph_remote_cache	is_fbcoderP   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rW    should_use_remote_fx_graph_cacher  A  s    ##/+++,,..H  5#8#8#M#M8$    s   "B
 

BBc                2    [         R                  " SSU 5      $ )Nz[^a-zA-Z0-9_]r   )r   subr  s    rW   normalize_namer  T  s    66"C..r{   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                j    [         R                  S[        U 5      5      n[        R	                  X5      $ )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r  _triton_type_mappingr_  )r   triton_type_names     rW   triton_typer  j  s+    &**5#e*=##$4GGr{   c                    [         R                  X 5      nUR                  SS5      n[        [        U5      n[        U[        R                  5      (       d   eU$ )Nr  r  )_torch_triton_mappingr_  r  rO   rP   r}   r   )r   adjusted_type	type_namer/  s       rW   triton_type_to_torchr  p  sM    )--e;M%%eR0Iy)Ii----r{   c                   U R                   (       + =(       a    U R                  5       UR                  5       :H  =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       a    U R                  UR                  :H  =(       ae    U R                  5       R                  5       UR                  5       R                  5       :H  =(       a!    U R                  5       UR                  5       :H  $ r   )	is_mkldnnr  rJ  r   r   untyped_storager  r  r:  r   s     rW   is_same_tensorr   x  s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;r{   c                   U R                   =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       as    U R                  UR                  :H  =(       aS    [        R
                  R                  R                  U 5      [        R
                  R                  R                  U5      :H  $ r   )r  r  r   r   rP   r  mkldnnr  r  s     rW   is_same_mkldnn_tensorr#    s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOr{   c                     g)N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   r{   rW   boolean_opsr1    s    r{   c                  *    \ rS rSr% S\S'   S\S'   Srg)OpDtypeRulei  r3   type_promotion_kindtorch.dtype | Noneoverride_return_dtyper   NrJ  r   r{   rW   r3  r3    s    88--r{   r3  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                (    [        X5      [        U '   g r   )r3  r7  )r   r4  r6  s      rW   #register_op_dtype_propagation_rulesr9    s    
 (3(t$r{   zOrderedSet[str]op_requires_libdevice_fp64c                .    [         R                  U 5        g r   )r:  rb  r  s    rW   #register_op_requires_libdevice_fp64r<    s    ""4(r{   c                   SSK Jn  U (       d$  UR                  R                  5       R                  n U S:X  a  [
        R                  $ U S:X  a  gU S:X  a  [
        R                  $ U S:X  a  [
        R                  $ [
        R                  $ )Nr   ru  r  rI   rJ   tpu)
r  rv  ry  get_current_device_or_throwr  ri   cpu_backendxpu_backendtpu_backendcuda_backend)r   rv  s     rW   get_current_backendrD    ss    -gg99;@@e!!!				!!!		!!!"""r{   c                    U [         R                  [         R                  4;   a=  [        R                  R
                  (       a  [        5       S:X  a  [         R                  $ U $ )z"Maybe upcast [b]float16 to float32r  )rP   r   rM  ri   r  codegen_upcast_to_fp32rD  rO  rD  s    rW   upcast_compute_typerG    s@     	%--00MM00!X-}}Lr{   KeyTypeValTypec                  v    \ rS rSrSrSS jrSS jrSS jrSS jrSSS jjr	SS	 jr
SS
 jrSS jrSS jrSrg)
ScopedDicti  z
A dictionary-like object that allows for scoped updates. It maintains
an original dictionary and a set of new items that can override
the original items within the scope.  The original dictionary is
unmodified.
c                    Xl         0 U l        g r   original_dict	new_items)r  rN  s     rW   rU  ScopedDict.__init__  s    *13r{   c                \    XR                   ;   a  U R                   U   $ U R                  U   $ r   rO  rN  r  s     rW   r-  ScopedDict.__getitem__  s,    .. >>#&&!!#&&r{   c                     X R                   U'   g r   )rO  )r  rf  r   s      rW   __setitem__ScopedDict.__setitem__  s    #sr{   c                H    XR                   ;   =(       d    XR                  ;   $ r   rR  r  s     rW   __contains__ScopedDict.__contains__  s    nn$A/A/A(AAr{   Nc                t    XR                   ;   a  U R                   U   $ U R                  R                  X5      $ r   )rO  rN  r_  )r  rf  r6  s      rW   r_  ScopedDict.get  s2    .. >>#&&!!%%c33r{   c                    [        U R                  5      nU R                   H  nX R                  ;  d  M  US-  nM     U$ r  )rR   rN  rO  )r  r+  r  s      rW   r  ScopedDict.__len__  s<    ""#A***Q   r{   c              #     #    U R                    S h  vN   U R                   H  nXR                   ;  d  M  Uv   M     g  N-7fr   rM  )r  r  s     rW   __iter__ScopedDict.__iter__  s8     %%%%A***   	&s   AA  A
Ac                R    [        U R                  =(       d    U R                  5      $ r   )r9  rN  rO  rg  s    rW   rv  ScopedDict.__bool__  s    D&&8$..99r{   c                    [         er   r  r  s     rW   __delitem__ScopedDict.__delitem__  s    !!r{   rR  )rN  Mapping[KeyType, ValType])rf  rH  r   rI  )rf  rH  r   rI  r   r  )rf  r  r   r9  r   )rf  rH  r6  ValType | Noner   rg  r  )r   zIterator[KeyType]r  )rf  rH  r   r  )r   r   r   r   r   rU  r-  rU  rX  r_  r  r_  rv  rd  r   r   r{   rW   rK  rK    s5    4'
$B4
:"r{   rK  )frozen_defaultc              .   ^ SU4S jjnU c  U$ U" U 5      $ )Nc                0   > [         R                  " U STS9$ )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rW   wrapir_dataclass.<locals>.wrap  s    $$S$vFFr{   )r   rl   r   rl   r   )r   r   rn  s    ` rW   ir_dataclassrp    s    G {9r{   c                     [         R                  R                  R                  5       n U b'  U R                  (       a  U R                  R
                  $ g r   )rP   r  r  r  fw_metadatabw_donated_idxs)r  s    rW   get_donated_idxsrt    s=    mm22::<O"'B'B**:::r{   c                  (    \ rS rSrSrSrSrSrSrSr	g)	TritonAttrsDescriptorVersioni  r   r7   r  r   rP  r   N)
r   r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   r   r{   rW   rv  rv    s     LKK	  Gr{   rv  c                 f   [         R                  R                  S5      c  [        R                  $ SS Kn SS Kn [        U R                  R                  S5      (       a  [        R                  $ [        U R                  R                  S5      (       a  [        R                  $ [        R                  $ )Nr  r   AttrsDescriptor)r  r  r  rv  rw  triton.backends.compilertriton.compiler.compilerr  r  compilerry  rx  r{  )r  s    rW   #get_triton_attrs_descriptor_versionr  '  s    ~~)1+888##v''):;; ,777	))+<	=	=+777 ,333r{   c                 8    [        5       [        R                  :H  $ r   )r  rv  r{  r   r{   rW   triton_version_uses_attrs_dictr  A  s    .04P4X4XXXr{   c                    U R                  5       n[        U [        R                  R                  5      (       a  U SU R
                   3OUnX4$ )Nrp   )r   r}   rP   r  r  _overloadname)r  op_overload_packet_nameop_overload_names      rW   get_op_namesr  E  sR    #%779 b%**//00 #
#1R%5%5$67$ 
 #44r{   c                |   SSK Jn  U R                  n[        U[        R
                  R                  5      (       d  gU[        R                  R                  R                  R                  [        R                  R                  R                  R                  [        R                  R                  R                  R                  4;   as  U" X R                  U R                  SS9nUbT  Uu  pEUS   nU HE  nUc  M  UR                  S   R                   [        R"                  [        R$                  4;   d  ME    g   g)z
Check if an FX node is cudagraph-unsafe based on its input arguments.

Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
with boolean indices triggers .nonzero() during capture, but integer indices
are safe).
r   )normalize_functionFT)normalize_to_only_use_kwargsindicesrr  )torch.fx.operator_schemasr  r  r}   rP   r  r  r  r  	index_putr6  
index_put__unsafe_index_putr   r  r  r   r9  r^  )r  r  r  
normalizedr   r  r  rl  s           rW   ,_fx_node_is_input_dependent_cudagraph_unsafer  O  s     =^^Ffejj3344 		  ((		!!))		((00 
 (LL'..t

 !"IAY'G?sxx'<'<JJKKA (    r{   c                   U R                   n[        U5      [        ;   a  g[        U[        R
                  R                  5      (       a3  [        R                  R                  R                  UR                  ;   a  g[        U 5      (       a  gU R                  R                  S5      =nb]  [        U[        [        45      (       d  U/OUnU H7  n[        U[        R                   5      (       d  M$  UR"                  (       d  M7    g   g)a  
Check if an FX node is cudagraph-unsafe.

This includes:
- Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
- Ops with the cudagraph_unsafe tag
- Input-dependent unsafe ops (e.g., index_put with boolean indices)
- Ops with sparse tensor outputs
Trr  F)r  r  FORBIDDEN_CUDAGRAPH_OPSr}   rP   r  r  r  r  cudagraph_unsafer  r  r  r_  re  r  r  	is_sparse)r  r  rr  valsr   s        rW   r  r  s  s     ^^F 6{-- 	65::0011HHLL))V[[8 4G<< ||&&3&sT5M::uA!U\\**q{{{  r{   c                    SSK Jn  [        XR                  UR                  45      (       a  g[        XR
                  UR                  45      (       d  g[        U SS5      nUb  [        U5      (       a  gg)aH  
Returns True if the node is an op that is not cudagraphable.
This includes:
- Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
- Ops with the cudagraph_unsafe tag
- index_put_ with boolean indices (triggers .nonzero() during capture)
- Control flow nodes (Conditional, WhileLoop)
- Ops with sparse tensor outputs
r7   r  TFr  N)	r  r  r}   Conditional	WhileLoopr4  r?   rO   r  )ro  r  r  s      rW   is_cudagraph_unsafe_opr    sf      $677d..@AAdIt,G:7CCr{   c                 6   [         R                  R                  SS5      n [        R                  " 5       (       a^  SSKJn  U" 5       nU(       aJ  [         R                  R                  USS5      nU (       a   [         R                  R                  X0/5      OUn U $ )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r_  ri   r  libfb.py.parutilr  r  r"  pathsep)r  r  runtime_pathlib_paths       rW   get_ld_library_pathr    sh    ::>>+R0D5')ww||L)UCH8<2::??H#34(DKr{   c                N    SSK Jn  [        X5      =(       a    U R                  S L$ )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  r}   partition_signatures)r  r  s     rW   #is_codegen_graph_partition_subgraphr    s'    L 	79 	5((4r{   c                     [         R                  R                  R                  R                  =(       d    [
        R                  S L=(       a$    [         R                  R                  R                  $ r   )rP   r  ri   r  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionr   r{   rW   is_using_cudagraph_partitionr    sN    %%00 	F199E1 //
 
 
0
01r{   c                    SSK Jn  UR                  R                  R	                  U S5      (       a;  UR                  R                  R                  U S5      (       a  [        R                  $ [        R                  $ )Nr7   ru  l        i   )	rx  rv  ry  rz  statically_known_ltrU  rP   r\  r]  )r  rv  s     rW   dtype_from_sizer    sX    ww++e 
''


/
/h
?
?{{{{r{   )r  rJ   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN BF16.
r  rJ   TF)rP   r  r"  _is_mkldnn_bf16_supportedr   s    rW   is_mkldnn_bf16_supportedr    3     eyy99;;	+	r{   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN FP16.
r  rJ   TF)rP   r  r"  _is_mkldnn_fp16_supportedr  s    rW   is_mkldnn_fp16_supportedr    r  r{   c           
     x   U Vs/ s H  n[        [        U5      5      PM     nnU  HS  n[        U5      [        U5      :X  d   e[        U5       H'  u  pR[        X5   [        [        U5      5      5      X5'   M)     MU     / nUR	                  SR                  S [        X5       5       5      5        [        U5      [        U5      S-  -   [        U5      S-
  -   nUR	                  SU-  5        U  H3  nUR	                  SR                  S [        XC5       5       5      5        M5     SR                  U5      $ s  snf )N|c              3  4   #    U  H  u  pS X  S 3v   M     g7fr5  Nr   )r   hrv  s      rW   r   tabulate_2d.<locals>.<genexpr>  s     H3G41AaWA,3G   r  r7   r  c              3  4   #    U  H  u  pS X  S 3v   M     g7fr  r   )r   r   rv  s      rW   r   r    s     H7Gtq!Cl7Gr  rY  )rR   r  r\  r   r  r"  r   r  )elementsheadersr   widthsrowr   r  total_widths           rW   tabulate_2dr    s    #*+7ac#a&k7F+3x3w<'''cNDAFIs3q6{3FI #  E	LLH3w3GHHIf+Vq1S[1_EK	LL{"#SXXHs37GHHI 99U ,s   D7c              #     #    [        U R                  5       5      [        UR                  5       5      -  nU H6  nU R                  U5      nUR                  U5      nUUb  UOUUb  UOU4v   M8     g7f)a  
Zip two dictionaries together, replacing missing keys with default values.

Args:
    dict1 (dict): The first dictionary.
    dict2 (dict): The second dictionary.
    d1_default (Any): the default value for the first dictionary
    d2_default (Any): the default value for the second dictionary

Yields:
    tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
           and the value from dict2 (or d2_default if missing).
N)r!   r`  r_  )dict1dict2
d1_default
d2_defaultall_keysrf  value1value2s           rW   	zip_dictsr    sp     ( %**,'*UZZ\*BBH 33 (Fj(Fj
 	
 s   A1A3c                v           SS jn        SS jnU R                  S[        R                  R                  5      nU R	                  5       n U(       aq  U" U SS5        U" U SS5        U" U S[
        R                  R                  (       + 5        U" U SS	5        U" U S
[        R                  R                  5        U" U SS5        U R                  S[        R                  R                  5      nU R                  S[        R                  R                  5      nUS:X  a  U(       a  [        S5      eU $ )a
  
Ensures the configuration is internally consistent for standalone AOTInductor.

If `aot_inductor_mode.compile_standalone` is set to True in the provided
`config_patches` (or falls back to the global config), this function ensures
that the following configs are also enabled:
    - `aot_inductor.package_cpp_only`

Args:
    config_patches (dict[str, Any]): A dictionary of user-provided config
        overrides for AOTInductor compilation.

Returns:
    dict[str, Any]: The possibly-updated `config_patches` dictionary.
c                    U R                  U[        [        U5      5      nUc  X U'   g U(       d  X2:w  a  [        SU SU S35      eg g )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)r_  rO   ri   r  config_patchesconfig_nameconfig_valuer   s       rW   patch_config2maybe_aoti_standalone_config.<locals>.patch_config9  sY     "";0LM=*6;'50";-q>qr  1r{   c                    U R                  U[        [        U5      5      nX2:w  a  [        R	                  SUU5        X U'   g )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)r_  rO   ri   r   r  r  s       rW   force_patch_config8maybe_aoti_standalone_config.<locals>.force_patch_configD  sB     "";0LM KKV
 '3{#r{   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)r  dict[str, Any]r  r  r  r   r   r  )r_  ri   aot_inductor_modecompile_standalonecopyrP   re  rq   test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor  )r  r  r  r  r  r  s         rW   maybe_aoti_standalone_configr  (  sk   "	&	58	HK			
3&
358
3HK
3	
3 (++.  33
 $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 	(,,	

 	>+I5Q*..,11
 -00.33
 	).E]
 	

 r{   c                   [         R                  R                  (       a)  [         R                  R                  S:X  a  [	        S5      e[         R                  R                  (       a0  [         R                  R
                  S:X  a  [	        S5      eSnSnX!4$ [         R                  R                  S:X  a  SnSnX!4$ U S::  a  gSn[         R                  " 5       (       + nX!4$ )	z
Decide whether we should mmap weights, and whether to store the weights with .so.

If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

Returns tuple (use_external_weights, use_mmap_weights).
binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.r  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)ri   r  force_mmap_weights package_constants_on_disk_formatr  r  r  )consts_sizeuse_mmap_weightsuse_external_weightss      rW   determine_aoti_mmap_flagsr  }  s     	..@@MQJ
 	

 --44	A]   $#55;;}L# #55m# !++--11r{   c                     SSK Jn   U R                  R                  nUc  g[	        U[
        5      (       d  [        S5      eUS:X  a  g[        R                  " SU5      (       d  [        S5      eg)zD
Validates if a model name is suitable for use in code generation.

r   rh   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  ri   r  model_name_for_generated_filesr}   r  r  r   r   )ri   
model_names     rW   is_valid_aoti_model_namer    sn    
 '$$CCJj#&&OPPR 88/<<d
 	
 r{   c                <    U(       a  [        U 5      $ [        U 5      $ r   )r)   r(   )rT   unbacked_onlys     rW   get_free_symbolsr    s    $Q''Ar{   c                 *   0 [         R                  ES[         R                  R                  S[         R                  R	                  [
        R                  5      5      0En [        R                  " 5       (       a  [        R                  " S5      U S'   U $ )z9
Get a base environment for running Python subprocesses.

PYTHONPATHTORCH_CUSTOM_PYTHONPATHr:  
PYTHONHOME)r  r  r_  r  r"  r  r  ri   r  	sysconfigget_path)envs    rW   python_subprocess_envr    so    

** 	bjjnn%rzzsxx'@
	C  %..v6LJr{   c                  .    \ rS rSr% SrS\S'   S\S'   Srg)CUDAGraphWrapperMetadatai  z
Metadata for Customized CUDAGraphWrapper.

Currently assumes there is 1 dynamo graph and will extend to
multiple graphs in the future.
r   num_partitionspartition_indexr   Nr   r   r{   rW   r  r    s      r{   r  .c                  $    \ rS rSr% SrS\S'   Srg)CUDAGraphWrapperi  NzCUDAGraphWrapperType | Noner  r   )r   r   r   r   r  r   r   r   r{   rW   r  r    s    +/G(/r{   r  c                    U [         l        g r   )r  r  )r  s    rW   !set_customized_partition_wrappersr    s    5<*2r{   c                H  ^ U R                   R                  nU R                   R                  / UQU R                   R                  QU R                   R                  5      nU R                   R                  n[
        R                  " X45      u  p4SS jnU Vs/ s H:  nU" U5      (       a(  [        R                  R                  R                  USS9OUPM<     nnSS jmSU4S jjnU Vs/ s H
  og" U5      PM     nn[
        R                  " X45      u  pX4$ s  snf s  snf )	Nc                   [        U [        R                  R                  R                  5      =(       aW    [        U [        R                  R                  R
                  [        R                  R                  R                  45      (       + $ r   )r}   rP   r  r  r@   GeneratorStateOpaqueObjectStater;  s    rW   _is_tensor_ir(snode_args_kwargs.<locals>._is_tensor_ir  s^    !U__//667 

__..0B0B0T0TUA
 =
 	
r{   T)replace_symbols_with_hintsc                ,    [         R                  " XUS9$ )Nr   )rP   r   )r  r   r   s      rW   _tensor"snode_args_kwargs.<locals>._tensor&  s    {{4V<<r{   c                   > [        U [        R                  5      (       d  U $ T" U R                  5       U R                  U R
                  5      nU$ r   )r}   rP   r  r  r   r   )r   r  r  s     rW   to_real_tensor)snode_args_kwargs.<locals>.to_real_tensor)  s:    !U\\**Haffh2
r{   r  )r   r  )r   r   r   r   )ro  r  fill_non_provided_argsconstant_argsr  pytreer"   rP   r  r  ir_node_to_tensortree_unflatten)	rC  r   r  	flat_argsflat_args_pytree_specr  r'  r  r  s	           @rW   snode_args_kwargsr    s   ::D::,,*$*))*

D ZZF'-':':D>'J$I
 	 A  	,,Q4,P	 	  = -66Iq"II6((JLD<%  7s   AD,Dc                    SSK Jn  U R                  nUR                  R                  (       a(  UR	                  UR                  R                  S-   5      nUR                  S5      $ )Nr7   ru  r   )rw  r  fwd_rng_statero  rl  )rx  rv  r   ry  removeprefixr  )rZ  rv  dep_names      rW   is_nonfreeable_buffersr!  4  sN    xxH 	ww||(();<I r{   c                x    [        X S3-  5       nUR                  5       sSSS5        $ ! , (       d  f       g= f)z,Load a template file and return its content.z	.py.jinjaN)openread)r   template_dirr(  s      rW   load_templater&  B  s+    	lvY//	0Avvx 
1	0	0s   +
9c                   U R                   n[        U[        R                  R                  [        R                  R
                  45      (       d   S[        U5       35       e[        R                  (       d  g[        [        R                  R                  R                  R                  [        R                  R                  R                  R                  /5      nX;   a  g[        [        R                  R                  R                   /5      n[        U[        R                  R
                  5      (       a  X;   $ [#        U 5      (       + $ )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r}   rP   r  r  r  r  ri   fallback_by_defaultr!   r  r  _assert_scalarr6  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr   )ro  r  "skip_fallback_due_to_dynamic_shapefallback_hopss       rW   should_fallback_by_defaultr/  H  s    [[F&&

(F(FG  O	?V~NO  %% *4IINN))11IINN**22	
*& 3 				@	@AM &%**8899&&&t,,,r{   )z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultz(torch.ops._c10d_functional.isend.defaultz(torch.ops._c10d_functional.irecv.defaultz0torch.ops._c10d_functional.batch_p2p_ops.defaultc                    U [         ;   $ )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)r  s    rW   is_collective_opr2    s    n$$r{   c                 p    [         R                  " 5       (       a	   SSKJn   U $ / $ ! [         a    / s $ f = f)Nr   tlx_only_cuda_options)ri   r  )torch._inductor.fb.tlx_templates.registryr5  r  r4  s    rW   r5  r5    s<    	W(( 		  	I	s   & 55c                    X-   S-
  U-  U-  $ )z(Round x up to the nearest multiple of y.r7   r   )rT   ys     rW   	_round_upr9    s    UQY1!!r{   c                   SSK JnJn  U" US5      (       a  UR                  UR                  4$ [        U5      S:  Ga2  U" US   U S   5      (       a  U" US   S5      (       d%  U" US   S5      (       a,  U" US   U S   5      (       a  UR                  UR                  4$ U" US   U S   5      (       a  U" US   [        U S   S5      5      (       d2  U" US   U S   5      (       a6  U" US   [        U S   S5      5      (       a  UR                  UR                  4$ U" US   [        U S   S5      5      (       a6  U" US   [        U S   S5      5      (       a  UR                  UR                  4$ U[        R                  :X  a  SOSnU[        R                  :X  a  U[        R                  :X  a  [        U S   S5      [        [        XS   -  S5      S5      -  n	[        U S   S5      [        [        XS   -  S5      S5      -  n
U" X)5      (       d  U" X*5      (       a  UR                  UR                  4$ U[        R                   :X  Ga
  [        R"                  R$                  (       d  [        U S   S5      [        [        XS   -  S5      S5      -  n	[        U S   S5      [        [        XS   -  S5      S5      -  n
U" X)5      (       d  U" X*5      (       a  UR&                  UR                  4$  g	[        U S   S5      U-  U S   -  n	[        XS   -  S5      U S   -  n
U" X)5      (       d  U" X*5      (       a  UR&                  UR                  4$ g	)
z2
Core implementation for scale/swizzle inference.
r   )r6   SwizzleTyper7   r  rs   rr   rP  rP  NN)torch.nn.functionalr6   r;  
TensorWise
NO_SWIZZLErR   RowWiserj   BlockWise1x128BlockWise128x128rP   rK  rC  r9  BlockWise1x16SWIZZLE_32_4_4rI  re  rq   BlockWise1x32)mat_size
scale_sizescale_numel	mat_dtypescale_dtypeeq_fnr6   r;  K_multiplierexpected_numel_aexpected_numel_bs              rW   _infer_scale_swizzle_implrO    sg    = [!%%{'='=== :!*Q-!--%
1q2I2I*Q-##jmXa[(I(I&&(>(>>> *Q-!--jmWXa[#%>??*Q-!--jmWXa[#%>??--{/E/EEE AS 9::uqM78A;4@
 @
 //1G1GGG "U%;%;;1L E***{e>Q>Q/Q$Xa[#6LA;.3Q:
 
 %Xa[#6LA;.3Q:
 
 //53W3W,,k.H.HHH e***}}  (!c:Y{2B7>    )!c:Y{2B7>   [33u8 8 #00+2L2LLL8   'x{B7,FRST&|qk'A2FRST[33u8 8 #00+2H2HHHr{   c           	         [        U R                  S   U R                  S   4[        UR                  5      UR                  5       U R                  UR                  S S9$ )aR  
Infer the scaling type and swizzle mode from matrix and scale tensor shapes/dtypes.

This function determines how scale factors are laid out relative to the matrix:
- TensorWise: Single scale for entire tensor
- RowWise: One scale per row
- BlockWise1x128/128x128: Block-scaled with float32 scales
- BlockWise1x32: MXFP8 with float8_e8m0fnu scales (swizzled on NVIDIA)
- BlockWise1x16: NVFP4 with float8_e4m3fn scales (swizzled)

Args:
    mat: The matrix tensor (FP8 or FP4)
    scale: The scale factor tensor

Returns:
    Tuple of (ScalingType, SwizzleType) or (None, None) if unrecognized
r   r7   c                
    X:H  $ r   r   r  s     rW   r  %infer_scale_swizzle.<locals>.<lambda>  s    16r{   rF  rG  rH  rI  rJ  rK  )rO  rE  r  numelr   )matscales     rW   infer_scale_swizzlerW    sO    ( %))A,		!-%KKM))KK! r{   c           	     r  ^ SSK Jm  U R                  5       nUR                  5       nU(       a
  US   US   4nU(       a&  [        R                  " [
        R                  US5      OSnSU4S jjn[        [        U5      S:  a
  US   US   4OUS   S4[        U5      UU R                  UR                  US9$ )z
Infer the scaling type and swizzle mode for IR nodes (used during graph lowering).

This is the IR-compatible version of infer_scale_swizzle, using symbolic
size comparisons via V.graph.sizevars.statically_known_equals.
r   ru  r7   c                N   > TR                   R                  R                  X5      $ )z5Compare values using symbolic equality when possible.)ry  rz  rT  )r'  r(  rv  s     rW   symbolic_eq+infer_scale_swizzle_ir.<locals>.symbolic_eq  s    ww77==r{   r  rS  )r'  r   r(  r   r   r9  )r  rv  rF  r  r  r  r   rO  rR   r  r   )rU  rV  	transposerF  rG  rH  rZ  rv  s          @rW   infer_scale_swizzle_irr]    s     .||~H!J QK!- DN)""8<<Q?STK> %/28}/A(1+x{+QRUVGW$))KK r{   r  )ry   r   r   r   )r   r   r   r9  )   d   )r   Callable[[], Any]r   r   r   r   r   r  )r^  r_  F)
r   r`  r   r   r   r   r   r9  r   r  r  )r   ztorch.device | None | strr   torch.device)r#  zIterable[sympy.Expr]r   r   )r+  ri  r,  ri  r   r   )r#  zIterable[_T]r   zValuesView[_T])r6  rg  r7  rg  r   rg  )rf  r5  r   r  )rl  zIterable[int | torch.SymInt]r   zlist[sympy.Expr])rr  int | torch.SymIntr   rg  )r   rg  r   rb  )rl  zIterable[int | sympy.Expr]r   zlist[int | torch.SymInt])r  torch._ops.OpOverloadr   r9  )r  r5   r  z'Callable[[torch._ops.OpOverload], bool]r   r9  )r  r   r   r  r  r  r   z&tuple[GraphModule, list[torch.Tensor]])rH   )r   r  r   r  )r7   rH   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r  r   r  )r   r  r  g      ?rH   )r  rd  r  re  r   r   r  r   r  r  r   r  r   r  )r  r   r  r  r   r  )r  r   r  r   r   r  )r'  r   r(  r   r   r   )rT   zint | Sequence[int]r  r   r   Sequence[int])rT   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   zCachedMethod[P, RV])r  r  r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  *Sequence[BaseSchedulerNode] | ExternKernelr   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r#  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r  )r  rg  r  r:   r   tuple[str, str]r   )r{  zIterable[torch.fx.Node]r|  zCallable[[Any], bool] | Noner   OrderedSet[torch.fx.Node])r   zSequence[IRNode]r  zdict[str, IRNode]r   rj  r  )r  r   r   zValueRanges[Any])r  r  r   r9  )r  re   rl  r   r   sympy.Symbol)r  r9  r   r9  )r   r  r   rk  )rp  r   r  zdict[sympy.Expr, Any]r   r   )r'  r   r   z&TypeGuard[torch.SymInt | torch.Tensor])r   r   r   r9  )r  torch.fx.GraphModuler   ztorch.fx.Node | None)r  rl  r   r5   )r  rl  r   zOrderedSet[torch.device]r  )r  r   r   r   )rf  r  r   r  r   r  )NNT)r$  zdict[str, Any] | Noner  r  r%  r9  r   r  )r.  re  r*  r9  r   	list[int])r{  r+   r.  z)Sequence[int | torch.SymInt | sympy.Expr]r*  r9  r   rm  )r   rj  r   r   r  r  )r  zint | torch.devicer   r9  r  )r  r   r   ra  r  
int | Noner   r8   )rr   )r  r   r   r   )r@  rA   r  zlist[torch.dtype]r   r9  )r!  r  r   r9  )
r@  rA   r(  r9  r)  r9  r*  r9  r   r9  )rl  r@   r5  rh  r6  r9  r   r9  r  )r@  ri  r6  r9  r   r9  )rl  r@   r5  rA   r6  r9  r   r9  )r  r6   r  r6   r  zlist[ScalingType]r   r9  )r  r   r  r   r@  rA   r  r9  r  r9  r  
Any | Noner  ro  r  ro  r   r9  )
r@  rA   r  r   r+  r   r  r   r   r9  r<  )r@  rA   r  r  r+  r  r  r  r  r@   r  r@   r  zIRNode | Noner  z_IntLike | Noner   r9  )r  r  r   r9  r   )
r  r  r+  r  r  r  r  r   r   r9  )r  r  r+  r  r  r  r   r9  )r  r  r+  r  r  r  r   rm  )r   r  r   r  )r   zNtuple[str | None, Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r@  rA   r   r9  )r@  rA   r$  zReinterpretView | Bufferr%  r@   r   r9  )FTFN)r@  rA   r$  r@   r%  r@   r0  r9  r!  r9  rI  r9  r8  rn  r   r9  )r   Callable[P, _T]r   r  r  r  r   ztuple[_T, list[str]])r   rd  r   ztuple[Any, list[str]])r   rp  r   r  r  r  r   r   )r   rp  r   r  r  r  r   r  )r   rp  r   r  r  r  r   ztuple[Any, list[GraphLowering]])r  rd  r  rd  r   r  )r  rd  r  zCallable[..., Any] | Noner   r   )r  r  r   r  r  )ra  re  r   r9  )r  zSequence[torch.Tensor]r   r9  )rr  r   r   rj  )r  r9  r   r   r  r   r   zIterator[Any])r   rj  r   r  )r  r  r   r9  )r  r  r   r   )r
  zIterable[Any]r   r9  )
r  rd  r  r4   r   re  r  r  r   r  )r)  zBuffer | Operation | Noner   r9  )ro  zNode | Operation | Noner  ztorch._ops.OperatorBase | Noner   r9  )ro  zIRNode | Operation | Noner   r9  )rC  rF   rD  z*Callable[[BaseSchedulerNode], bool] | Noner   r9  )rC  rF   r   r9  )ro  zOperation | Noner  z9torch._ops.OpOverload | Collection[torch._ops.OpOverload]r   r9  )rM  r  rN  r  rO  r  r   r   )rC  rF   rY  zMutableSet[BaseSchedulerNode]rN  zdict[str, SchedulerBuffer]rO  zdict[str, BaseSchedulerNode]rV  zCallable[[Any], bool]r   r  )rf  r   rg  r   r   r   )rr  rl  r   r   )rr  rl  r   rm  )r  r   r   r  )r   r  r   r  )r   r  r   r9  )r   r  r   r9  )r   rj  r   r9  )r3  rc  r  r  r  rj  r  rj  r  r  r  r9  r   r9  )r  rh  r   r  )r   r  r   r9  )r  r  r   r9  )r   r  )r   rp  r   r  r  r  r   ztuple[_T, str])r  Sequence[InputType]r   zShapeEnv | None)r  Callable[[list[InputType]], _T]r  rf  r  zOrderedSet[int]r   rr  )rT   r  r   r  )r  r  r  rf  r  zOrderedSet[int] | Noner   z-tuple[list[torch.Tensor], list[torch.Tensor]])r  rq  r  rf  r   rf  )r   r   r   r9  )r  re  r  rE   r   r  )r   rj  r   r  )r   r  r   rj  )r:  r  r   r  r   r9  )r   ztuple[str, ...])r   r  r4  r3   r6  r5  r   r  )r   r  r   r  )r   r  r   r  )r   rj  r   rj  )r   ztype[Any] | Noner   r9  r   r   )r   zlist[int] | None)r   rv  )r  ztorch._ops.OperatorBaser   ri  )r  torch.fx.Noder   r9  )ro  rB   r   r9  )r  r:   r   r9  )r  r   r   rj  )r   r  r   r9  )r  zSequence[Sequence[T]]r  zSequence[T]r   r  )
r  rf  r  rf  r  rg  r  rg  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r  r  r   r  )r  r   r   ztuple[bool, bool])rT   r*   r  r9  r   zOrderedSet[sympy.Symbol])r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )rC  rF   r   z tuple[list[Any], dict[str, Any]])rZ  r;   r   r9  )r   r  r%  r/   r   r  )ro  rs  r   r9  )r   r   )rT   r   r8  r   r   r   )rF  ztuple[Any, Any]rG  ztuple[Any, ...]rH  r   rI  rj  rJ  rj  rK  zCallable[[Any, Any], bool]r   tuple[Any | None, Any | None])rU  r  rV  r  r   ru  )rU  r>   rV  r>   r\  r9  r   ru  (  
__future__r   rZ  r  rl  enumr  r  r  r  r  r  r  r  r  r  r   r!  r   r  r  r  r  r  r  r  collections.abcr   r   r   r   r   r	   r
   r   r   r   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r~   rP   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   !torch.fx.passes.regional_inductorr   torch.utils._dtype_abbrsr    torch.utils._ordered_setr!   r"   r#   OPTIMUS_EXCLUDE_POST_GRAD#torch.fx.experimental._size_hintingr'   r  r(   r)   r*   r+   r,   r-   r.   pathlibr/   r0   r1   r2   torch._prims_commonr3   torch.fxr4   torch.fx.noder5   r=  r6   r  r8   r  r:   dependenciesr;   ry  r=   r  r>   r?   r@   rA   rB   rC   output_coderE   r  rF   rG   rN   rL   r   rX   torch._dynamo.device_interfacerY   torch._dynamo.utilsrZ   torch.autogradr[   torch.autograd.profiler_utilr\   (torch.fx.passes.graph_transform_observerr]   torch.fx.passes.shape_propr^   torch.utils._sympy.functionsr_   r`   ra   rb   rc   torch.utils._sympy.symbolrd   re   torch.utils._sympy.value_rangesrf   rg   r  ri   runtime.runtime_utilsrj   r5  _IS_WINDOWS	getLoggerr   r   rl   r  r3  	VarRangesr  r   	InputTypegetenvXPU_KERNEL_FORMATGPU_KERNEL_BIN_EXTSr  r  r;  r  r^  rS  r_  rU  r`  r\  r]  r   rM  rO  rQ  rC  rD  rE  rG  rt   r   rx   rz   r   Functionr   rm  r   r   r   r   r  r  r$  r-  r0  ri  rm  rs  r}  r  r  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r   r  r  r  r(  ry  r  r  r  r  r  r  r  r  r  r  r  	frozensetr  r  r  r  r  r  r  r  r  r  r
  r)  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher2  rB  rF  rH  rL  rO  r  r  r]  r  r  r  r  r  r  r  r  r"  r'  r3  rm  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r)  r"  rM  rO  rh  rp  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  rG  r  r  r  r  r  r  r   r  r   rD  r  Enumr  r%  r'  r,  r8  r;  r@  rG  rJ  rP  rX  r`  ri  ru  ry  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  ra  r  compiler  r  r  r   r#  r1  r3  r7  r9  r:  r<  rD  rG  rH  rI  rK  rp  rt  rv  r  r  r  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  PartitionFnTypert  r  r  r  r  r!  r&  r/  r1  r2  r5  r9  rO  rW  r]  )r  r   s   00rW   <module>r     s   "        	     	  	   
               C B    $ $ ? : E 0 / ; ($ 
 <  >>//C$"/,5!$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%! T]UZZ'(	LL3-4	 Ebii(I7S 
  !"  	 
 2<

2 . ( {Q'A-+2B XDX XB5
LENN  d#  $"G GX #(	 
 !	
 4 #(	[[[ 
[ !	[
 [|  ;@
+	*%AP+	%++	"/	#//G @OI	I<I 
I0 *8+0' ' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:)++/+\=,4).4)O4) 	4)n_2=_2!_2 _2H 15*- (G
G$5GG:,^%	DU	>+- $ $'& 
< !# I "	 " "( +/7(7	7 7 	7 7v !5 $ "  49  ( 	$$	2$ 	$
 $N Q7 7*  , , ,
R' R'j
 
 @ @ @?' ?  8 J J ) )I  $  	( +<	  #  	
  
< 7;uBB&3BHLB	BL 5:-1	4 BGOO&,O:>O	O2 BGVV&,V:>V	V OOO %O 
	O Q	  	 Q  B0 Q  >>> > 	>
 > > > > 
>B-n EEE E 	E
 E E E E 
EPJ EJJ&) & EF!)?B	 (  . 5( 5(p @ @ R R:"JH&2:@	: ""&#==
= = 	=
  = = = 
=@'C C"&&& & 	&$ &2:&/(V		 &	2:		## &#2:#$#* ...@.. .$ FJFF)BFF*	B&&   D D '6 '6T  Q	.0(#K(*$)) * 

 
"- 
4A 
HK 
	 
F	(	" *.#
!#&# 
#L( =A	Q	Q9	Q 
	Q#J
JAJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2      ,!)
.2$&$$ $ 	$
 $ $ 
$NHBL'  &2:2:*" ( %	0	: 04$$$$ -$ 3	$ $<$ $*IZ!3B	:&/ '#)* $%
  +?*D*D*FG*F$!*FG  **Y'H	  & . . .
 68 2 7
8 . 
	 /9l O :)# #" )

)
-" 01 -"` D)D   *499  4 42Y5!H"J4
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
  
FRj&2R66 d#  $ 38$./@ 0 0 *:); &=!H
"-L $%
  "
MMM M 	M
 M &M #M`	*"B "	"" " #	" "G Hs   z?