
    3j0                       % S SK Jr  S SKrS SKrS SKJrJr  S SKJr  S SK	r	S SK
rS SKJr  S SKrS SKJr  SSKJrJr  SSKJrJrJrJrJrJrJr  SS	KJrJr  SS
K J!r!  SSK"J#r#J$r$  \(       a  S SK%J&r&J'r'  Sr(Sq)S\*S'   \RV                   " S S5      5       r,\RV                   " S S5      5       r-SS jr.S S jr/\R`                  S!S j5       r1          S"S jr2 " S S\5      r3S#S jr4 " S S5      r5    S$S jr6      S%S jr7g)&    )annotationsN)AnyTYPE_CHECKING)patch)
OrderedSet   )configselect_algorithm)BufferChoiceCallerLayoutMultiTemplateBufferOperationBuffer
StorageBox	TensorBox)KernelInputsMMKernelInputs)SchedulerNode)NullHandlerV)	GeneratorSequencedistributed_autotunedist.ProcessGroup | None_AUTOTUNE_PGc                  6    \ rS rSr% SrSrS\S'   SrS\S'   Srg)	_DistributedAutotuneState'   z9
State used to track autotuning during a graph_context()
r   intautotuned_indexautotuned_local_count N)	__name__
__module____qualname____firstlineno____doc__r    __annotations__r!   __static_attributes__r"       ^/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/_inductor/distributed_autotune.pyr   r   '   s      OS "#3"r*   r   c                  *    \ rS rSr% S\S'   S\S'   Srg)_DistributedAutotuneInfo6   r   indexboollocalr"   N)r#   r$   r%   r&   r(   r)   r"   r*   r+   r-   r-   6   s    JKr*   r-   c                     [         R                  " 5       (       aD  [         R                  " 5       (       a*  [        c  [         R                  R                  SS9q[        $ g )Npt2_distributed_autotune_pg)pg_tag)distis_availableis_initializedr   distributed_c10d_new_group_with_tagr"   r*   r+   get_autotune_pgr:   <   sO    t224400DD4 E L r*   c                t    [         R                  (       d   e[        U 5      n[        U5      n[	        X5        g)z
Finish the distributed autotuning by propagating the autotuning results
between the ranks and then replacing the placeholder with the real Buffer.
N)r	   distributed_max_autotune_gemm_autotune_local_nodes_sync_autotune_remote_nodes)	schedulerautotune_resultschoices_by_indexs      r+   schedulerC   H   s2    
 ////,Y7-.97r*   c               #  &  #    [        [        R                  " SS9[        5      (       a   e[        R                  " [        5       5         Sv   [        R                  " [        5       5        g! [        R                  " [        5       5        f = f7f)zX
Wrapped around processing a graph, sets up figuring out which ranks tune
which shapes.
F)check_poisonedN)
isinstancer   get_distributed_autotune_stater   set_distributed_autotune_stater   r"   r*   r+   graph_contextrI   S   sk      	((>!    $$%>%@A8	((7((7s   ABA. B. BBc                "   [         R                  (       d  g[        5       =n(       d  g[        U5      S::  a  g[        R
                  nUR                  nU=R                  S-  sl        XdR                  5       -  UR                  5       :H  n[        Xg5      [        R                  R                  [        '   U(       a  U=R                  S-  sl        g[        R                  R                   R"                  R%                  ['        XU5      5      $ )z
Used by an op (like `mm`) to determine if the op should be autotuned
locally (returns None) or remotely (returns a placeholder Buffer).
Nr   )r	   r<   r:   lenr   distributed_autotune_stater    sizerankr-   current_nodemeta_DISTRIBUTED_AUTOTUNE_KEYr!   torch	_inductorirr   create_DistributedAutotuneBuffer)namechoicesinputslayoutautotune_pgstater/   r1   s           r+   maybe_autotune_remoter]   d   s     //*,,K,
7|q((E!!E	Q$$&&+*:*:*<<E5M6ANN12 ##q(#??''.."48 r*   c                  h   ^  \ rS rSr% SrS\S'           S	U 4S jjr    S
S jrSS jrSr	U =r
$ )rV      z
A MultiTemplateBuffer which represents a kernel being autotuned on a
different rank. When `schedule` is called this will be replaced by the
"real" buffer.
str_kernel_namec           	     Z   > [         TU ]  UUU R                  / [        0 5      S9  Xl        g )N)choice_timings_fnunfiltered_choicesallowed_prologue_inps)super__init___dummy_choice_timingsr   ra   )selfkernel_namerY   rZ   	__class__s       r+   rg   #_DistributedAutotuneBuffer.__init__   s8     	"88!",R. 	 	
 (r*   c                    [         eN)NotImplementedError)ri   _hint_overrides     r+   rh   0_DistributedAutotuneBuffer._dummy_choice_timings   s
    
 "!r*   c                   SSK Jn  [        R                  " [        R
                  SS5         [        / U R                  Q5      n[        U R                  [        5      (       d   eUR                  U R                  U5      nU" U R                  U/UR                  5       U R                  5      u  pV[        U[        5      (       d   eUsSSS5        $ ! , (       d  f       g= f)z]
Given a _SerializedChoice (autotune results from another rank)
compute the final TensorBox.
r   )autotune_select_algorithmr@   N)r
   rs   r   objectr   graphr   original_inputsrF   rZ   r   
get_choicera   nodesr   )ri   
ser_choicers   kernel_inputschoicebuffer_s          r+   autotune#_DistributedAutotuneBuffer.autotune   s     	@\\!'';5*+BT-A-A+BCMdkk62222**4;;FF1!!##%	IF fi0000 655s   BC
C")ra   )rj   r`   rY   list[Buffer]rZ   r   returnNone)rp   z
int | Noner   zdict[ChoiceCaller, float])ry   _SerializedChoicer   r   )r#   r$   r%   r&   r'   r(   rg   rh   r~   r)   __classcell__)rk   s   @r+   rV   rV      sZ     (( ( 	(
 
( "("	"" r*   rV   c                   [        5       nU(       d   eS/UR                  5       -  n[        R                  R	                  X US9  [        S U 5       5      nS/U-  nSnU HG  nU H>  n[        U[        5      (       d   eXGR                     b   eXtUR                  '   US-  nM@     MI     X5:X  d   SU SU 35       eU$ )zL
Perform the all_gather to collect the autotune results from all the ranks.
N)groupc              3  8   #    U  H  n[        U5      v   M     g 7frn   )rK   ).0xs     r+   	<genexpr>_sync.<locals>.<genexpr>   s     0ZSVVZs   r   r   zcount mismatch:  != )	r:   rM   rR   distributedall_gather_objectsumrF   r   r/   )rA   r[   
all_states
node_countrB   check_countother_resultsr{   s           r+   r>   r>      s    
 "#K; 269I9I9K0KJ	''
K'X0Z00J150CK##Ff&78888#LL1999-3V\\*1K	 $ $ $V(8D&VV$r*   c                  ^    \ rS rSrSrS
S jrSS jr\SS j5       r\SS j5       r	SS jr
Srg	)r      z
This is a serializer for the autotune choice. KernelTemplateChoice can't
be serialized directly (the template and inputs prevent this) so we need to
serialize it by parts and reconstruct later on.
c                    Xl         [        R                  U5      U l        U R	                  UR
                  5      U l        g rn   )r/   r   _template_uid_from_choicetemplate_uid_compute_kwargsdescriptionkwargs)ri   r/   r{   s      r+   rg   _SerializedChoice.__init__   s2    
-GGO**6+=+=>r*   c                &   U R                  5       n0 U R                  EnSU;   aF  UR                  5       S   R                  5       S   n[        R
                  " XTS   5      US   :H  US'   0 nSSKJnJn  U" U5      n	U" X9XaU5      n
U
R                  $ )z-
Deserialize the ChoiceCaller and return it.
BLOCK_Kr   r   EVEN_K)DictKernelTemplateParamsKernelTemplateChoice)
_template_from_uidr   rx   get_sizesympygcdkernel_template_choicer   r   r{   )ri   rZ   rY   templater   kextra_kwargsr   r   paramsktcs              r+   rw   _SerializedChoice.get_choice   s    
 **, DKK
 q!**,Q/A$yy9,=>&BSSF8')	

 *&1"8\6Rzzr*   c                   U (       d  0 $ 0 nU R                  S5       H  nUR                  SS5      u  p4UR                  5       UR                  5       pCUS:X  a  SX'   MB  US:X  a  SX'   MN  UR                  5       (       a  [        U5      X'   Mr  UR	                  S5      (       a  UR                  S5      (       d   eUSS	 X'   M     U$ )
z9
Given a template description turn it into input kwargs.
,=r   TrueTFalseF')splitstripisdigitr   
startswithendswith)r   r   cfgkeyvals        r+   r   !_SerializedChoice._compute_kwargs   s    
 I /1$$S)Cyya(HCyy{CIIKf}"#!#h~~c**s||C/@/@@@!!Bi * r*   c                *   [        U [        R                  5      (       a>  U R                  R                  S:X  a  g[        SU R                  R                  < 35      e[        U [        R                  5      (       a  g[        S[        U 5       35      e)zi
Given a ChoiceCaller figure out which template represents it. This
is reversed by _template_from_uid().
mmz!torch._inductor.kernel.mm.aten_mmzTODO: kernel z%torch._inductor.kernel.mm.mm_templatezTODO: )rF   r
   ExternKernelCallerr{   rW   RuntimeErrorTritonTemplateCallertype)r{   s    r+   r   +_SerializedChoice._template_uid_from_choice  sw     f.AABB}}!!T):"]6==3E3E2H#IJJ 0 E EFF:V~677r*   c                    U R                   R                  S5      n[        5       US      nUSS  H  n[        X#5      nM     U$ )z"
See _template_uid_from_choice().
.r   r   N)r   r   globalsgetattr)ri   partsobjr   s       r+   r   $_SerializedChoice._template_from_uid+  sH     !!'',ia!qrA#/C 
r*   )r/   r   r   N)r/   r   r{   r   r   r   )rZ   r   rY   r   r   zChoiceCaller | None)r   r`   r   zdict[str, int | str | bool])r{   r   r   r`   )r   r   )r#   r$   r%   r&   r'   rg   rw   staticmethodr   r   r   r)   r"   r*   r+   r   r      s>    ?
4  0 8 8$r*   r   c                   / nU R                    H  n[        U[        5      (       d  M  UR                  =nc  M+  [        U[        5      (       a  MB  [        U[
        5      (       d  MY  UR                  =nc  Mj  UR                  =nc  M{  UR                  [        5      nUc  M  UR                  (       d   eUR                  5       u  px[        UR                  U5      n	UR                  U	5        M     [        R                   n
[#        U5      U
R$                  :X  d!   S[#        U5       SU
R$                   S35       eU$ )zh
Go through the nodes in the scheduler and autotune the kernels which
should be autotuned by this rank.
z'incorrect local autotuned nodes found (r   ))rx   rF   r   noderV   r   origin_noderP   getrQ   r1   get_min_choicer   r/   appendr   rL   rK   r!   )r@   rA   r   
inner_noder   rP   info
min_choicer}   r{   r\   s              r+   r=   r=   6  s0    13$..))#J,j"<==*&9::%111K:$$$D-xx12<zzz
 #113
"4::z:'A  D ((E E$?$?? 
1#6F2G1HUMhMhLiijk? r*   c                .   [        U R                  5       H  u  p#[        U[        5      (       d  M  [        UR                  =n[
        5      (       d  M?  UR                  c   eUR                  R                  [           nUR                  XR                     5      nUR                  n[        U[        5      (       d   eUR                  n[        U[        5      (       d   eUR                  UR                  :X  d   eU R                  XX#5        M     g)zc
Go through the nodes in the scheduler and autotune the nodes that were
autotuned on remote ranks.
N)	enumeraterx   rF   r   r   rV   r   rP   rQ   r~   r/   datar   r   rZ   _replace_node)	r@   rB   ir   	dist_noder   out_tensorboxout_storage
out_buffers	            r+   r?   r?   i  s     Y__-dM**z))#Y&@0
 0
 ((444((--.GHD%../?

/KLM',,Kk:6666$))Jj/::::$$	(8(8888##J1C .r*   )r   r   )r@   #torch._inductor.scheduler.Schedulerr   r   )r   zGenerator[None, None, None])
rW   r`   rX   zlist[ChoiceCaller]rY   r   rZ   r   r   zTensorBox | None)rA   list[_SerializedChoice]r   Sequence[_SerializedChoice])r@   r   r   r   )r@   r   rB   r   r   r   )8
__future__r   
contextlibdataclassestypingr   r   unittest.mockr   r   torch._loggingrR   torch.distributedr   r5   torch.fxtorch.utils._ordered_setr    r	   r
   rT   r   r   r   r   r   r   r   rz   r   r   r@   r   virtualizedr   r   collections.abcr   r   rQ   r   r(   	dataclassr   r-   r:   rC   contextmanagerrI   r]   rV   r>   r   r=   r?   r"   r*   r+   <module>r      sD   "   %       / &   8 $ ' 3 3 )-& - # # #   
	8 8 8 
*4@JPB4!4 4p8Z Zz0200fD2D1D 
Dr*   