
    3jF              	          S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKrS SK	r	S SK
Js  Jr  S SKJr  S SKJr  SSKJr  SSKJrJrJr  SS	KJr  \R2                  " \5      r " S
 S\5      r " S S\5      r\ R<                  S\4S j5       rS\ S\4S jr!S\RD                  S\4S jr#S6S\	RH                  S\%S\%4S jjr&S6S\	RH                  S\%S\%4S jjr'S\RD                  S\%4S jr(S\RD                  S\%4S jr) " S S\5      r* " S S\5      r+ " S S\5      r,S /S!//r-S"/S"//S#/S#//S$/S%///r./ S&Q/ S'Q/ S(Q/ S)Q/r/S\0S-  4S* jr1S+\%S,\%S-\S\04S. jr2S\RD                  S\04S/ jr3S0\	Rh                  Rj                  S\%4S1 jr6S0\	Rh                  Rj                  S\%4S2 jr7  S7S0\	Rh                  Rj                  S3\%S-  S4\8S\04S5 jjr9g)8    N)IntEnum)Any)optimization_hint)normalize_function   )ir)get_dtype_sizesnode_args_kwargssympy_product)Vc                   ,    \ rS rSrSrSrSrSrSrSr	Sr
g	)
	NCCL_COLL   r   r                N)__name__
__module____qualname____firstlineno__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER
ALL_TO_ALLUNSUPPORTEDP2P__static_attributes__r       W/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/_inductor/comm_analysis.pyr   r      s     JJNJK
Cr    r   c                   $    \ rS rSrSrSrSrSrSrg)NVIDIA_GPU_TYPE    r   r   r   r   r   N)	r   r   r   r   VOLTAAMPEREHOPPER	BLACKWELLr   r   r    r!   r#   r#       s    EFFIr    r#   returnc                    ^  [         R                  R                  R                  [         R                  R                  R                  5      =(       d    Sm ST ;   a  [
        R                  $ ST ;   a  [
        R                  $ ST ;   a  [
        R                  $ [        U 4S jS 5       5      (       a  [
        R                  $ [
        R                  $ )N V100A100H100c              3   ,   >#    U  H	  oT;   v   M     g 7fNr   ).0gpugpu_infos     r!   	<genexpr>get_gpu_type.<locals>.<genexpr>0   s     A(@H_(@   )B100B200B300)torchutilscollect_envget_gpu_inforunr#   r%   r&   r'   anyr(   )r3   s   @r!   get_gpu_typer@   '   s    {{&&33EKK4K4K4O4OPVTVH$$$	8	%%%	8	%%%	A(@A	A	A((( %%%r    kernel_namec                 Z  ^  T c   eST ;   a  [         R                  $ ST ;   a  [         R                  $ ST ;   a  [         R                  $ [	        U 4S jS 5       5      (       a  [         R
                  $ [	        U 4S jS 5       5      (       a  [         R                  $ [         R                  $ )N
all_reduce
all_gatherreduce_scatterc              3   ,   >#    U  H	  oT;   v   M     g 7fr0   r   r1   commrA   s     r!   r4   7get_collective_type_from_kernel_name.<locals>.<genexpr>?   s     H-GT[ -Gr6   )
all_to_allalltoallc              3   ,   >#    U  H	  oT;   v   M     g 7fr0   r   rG   s     r!   r4   rI   A   s     M-LT[ -Lr6   )isendirecv	batch_p2p)r   r   r   r   r?   r   r   r   )rA   s   `r!   $get_collective_type_from_kernel_namerP   7   s    """{"###		$###	[	('''	H-GH	H	H###	M-LM	M	M}}$$$r    nodec                     [        U [        R                  5      (       d  [        SU  35      eU R                  nUc   e[        U5      $ )Nz!node is not a collective kernel: )
isinstancer   _CollectiveKernel
ValueErrorpython_kernel_namerP   )rQ   names     r!   get_collective_typerX   G   sJ    dB0011<TFCDD""D/55r    sizefallbackc                     [        U 5      n[        U[        R                  5      (       a  [	        U5      $ [
        R                  R                  R                  X!S9$ )NrZ   )	r   rS   sympyIntegerintr   graphsizevarsr   )rY   rZ   numels      r!   get_ir_node_size_numelrc   P   sE    $E%''5z77--e-GGr    c                 d    [         R                  " [        R                  U S5      n[	        X!S9nU$ )Nr   r\   )	functoolsreduceoperatormulr   )rY   rZ   rb   results       r!   get_fx_node_size_numelrj   W   s)    X\\43Eu8FMr    c                     SnU R                    HF  n[        UR                  R                  5      nX[	        UR                  R
                  5      -  -  nMH     U$ )Nr   )inputsrc   layoutrY   r	   dtype)rQ   sz_bytesinprb   s       r!   get_collective_input_size_bytesrq   ]   sJ    H{{&szz7N3::+;+;<<<  Or    c                     [        U [        R                  5      (       a:  [        U [        R                  5      (       d  SSKJn  U" U R                  S   5      $ [        SU  35      e)Nr   _get_group_size_by_namezUnsupported collective type: )rS   r   rT   _WaitKernel"torch.distributed.distributed_c10drt   constant_args	TypeError)rQ   rt   s     r!   get_collective_group_sizerz   e   sQ    $,,--jr~~6V6VN&t'9'9"'=>>7v>??r    c                        \ rS rSrSrSrSrSrg)NCCL_HWs   r   r   r   r   N)r   r   r   r   NVLINKPCINETr   r   r    r!   r|   r|   s   s    F
C
Cr    r|   c                       \ rS rSrSrSrSrg)	NCCL_ALGOy   r   r   r   N)r   r   r   r   TREERINGr   r   r    r!   r   r   y   s    DDr    r   c                       \ rS rSrSrSrg)
NCCL_PROTO~   r   r   N)r   r   r   r   LLr   r   r    r!   r   r   ~   s	     
Br    r   g333333@gffffff@g333333?      ?g      @g@)     C@r   gffffff4@)gU@g     6@g      3@)g     a@g     F@g     A@)g     q@g     V@g     Q@c                 R   U R                   nUc   e[        USS5      nUR                  S   nSSKJn  U" U5      n[
        R                  R                  U5      n[
        R                  " SU 35      n[        U5      n[        U 5      u  pSU;   a  U	SS  U	S   -   n	[
        R                  R                  XWS	9 nU" U	0 U
D6n[
        R                  R                  R                  R                  U5        S S S 5        WR                   nUS:  a  g US
-  nU$ ! , (       d  f       N(= f)NrV   r+   ru   r   )_resolve_process_groupzcuda:all_gather_into_tensor_outr   groupdevice     @@)rQ   getattrrx   rw   r   r:   distributedget_rankr   evalr
   _time_estimatorops_c10d_functionalwait_tensordefaultestimated_time)snodekernelpy_kernel_namepg_namer   pgrankr   fnargskwargstime_estimatorwest_time_usest_time_mss                  r!   /estimate_nccl_collective_runtime_nccl_estimatorr      s"   ZZFV%92>N""2&GI		(B!!**2.D \\E$.)F	n	B$U+LD $~5ABx$q'!				*	*	*	C~		""..66q9 
D !//K Q#K 
D	Cs   :<D
D&tensor_storage_size_bytes
group_sizecollc                 @   U S-  S-  S-  nSn[         R                  " X-  5      nUnUS::  a  gU[        R                  :X  a  g[        R
                  n[        R                  n[        R                  R                  R                  n	[        R                  R                  R                  n
[        5       nUS::  a  US-
  OSnUS:X  a  UOSn[        U   U   nUS:X  a  U	OU
nSnUU-  n[        UUUS:  d  U[        R                   :X  a  SOS-  5      nU[        R                   :X  a	  SUS-
  -  nO]U[        R"                  :X  a	  SUS-
  -  nO@U[        R$                  [        R&                  4;   a  US-
  nOU[        R(                  :X  a  SnSU-  W-  nUU-  nUS	-  n[*        R,                  nU[        R                   :X  a  US:  a  SU-  nOZSnOWU[        R$                  [        R&                  [        R"                  4;   a  US-
  nOU[        R(                  :X  a
  US:  a  SOSn[.        U   U   n[0        U   U   U   n[0        [*        R2                     U   U   nS
nUS:  a  Sn[5        UU5      nUUW-
  U-  UU-  -   -  nUS-  nUU-  nUU-   nUS-  nU$ )a  
Returns estimated NCCL collective runtime in milliseconds (ms).

The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
We aim to estimate the runtime as accurately as possible.

Assumptions:
- only ring algorithm (NCCL_ALGO_RING) is used
- only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
- 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
- collective is one of: allreduce, reducescatter, allgather
i      r   r   r   g      ?gUUUUUU?r   g    eAg        r   g    .A)mathceilr   r   r   r   r   r   r:   	_inductorconfigintra_node_bwinter_node_bwr@   llMaxBwsminr   r   r   r   r   r|   r~   baseLathwLatr   max) r   r   r   tensor_storage_size_GBnum_gpus_per_nodenNodesnRanks	nccl_algo
nccl_protobwIntrabwIntercompCapIndexindex2index1llMaxBwbw	nChannelsbusBwnstepsratio	bandwidthbandwidth_GB_per_nsintraHwnInterStepslatencyintraLatinterLatnetOverhead
latency_nstransport_nsnsmss                                    r!   %estimate_nccl_collective_runtime_implr      s     7=DtK YYz56FF{y$$$ IJ
 oo$$22Goo$$22G>L!Q;VaZAF#q[\aFvv&G aKWBINE !ty/C/C'C9)	UE y###fqj!	%%	%fqj!	)**I,@,@A	A!		 6\V#EI#c/ nnGy###A:f*KK	)**I,@,@)BVBVW	Wqj		!A:a1 i ,GW~i(4HW[[!),Z8H Kz8[)H$0;3IIIG3J *,??L	
	"B	cBIr    c                 \    [        U 5      n[        U 5      n[        U 5      n[        XU5      $ )  
Returns estimated NCCL collective runtime in nanoseconds (ms).

The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
We aim to estimate the runtime as accurately as possible.

Assumptions:
- only ring algorithm (NCCL_ALGO_RING) is used
- only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
- 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
- collective is one of: allreduce, reducescatter, allgather
)rq   rz   rX   r   )rQ   r   r   r   s       r!    estimate_nccl_collective_runtimer   ^  s6     !@ E*40Jt$D0!t r    fx_nodec                 Z  ^^ SmU R                   U R                  p![        U5      nUR                  SS5        S[        R
                  S[        4S jmS[        R                  R                  4UU4S jjn[        R                  " [        R                  R                  UX45        U R                  R                  SS5      nTb  Uc  g	[        U[        [        45      (       a  [!        U4S
 jU 5       5      nTU-   $ [        U[        R
                  5      (       a  T" U5      nTU-   $ g	)zSEstimate the size of a collective operation in bytes, including inputs and outputs.Nouttr)   c                 `    [        U R                  5       5      [        U R                  5      -  $ r0   )rj   rY   r	   rn   )r   s    r!   tensor_bytes1estimate_fx_collective_size.<locals>.tensor_bytes}  s!    %affh/.2IIIr    rp   c                    > U R                   R                  SS 5      n[        U[        R                  5      (       d  g Tc  SmTT" U5      -  mg )Nvalr   )metagetrS   r:   Tensor)rp   inp_valinput_bytesr   s     r!   add_inp_bytes2estimate_fx_collective_size.<locals>.add_inp_bytes  sG    ((,,ud+'5<<00 K|G,,r    r   r   c              3   v   >#    U  H.  n[        U[        R                  5      (       d  M$  T" U5      v   M0     g 7fr0   )rS   r:   r   )r1   r   r   s     r!   r4   .estimate_fx_collective_size.<locals>.<genexpr>  s)      
%/:a3NOLOOZs   #99)r   r   dictpopr:   r   r_   fxNodepytreetree_map_onlyr   r   rS   listtuplesum)r   r   r   r   
output_valoutput_bytesr   r   s         @@r!   estimate_fx_collective_sizer   s  s   K<<&&\F JJudJ J J-588== - - 	 !!%.Jj0 *tUm,, 
%/
 
 %% 
J	-	-#J/ %% r    c                 L    SSK Jn  [        U 5      nU" U 5      (       d  U$ US-  $ )zEstimate the memory footprint of a collective operation in bytes.

This returns the total bytes that need to be live concurrently in memory.
For all_reduce, we divide by 2 since it can be done in-place.
r   )is_all_reduce_tensorr   )#torch._inductor.fx_passes.bucketingr   r   )r   is_all_reducerY   s      r!   'estimate_fx_collective_memory_footprintr     s,     'w/D$W--4<419<r    override_sizeuse_nccl_estimatorc                   ^ ^^
^^ SSK Jn  T R                  [        R                  R
                  R                  R                  L a  SnTc  [        T 5      nOTn[        T R                  [        5      (       a   e[        T R                  T R                  T R                  SS9nUc   eUu  m
mTS   mU" T5      n[        T R                  [        R                  R                  5      (       d   e[!        T R                  R#                  5       5      nS[$        S-  4U
U UUU4S	 jjnU(       a  U" 5       n	U	b  U	$ ['        XFU5      $ )
r   r   rs   FNT)r   r   normalize_to_only_use_kwargs
group_namer)   c                  Z  >^^ SSK Jn Jn  U " T5      n[        R                  R
                  R                  U5      UR                  :X  a  g [        R                  " S5      n UR                  U5      nUR                  (       d  g [        R                  " TT45      u  pVS[        R                  4U4S jjmS[        S[        4UU4S jjmU Vs/ s H  nT" U5      PM     nn[        R                   " XV5      u  pTR"                  n
[%        U
[        R&                  R(                  5      (       d   e[        R                  R+                  X#S9 nU
" U0 U	D6n[%        U[,        [.        45      (       a=  U H6  n[        R0                  R2                  R4                  R7                  U5        M8     O3[        R0                  R2                  R4                  R7                  U5        S S S 5        WR8                  nUS:  a  g US	-  nU$ ! [         a     g f = fs  snf ! , (       d  f       N== f)
Nr   )r   Backendcudar)   c                 <   > [         R                  " Tc  U OT/UUS9$ )N)rn   r   )r:   empty)rY   rn   r   r   s      r!   _tensorVestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>._tensor  s&    ;;%-M? r    ec                 .  > [        U [        R                  R                  5      (       a  T" U R                  S   5      $ [        U [        R
                  5      (       a6  T" [        U R                  5       5      /U R                  U R                  5      $ U $ )Nr   )
rS   r:   r   r   r   r   rj   rY   rn   r   )r  r	  to_real_tensors    r!   r  ]estimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>.to_real_tensor  sg    !UXX]]++%affUm44!U\\** 6qvvx @A177AHHUUHr    r   r   )rw   r   r  r:   r   distributed_c10dget_backendFAKEr   _get_backendRuntimeErrorsupports_time_estimater   tree_flattenr   r   tree_unflattentargetrS   _ops
OpOverloadr   r   r   r   r   r   r   r   )r   r  r   r   backend	flat_argsflat_args_pytree_speca	real_argsreal_kwargsr   r   r   r   r   r   r	  r  r   r   r  r   r   s                   @@r!   _nccl_estimateEestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate  s   V#J/--99"=Mf%	oof-G --+1+>+>f~+N(		ELL 		c 	c 	 	 1::	1^A&		:!'!6!6y!X	^^"ejj334444.. / 
I--A!dE]++AII..::BB1E  		**66>>qA
 %33 ?!C'U  		* ;

 
s%   #H HBH
HH
H*)rw   rt   r  r:   r   r   all_to_all_singler   r   rS   strr   r   r   r  r  rP   rW   floatr   )r   r   r   rt   r   opt_args_kwargsr   r   r   r   r   r  r   s   ``        @@@r!   -estimate_nccl_collective_runtime_from_fx_noder&    s'   " K~~33EEMMM #$?$H!$1!'..#....(\\~~%)	O &&&"LD&%J(4Jgnnejj&;&;<<<</0C0C0EFD6EDL 6 6p $&"0!t r    )i   )NT):re   loggingr   rg   enumr   typingr   r]   r:   torch.utils._pytreer;   _pytreer   %torch.fx.experimental.symbolic_shapesr   torch.fx.operator_schemasr   r+   r   r	   r
   r   virtualizedr   	getLoggerr   logr   r#   	lru_cacher@   r#  rP   IRNoderX   Sizer_   rc   rj   rq   rz   r|   r   r   r   r   r   r$  r   r   r   r   r   r   r   boolr&  r   r    r!   <module>r5     s{           $ $ C 8  C C  ! g  &o & &%c %i % 6bii 6I 6H Hs HS H s S "))  @BII @# @g  
  	
 		  
	 
	 
		,)8edl Bn"n03n;Dn
nl299  *,& ,&3 ,&^=UXX]] =s =  !%#kXX]]k:k k 	kr    