
    3j:                        % S r SSKJr  SSKJr  SSKJr  SSKJr  SSK	r	SSK
JrJr   SSKJr  \" \5      rSq\S-  \S	'   S
q\\S'   S*S jrS*S jrS\4S jrS\S\S-  4S jrS\S\4S jrS\S\4S jr\" \5      r\\\\   4   \S'   Sq \!\   S-  \S'   S\!\   4S jr"/ r#\\$\\\4      \S'   Sq%\\S'   Sq&\S-  \S'   \S\'\(\'\4   -  4S j5       r)S*S jr*S\	RV                  RX                  SS4S  jr-S\(\\\   4   4S! jr.S*S" jr/S#q0\\S$'   0 r1\(\\4   \S%'   S\	RV                  Rd                  S\4S& jr3S'\'S\4S( jr4\S\	RV                  Rd                  S\'\(\'\4   -  4S) j5       r5g! \ a    Sr GNNf = f)+a  Annotate CUDA graph kernel nodes during capture.

During CUDA graph capture, ``mark_kernels`` uses ``cudaGraphGetNodes``
to count nodes before and after the wrapped region.  Nodes at indices
``[before, after)`` are the ones added within the scope.  Each kernel
or memcpy node found is annotated by its ``toolsId`` so it can later
be matched to profiler trace events.

The annotations can be pickled and later merged into a Chrome profiler
trace using ``torch.cuda._annotate_cuda_graph_trace``.

Requires ``cuda.bindings`` package and a CUDA driver that supports
``cudaGraphNodeGetToolsId`` (CUDA >= 13.1 or appropriate cuda-compat).
When unavailable, ``mark_kernels`` silently becomes a no-op.

Usage during capture::

    from torch.cuda._graph_annotations import (
        enable_annotations,
        mark_kernels,
        resolve_pending_annotations,
        remap_to_exec_graph,
    )

    enable_annotations()

    with torch.cuda.graph(graph):
        with mark_kernels("phase_A"):
            y = workload_a(x)
        with mark_kernels("phase_B"):
            z = workload_b(y)
        resolve_pending_annotations()

    remap_to_exec_graph(graph)
    )defaultdict)contextmanager)	getLogger)AnyN)_check_cuda_bindings_HAS_CUDA_BINDINGS)runtime_tools_id_availableF_annotations_enabledreturnc                      Sq g)z#Enable kernel annotation recording.TNr        W/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/cuda/_graph_annotations.pyenable_annotationsr   A   s
      r   c                      Sq g)z$Disable kernel annotation recording.FNr   r   r   r   disable_annotationsr   G   s
     !r   c                  \    [         (       d  g[        SL a  g[        [        S5      (       d  gg)zBReturn True if we already know cudaGraphNodeGetToolsId is missing.TFcudaGraphNodeGetToolsId)r   r
   hasattr_cuda_runtimer   r   r   _is_tools_id_unavailabler   M   s*    e#=";<<r   nodec                     [         c$   [        [        R                  " U 5      5      nSq U$ [        [        R                  " U 5      5      $ ! [         a    Sq [
        R                  S5         gf = f)z<Return the toolsId for a graph node, or None if unavailable.NFzUcudaGraphNodeGetToolsId not available; CUDA graph kernel annotations will be disabledT)r
   r   r   r   	Exceptionloggerinfo)r   tools_ids     r   _get_tools_idr    X   s     "	+55H #--	
   	"'KKA 	s   A !A/.A/streamc                     [        [        R                  " U 5      5      u  pp4pVU[        R                  R                  :w  a  gU$ )z8Return the graph handle for the active capture, or None.N)r   r   cudaStreamGetCaptureInfocudaStreamCaptureStatuscudaStreamCaptureStatusActive)r!   status_idgraph_deps
_edge_data	_num_depss          r   _get_capture_graphr,   r   sH    7K..	
84Fz 	00NN	O Lr   r(   c                 F    [        [        R                  " U SS95      u  pU$ )z2Return the number of nodes currently in the graph.r   numNodes)r   r   cudaGraphGetNodes)r(   _nums      r   _get_node_countr3      s(    !''A	
FA
 Jr   _kernel_annotations_ANNOTATABLE_TYPESc                      [         c4  [        R                  R                  [        R                  R                  1q [         $ N)r5   r   cudaGraphNodeTypecudaGraphNodeTypeKernelcudaGraphNodeTypeMemcpyr   r   r   _get_annotatable_typesr;      s6    !++CC++CC
 r   _pending_scopes_capture_graph_last_capture_graph_id
annotationc              #     #    [         (       a  [        5       (       a  Sv   g[        U [        5      (       a  SU 0n [        R
                  " [        R                  R                  5       R                  S9n[        U5      nUc  Sv   gUq[        U5      nSv   [        U5      nXC:  a  [        R                  XU45        gg7f)a  Context manager that records node index ranges for later annotation.

During capture, calls ``cudaGraphGetNodes`` to count graph nodes before
and after the scope.  Nodes at indices ``[before, after)`` were added
inside the scope.  After capture, ``resolve_pending_annotations``
enumerates all nodes and annotates kernel/memcpy nodes in those ranges.

Must be called inside an active ``torch.cuda.graph()`` capture.  If the
current stream is not capturing, or if ``cudaGraphNodeGetToolsId`` is not
available, the context manager is a no-op.

Args:
    annotation: Arbitrary object appended to the annotation list for
        every kernel/memcpy node whose index falls within this scope.
Nstr
init_value)r   r   
isinstancerA   r   cudaStream_ttorchcudacurrent_streamcuda_streamr,   r=   r3   r<   append)r?   r!   r(   start_count	end_counts        r   mark_kernelsrM      s     "  #;#=#=*c""Z(
''::,,.::F v&E} N!%(K	&I
CD s   C Cc                     [         (       d  Sqg[        R                  " [        R
                  R                  5       R                  S9n [        U 5      nUc  [        nUc*  [        R                  S5        [         R                  5         g [        U5      nUS:X  a.  [         R                  5         Sq [         R                  5         Sqg[        [        R                  " XS95      u  p2US:  a  [        US   5      nU(       a  US-	  OSq[#        5       n[%        S ['        [         5       5       S S	9nSn/ n[)        U5       GH  n	U(       a1  US
   S   U	::  a%  UR+                  5         U(       a  US
   S   U	::  a  M%  U[-        U5      :  aJ  Xg   S   U	::  a?  Xg   u  ppX:  a  UR/                  X45        US-  nU[-        U5      :  a  Xg   S   U	::  a  M?  U(       d  M  X9   n[        [        R0                  " U5      5      nX;  a  M  [        U5      nUcC  [        R                  S5        [         R                  5         Sq  [         R                  5         Sqg[-        U5      S:X  a!  [2        U   R/                  US   S   5        GMI  0 n[5        U5       H[  u  nn
[7        U
[8        5      (       a.  U
R;                  5        H  u  nnUR=                  UU5        M     MI  UR=                  SU
5        M]     [2        U   R/                  U5        GM     [         R                  5         Sqg! [>         a    [        RA                  S5         N8f = f! [         R                  5         Sqf = f)zResolve pending scope index ranges into kernel annotations.

Enumerates all graph nodes and annotates kernel/memcpy nodes whose
indices fall within recorded scope ranges. Must be called while still
inside the ``torch.cuda.graph()`` capture context.
NrB   z6resolve_pending_annotations: no graph handle availabler   r.       c              3   6   #    U  H  u  nu  p#nX#XA4v   M     g 7fr7   r   ).0iannstartends        r   	<genexpr>.resolve_pending_annotations.<locals>.<genexpr>  s&      ,F(A(C S$,Fs   c                 "    U S   U S   * U S   * 4$ )N         r   )ss    r   <lambda>-resolve_pending_annotations.<locals>.<lambda>  s    1Q4!A$1.r   )keyrY   z:resolve_pending_annotations: toolsId unavailable, abortingnamez"resolve_pending_annotations failed)!r<   r=   r   rE   rF   rG   rH   rI   r,   r   warningclearr3   r   r0   r    r>   r;   sorted	enumeraterangepoplenrJ   cudaGraphNodeGetTyper4   reversedrD   dictitems
setdefaultr   	exception)r!   r(   r2   nodes	first_tidannotatablesorted_scopes	scope_ptractive_stackrR   rS   
_start_idxend_idx_idxr   	node_typer   mergedr1   akavs                        r   resolve_pending_annotationsr|      s;    ? ''::,,.::F v&E}}OPZe$!8!!#!Nh 	g *++

 7%eAh/I:Ci2o",. ,5o,F /
 	.0sA<#3A#6!#;  " <#3A#6!#; c-00]5Ma5PTU5U1>1I.; ''7Q		 c-00]5Ma5PTU5U  8D,22I
 +$T*HP  %%'!%* 	) < A%#H-44\!_Q5GH *,&|4FAs!#t,,&)iikFB"--b"5 '2 ))&#6 5 $H-44V<a h 		  ?=>? 	s@   'L, 
B3L, ?AL, A-L, B9L, ,MM MM M)torch_cuda_graphc                    [         (       d  g[        R                  " U R                  5       S9n[	        [        R
                  " U5      5      n[        n0 n[         R                  5        HL  u  pVUS-	  nUb  Xs:w  a  XdU'   M  US-  nUS-  U-  n	X;   a  XI   R                  U5        M?  [        U5      XI'   MN     [         R                  5         [         R                  U5        g)aH  Remap annotation keys from capture graph ID to exec graph ID.

During capture, toolsId encodes the capture graph's ID in the upper
32 bits. After instantiation, the profiler uses the exec graph's ID.
This function rewrites the keys so annotations match the trace.

Must be called after the ``torch.cuda.graph()`` context exits.
NrB   rO   l    )r4   r   cudaGraphExec_traw_cuda_graph_execr   cudaGraphExecGetIdr>   rl   extendlistrc   update)
r}   exec_handleexec_graph_idcapture_graph_idremappedr   ann_listgraph_idnode_idnew_tools_ids
             r   remap_to_exec_graphr   I  s     //#779K )((	
M .%'H1779r>'H,H!)XZ'%+w6#"))(3%)(^H" : x(r   c                      [         $ )zBReturn the current kernel annotation map (toolsId -> annotations).)r4   r   r   r   get_kernel_annotationsr   t  s    r   c                  X    [         R                  5         [        R                  5         Sqg)z9Clear all recorded kernel annotations and pending scopes.N)r4   rc   r<   r=   r   r   r   clear_kernel_annotationsr   y  s!     Nr   <   _stream_id_counter_stream_id_mapc                 l    U R                   nU[        ;  a  [        [        U'   [        S-  q[        U   $ )z;Return a small, stable stream ID for the given CUDA stream.rY   )rI   r   r   )r!   r_   s     r   _get_stream_idr     s7     

C
. 0sa#r   pg_keyc                 T    U [         ;  a  [        [         U '   [        S-  q[         U    $ )z:Return a unique stream ID for the given process group key.rY   )r   r   )r   s    r   get_stream_for_pgr     s,     ^#!3va&!!r   c              #     #    [         (       d-  [        R                  R                  U 5         Sv   SSS5        gU R                  [        R                  R                  5       R                  :X  a  [        U5         Sv   SSS5        g[        U[        5      (       a  SU0n[        U[        5      (       a  [        U 5      US'   [        R                  R                  U 5         [        U5         Sv   SSS5        SSS5        g! , (       d  f       g= f! , (       d  f       g= f! , (       d  f       N9= f! , (       d  f       g= f7f)u@  Switch to stream, inject its ID into annotation, and mark kernels.

If *stream* is already the current stream, no stream switch or stream ID
injection happens — the kernels stay on whatever stream is active (which
keeps the trace faithful when e.g. FSDP uses the current stream for
copy-in instead of a separate one).
NrA   r!   )r   rF   rG   r!   rI   rH   rM   rD   rA   rk   r   )r!   r?   s     r   mark_streamr     s       ZZv& 'UZZ668DDD*% &% j#&&,Jj$''#1&#9Jx ZZv&j) * '& '&%% *) '&se   +EDA
E<DA$E%D:1D)6D:>	E
DE
D&"E)
D7	3D::
EE)r   N)6__doc__collectionsr   
contextlibr   loggingr   typingr   rF   torch.cuda._utilsr   r   cuda.bindingsr	   r   ImportError__name__r   r
   bool__annotations__r   r   r   r   intr    r,   r3   r   r4   r5   setr;   r<   tupler=   r>   rA   rk   rM   r|   rG   	CUDAGraphr   r   r   r   r   Streamr   r   r   r   r   r   <module>r      s1  "H $ %    F 
8	
 $( TD[ ' # d " !$  d
 4s s 3 3  4?t3D [d3i0 D '+ CHtO *C  /1eCcM*+ 0   &* d
 ) )ES4S>1 )E )EXrj()%***>*> ()4 ()VS$s)^ 4 
  C !#S#X #5::,,  "c "c " 

)) sT#s(^7K  U  Ms   E2 2E>=E>