
    3jG                    "   S SK Jr  S SKrS SKJr  S SKJr  S SKJrJ	r	J
r
  S SKrS SKJrJr  S SKJrJr  S SKJrJr  S S	KJr  S
SKJr  \	(       a  S SKJrJr  S SKJr  \
" SSS9r\R>                  RA                  \!S5      r"\R>                  RA                  \!S5      r#\$\%\RL                  -  S-     r'\\$\   /\'4   r( " S S5      r)\RT                  " SSS9 " S S5      5       r+\RT                  " SSS9 " S S5      5       r,\RT                  " SSS9 " S S5      5       r-    S7S jr.S8S jr/S9S jr0S:S  jr1S;S! jr2      S<S" jr3        S=S# jr4S>S$ jr5    S?S% jr6    S?S& jr7S@S' jr8\RT                   " S( S)5      5       r9          SAS* jr:SBS+ jr; " S, S-\5      r<            SCS. jr=      SDS/ jr>\RT                  " SS09 " S1 S25      5       r?\RT                  " SS09 " S3 S45      5       r@      SES5 jrASFS6 jrBg)G    )annotationsN)Callable)Enum)AnyTYPE_CHECKINGTypeVar)countersget_metrics_context)GraphPartitionMap	InputType)get_plain_tensorsis_fake)
OrderedSet   )is_using_cudagraph_partition)SequenceSet)
OutputCode_OCr   )bound
cudagraphscudagraph_static_inputsc                  V    \ rS rSrSr                SS jrS	S jrS
S jrSrg)CUDAGraphPolicy$   a*  Pluggable policy controlling CUDA graph wrapping in Inductor's post_compile.

Override methods to customize:
  - HOW compiled functions are cudagraph-wrapped (cudagraphify)
  - WHETHER inner CompiledFxGraphs should be wrapped (should_wrap)
  - OUTER wrapping of compound outputs like RegionalOutputCode (wrap_output)

Set via ``torch._inductor.config.cudagraph_policy``.  When ``None``
(the default), the existing built-in behaviour is used unchanged.

Example usage::

    class MyCUDAGraphPolicy(CUDAGraphPolicy):
        def cudagraphify(self, model, example_inputs, static_input_idxs, **kwargs):
            return my_custom_wrapper(model, example_inputs, static_input_idxs)


    with torch._inductor.config.patch("cudagraph_policy", MyCUDAGraphPolicy()):
        compiled_fn = deserialize_artifacts(...)
c               *    SSK Jn  U" UU4UUUS.UD6$ )al  Wrap a single compiled callable with CUDA graph capture/replay.

Called by ``cudagraph_post_compile`` for each ``CompiledFxGraph``.
The default delegates to ``compile_fx.cudagraphify`` (cudagraph_trees).

``example_inputs`` are the example inputs at post_compile time.
The default implementation does not forward them because
``compile_fx.cudagraphify`` defers graph recording to the first
real call via an inner closure.  Subclasses that need the
example inputs for warmup or static-input detection may use them.

When ``config.graph_partition=True``, setting a CUDAGraphPolicy
bypasses ``cudagraph_partition_post_compile`` (which wraps each
partition individually) and routes through ``cudagraph_post_compile``
instead, so this method wraps the *entire* callable, not individual
partitions.  Subclasses that need per-partition control should
handle partitioning internally.
r   )cudagraphify)device_indexis_backwardis_inference)torch._inductor.compile_fxr   )	selfmodelexample_inputsstatic_input_idxsr   r   r    kwargsr   s	            Y/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/_inductor/cudagraph_utils.pyr   CUDAGraphPolicy.cudagraphify:   s4    : 	<
 &#%
 
 	
    c                    g)a)  Whether to apply cudagraph wrapping to this CompiledFxGraph.

Called for each inner ``CompiledFxGraph`` during ``post_compile``.
Return ``False`` to skip wrapping (e.g. when wrapping at the outer
level via ``wrap_output`` instead).

Default: ``True`` (wrap everything, same as current behaviour).
T )r"   compiled_graphs     r'   should_wrapCUDAGraphPolicy.should_wrapb   s     r)   c                    U$ )a  Optional outer-level wrapping after inner post_compile completes.

Called by ``_compile_fx_inner``, ``BundledOutputCodeLoadable.post_compile``,
and ``FxGraphCacheLoadable.post_compile`` on the ``OutputCode`` returned
from ``post_compile``.  Subclasses that only want to wrap specific
output types should check ``isinstance`` and return the input
unchanged for types they don't handle.

Default: identity (no outer wrapping).
r+   )r"   output_codes     r'   wrap_outputCUDAGraphPolicy.wrap_outputm   s
     r)   r+   N)r#   Callable[..., Any]r$   zSequence[InputType]r%   Sequence[int]r   intr   boolr    r6   r&   r   returnr3   )r,   r   r7   r6   )r0   r   r7   r   )	__name__
__module____qualname____firstlineno____doc__r   r-   r1   __static_attributes__r+   r)   r'   r   r   $   se    *&
!&
 ,&
 )	&
 &
 &
 &
 &
 
&
P	r)   r   T)frozenslotsc                  $    \ rS rSr% SrS\S'   Srg)
FunctionID{   z9Unique counter of a function wrapped in cudagraphify_implr5   idr+   Nr8   r9   r:   r;   r<   __annotations__r=   r+   r)   r'   rA   rA   {   s
    ?Gr)   rA   c                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
rg)PlaceholderInfo   z
A serializable version of torch.fx.Node that contains information
pertinent to placeholder stack traces. We use these in logging and error messages
related to cudagraphs, and will cache these results.
strname
str | Nonestack_tracelist[PlaceholderInfo]usersmutating_use_stack_tracer+   NrD   r+   r)   r'   rG   rG      s      I  ((r)   rG   c                  V    \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   S\S'   S\S'   Srg)WrappedFunction   z
Represents a function that you want to record for CUDA graph replay,
with a little more metadata so we can identify if we have an applicable
CUDA graph in our CUDA graph tree for it.
r3   r#   r4   r%   rA   rC   ztuple[torch.Tensor, ...]	constantsSequence[PlaceholderInfo]placeholdersmutated_input_idxsr+   NrD   r+   r)   r'   rQ   rQ      s,     $$N''++%%r)   rQ   c                   [        U R                  5      S:X  a8  [        [        U R                  5      5      R                  R                  SS 5      $ U R                   Hg  nUR                  [        R                  R                  R                  R                  L d  M@  UR                  R                  SS 5      =n(       d  Me  Us  $    g )Nr   rL   )lenrN   nextitermetagettargettorchopsatencopy_default)placeholder_nodeuserL   s      r'   &get_mutating_use_stack_trace_from_nodere      s     !!"a'D)//0166::=$OO%%::--555!hhll=$??{?"" &
 r)   c                    U R                   $ N)rO   )placeholder_infos    r'   get_mutating_use_stack_traceri      s    444r)   c                    U R                   nU R                  R                  SS 5      n/ nS nU R                  S:X  a0  U R                   Vs/ s H  n[        U5      PM     nn[        U 5      n[        XX45      $ s  snf )NrL   placeholder)rJ   r[   r\   oprN   to_placeholder_infore   rG   )rc   rJ   rL   rN   rO   is         r'   rm   rm      s      D"''++M4@KE#m+1A1G1GH1GA$Q'1GH#I$
  4eNN Is   A9c                z    U R                    Vs/ s H  oR                  S:X  d  M  [        U5      PM!     sn$ s  snf )Nrk   )nodesrl   rm   )graphnodes     r'   get_placeholder_infors      s9    .3kk.9dWW=U!D!k  s   88c                    SU  3$ )Nzskipping cudagraphs due to r+   )reasons    r'   format_default_skip_messagerv      s    (11r)   c                    SnU H  nX   n[        U5      =n(       d  M    O   [        S[        U5       S35      nU(       a  U SU 3$ U$ )N zmutated inputs (z instances). Found from : 
 )ri   rv   rX   )rU   mutation_indicesrL   idxrk   msgs         r'   get_mutation_stack_tracer}      si     !K"'6{CC;C  
 &
3/01=C (66Jr)   c                   [         R                  R                  R                  R                  (       a?  U R
                   Vs/ s H'  nX0R                  ;   a  M  U" X   5      (       a  M%  UPM)     nnOU R
                  n[        R                  SU R                  5        [        R                  SU5        U(       a  [        U R                  U5      $ S $ s  snf )Nz'check mutation static input indices: %sz#check mutation mutation indices: %s)r^   	_inductorconfigtritoncudagraph_treesrV   r%   static_inputs_logdebugr}   rU   )funcinputsis_cuda_graph_recorded_tensorr{   rz   s        r'   check_for_mutationr      s     $$44 ..+
.---  1=	 . 	 +
  22143I3I ACST  	!!2!24DE !+
s   CC*Cc                x    U R                    H*  nUR                  R                  SS 5      =n(       d  M(  Us  $    g )NrL   )rN   r[   r\   )rr   rd   rL   s      r'   _get_use_stack_tracer      s5    zz((,,}d;;;;  r)   c                V   U R                  [        R                  " S5      S 5        [        5       (       a&  U R                  [        R                  " S5      S 5        U R	                  [        R                  " S5      5      =n(       a=  SUR
                   S3n[        U5      =n(       a  [        U SU 35      $ [        U5      $ [        U 5      S:X  a1  [        [        U R                  5       5      5      R                  S:X  a  g S U  5       n[        S	S
R                  U5       35      $ )Nr[   cpuzcpu device ()ry   r   cudac              3  8   #    U  H  n[        U5      v   M     g 7frg   )repr).0keys     r'   	<genexpr>:check_multiple_devices_or_any_cpu_nodes.<locals>.<genexpr>  s     :&9sc&9s   zmultiple devices: z, )popr^   devicer   r\   rJ   r   rv   rX   rY   rZ   keystypejoin)device_node_mappingcpu_noder|   rL   	keys_reprs        r'   'check_multiple_devices_or_any_cpu_nodesr     s    ELL0$7 $%%U 3T:&**5<<+>??x?X]]O1-.x88;8.#6H/VWW*3// 	 A%)..012776A:&9:I&);DIIi<P;Q'RSSr)   c                    [        U 5      $ rg   )r   )r   s    r'    check_lowering_disable_cudagraphr   #  s     33FGGr)   c                4   [         R                  U 5        [        S   S==   S-  ss'   [        R                  R
                  R                  R                  (       a  [        U 5      e[        5       nUR                  5       (       a  UR                  SU SS9  g g )Ninductorcudagraph_skipsr   cudagraph_skip_reasonT)	overwrite)cudagraphs_logwarningr	   r^   r   r   r   cudagraph_or_errorRuntimeErrorr
   in_progressset)r|   metrics_contexts     r'   #log_cudagraph_skip_and_bump_counterr   )  s|    3Z*+q0+$$773)+O""$$3SDI %r)   c                  *    \ rS rSr% S\S'   SS jrSrg)BoxedDeviceIndexi5  
int | Nonevaluec                D    Ub  [        U[        5      (       d   eXl        g rg   )
isinstancer5   r   )r"   
device_idxs     r'   r   BoxedDeviceIndex.set9  s    !Z
C%@%@@@
r)   )r   N)r   r   r7   None)r8   r9   r:   r;   rE   r   r=   r+   r)   r'   r   r   5  s     r)   r   c                n   [        S5      n[        R                  R                  R                  R
                  (       aZ  [        U5      nU Vs/ s H  ofU;  d  M
  UPM     nn[        U5      S:g  nU(       d  g [        U R                  5      n	[        X5      $ [        U5      S:g  nU(       d  S $ U$ s  snf )Nzmutated inputsr   )rv   r^   r   r   r   r   r   rX   rs   rq   r}   )
gmmutated_inputsrV   r%   default_msgunique_idxsr{   rz   has_mutationrU   s
             r'   3check_for_mutation_ignore_cuda_graph_managed_tensorr   >  s     ..>?K $$44 !23+=X+=CKAWC+=X+,1+BHH5'GG >*a/'t8[8 Ys   	B2B2c                    U R                   (       a  U R                   $ U R                   H"  nUR                   (       d  M  UR                   s  $    g)zE
Gets the first non-empty stack trace of a placeholder or its users.
N)rL   rN   )rk   users     r'   get_placeholder_stack_tracer   V  sF     &&&!!### " r)   c                  .    \ rS rSrSrSrSrSrS	S jrSr	g)
CheckInvariantStatusid  r            c                    U R                   S:X  a  gU R                   S:X  a  gU R                   S:X  a  gU R                    SU R                   3$ )NCudagraphManagedIdxMismatchz-cudagraph managed tensor data pointer changedStaticInputIdxMismatchz!static input data pointer changed&ExpectedDeadIndicesBeforeGraphMismatchz+expected dead indices before graph are livez: )rJ   r   )r"   s    r'   __str__CheckInvariantStatus.__str__q  sK    9955BYY226YYBB@ii[4::,//r)   r+   Nr7   rI   )
r8   r9   r:   r;   SUCCESSr   r   r   r   r=   r+   r)   r'   r   r   d  s$    G #$  ./*0r)   r   c                   [        U5      [        U5      :X  a  [        U5      [        U 5      :X  d   S5       eU Vs/ s H  oQU   PM	     nnU Vs/ s H  oRU   PM	     nnU S3n[        [        Xg5      5       Hy  u  nu  p[        U	[        R
                  5      (       d   eX5   nU	R                  5       U
:w  d  MC  X   nU SUR                   SU
 SU	R                  5        S[        U5       S3
nM{     U$ s  snf s  snf )zq
Logs the mismatch between input data pointers and recorded data pointers.
This checks only idxs in target_idxs.
zClength mismatch between inputs, recorded_data_ptr, and placeholdersz.
zinput name: z. data pointer changed from z to z. input stack trace: 
)	rX   	enumeratezipr   r^   Tensordata_ptrrJ   r   )rU   r   recorded_data_ptrtarget_idxsmismatchrn   	t_tensorst_data_ptrs	error_msgtensorr   indexrk   s                r'   log_data_ptr_mismatchr   |  s    v;#/00S[CDU5U MU %00KqKI01<=AQ'K=*C I!*3y+F!GF&%,,////??(&-K+\+*:*:); <--5Jd6??;L:M N&&A+&N%OrS  "H  1=s   C8C=c                N  ^ [        U R                  5       5      S-   mSU4S jjn[        R                  R                  R
                  R                  (       aM  T[        R                  R                  R
                  R                  :  a  [        R                  U" 5       5        gg)Nr   c                    > ST  S3$ )NzCUDAGraph supports dynamic shapes by recording a new graph for each distinct input size. Recording too many CUDAGraphs may lead to extra overhead. We have observed a0   distinct sizes. Please consider the following options for better performance: a) padding inputs to a few fixed number of shapes; or b) set torch._inductor.config.triton.cudagraph_skip_dynamic_graphs=True. Set torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit=None to silence this warning.r+   )num_cudagraphss   r'   warn_msg4maybe_warning_due_to_dynamic_shape.<locals>.warn_msg  s    00>/? @''		
r)   TFr   )	rX   r   r^   r   r   r   "cudagraph_dynamic_shape_warn_limitr   r   )fn_cachenew_int_keyr   r   s      @r'   "maybe_warning_due_to_dynamic_shaper     st     )A-N

 	%%HH
//
 
 
'
'
J
JK 	xz*r)   )r>   c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   S	rg
)CudagraphCachedInfoi  z
Info needed to realign inputs
rT   rU   list[str | None]stack_tracesz	list[str]cudagraph_fail_reasonsr+   NrD   r+   r)   r'   r   r     s     ,+""%%r)   r   c                  L    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
\S'   Srg)CudagraphMetadatai  z&
Metadata for recording a CUDA graph.
rT   rU   OrderedSet[int]r%   rV   r   r   zdict[str, torch.Tensor]rS   r+   NrD   r+   r)   r'   r   r     s'     ,+&&''""&&r)   r   c                v   / n[        5       n[        5       n[        U R                  5       H  u  pVXaR                  ;   a  UR	                  U5        XaR
                  ;   a  UR	                  U5        Ub  UR                  U   nO[        SU R                   SU 3S/ SS9nUR                  U5        M     / nU R                   H7  n	U	b   UR                  UR                  U	   5        M&  UR                  S5        M9     U R                   V
s0 s H  oUR                  U
   _M     nn
[        UUUUU5      $ s  sn
f )z
Convert the cudagraph metadata at the graph level to the graph partition level,
given the graph partition info (i.e., mapping from partition input/output index
to graph input/output index).
N
partition__placeholder_)rJ   rL   rN   rO   )r   r   input_index_mappingr%   addrV   rU   rG   rC   appendoutput_index_mappingr   constant_namesrS   r   )partition_mapmetadatapartition_placeholderspartition_static_input_idxspartition_mutated_input_idxspartition_input_idxgraph_input_idxrk   partition_stack_tracesgraph_output_idxrJ   partition_constantss               r'    get_partition_cudagraph_metadatar     s`     3=<4>L 09))1, 888'++,?@999(,,-@A&"//@K *!-"2"2!3=AT@UV )-	K 	%%k2'1*  )>>'"))(*?*?@P*QR"))$/	 ? 4A3O3O3O4h  &&3O   #$ 	s   D6c                   [        U [        R                  5      (       d
  [        5       $ [        5       n[	        U / S9 H~  n[        U5      [        R                  La  M!  [        U5      (       d+  UR                  (       d  UR                  R
                  S:w  a  M^   UR                  UR                  5       5        M     U$ ! [         a     M  f = f)zODebug helper that collects the data pointers of all CUDA tensors in the object.)outr   )r   r^   r   r   r   r   r   is_metar   r   r   	Exception)objptrsbases      r'   collect_cuda_data_ptrsr    s    c5<<((|&LD!#2.:U\\)4==DLLDKK,<,<,F	HHT]]_% / K  		s   C
CC)rc   torch.fx.Noder7   rK   )rh   rG   r7   rK   )rc   r  r7   rG   )rq   ztorch.fx.Graphr7   rM   )ru   rI   r7   rI   )rU   rT   rz   z AbstractSet[int] | Sequence[int]r7   rI   )r   rQ   r   list[InputType]r   zCallable[[torch.Tensor], bool]r7   rK   )rr   r  r7   rK   )r   z!dict[torch.device, torch.fx.Node]r7   rK   )r|   rI   r7   r   )
r   ztorch.fx.GraphModuler   zOrderedSet[str]rV   r   r%   r4   r7   rK   )rk   rG   r7   rK   )rU   rT   r   r	  r   zSequence[int | None]r   r4   r   r   r7   rI   )r   z)dict[tuple[int, ...], Callable[..., Any]]r   r   r7   r6   )r   r   r   r   r7   r   )r  objectr7   r   )C
__future__r   dataclassescollections.abcr   enumr   typingr   r   r   r^   torch._dynamo.utilsr	   r
   torch._inductor.utilsr   r   torch._subclasses.fake_tensorr   r   torch.utils._ordered_setr   utilsr   r   r   AbstractSettorch._inductor.output_coder   r   _logginggetArtifactLoggerr8   r   r   listr5   r   
OutputType	ModelTyper   	dataclassrA   rG   rQ   re   ri   rm   rs   rv   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r+   r)   r'   <module>r     s   "  $  . .  = > D / / <6e<( 11(LINN44' 
 #$t+,
d9o&
23	T Tn d$/  0 d$/) ) 0) d$/& & 0&#5O2+6 	(
 $B 	>T:TT8H:HH	J      99#9 (9 %	9
 9004 00+ , 	
 # 	>7 
: d#& & $& d#	' 	' $	'3$33 3lr)   