
    ggJ              
       &   S SK r S SKrS SKJrJrJr  \" SS\R                  SS5      \" SS\R                  SS5      \" S5      \" S	S
\R                  SS5      /rSrSr	S r
S rS rS rS rS rS rS rS rS rS rS rS rS rS rS rS rg)    N)
ImportanceMetricRequestRequestedMetricsParserlaunch__block_size
block_sizeFlaunch__grid_size	grid_size&device__attribute_multiprocessor_count launch__waves_per_multiprocessor	num_waves    zSee the @url:Hardware Model:https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model@ description for more details on launch configurations.c                      g)NLaunchConfiguration r       4nsight-compute-2025.1.1/sections/LaunchStatistics.pyget_identifierr   ,   s     r   c                      g)NzLaunch Configurationr   r   r   r   get_namer   0   s    !r   c                      g)Nz$Kernel launch configuration analysisr   r   r   r   get_descriptionr   4   s    1r   c                      g)NLaunchStatsr   r   r   r   get_section_identifierr   8   s    r   c                      S/$ )NSOLBottleneckr   r   r   r   get_parent_rules_identifiersr   <   s    r   c                 X   Sn/ nU R                  5       S:X  a  U R                  5       [        -  S:g  nX4$ [        U R                  5       5       HV  nU R                  U5      nU[        -  S:w  d  M#  SnU R	                  5       R                  U5      nUR                  XT45        MX     X4$ )NFr   T)num_instancesvalue	WARP_SIZErangecorrelation_idsappend)block_size_metrichas_block_size_issueproblematic_block_sizesinstance_idr   	launch_ids         r   get_problematic_block_sizesr*   @   s      &&(A-06689DI  88 !!2!@!@!BCK*00=JI%*'+$-==?EEkR	'..	/FG D  88r   c                     [         R                  " U [        -  5      nU [        -  nUS:X  d  US:X  a  SnOSU-  SU[        -  -
  -  S-  n[        R                  R
                  U4$ Nr      d   )mathceilr!   NvRules	IFrontendSpeedupType_GLOBAL)r   	num_warpsnum_threads_last_warpimprovement_percents       r    get_estimated_speedup_block_sizer7   R   sq    		*y01I&2!Y!^ ]q#89#DDEK 	 //1DDDr   c                     SnSnSnU  H  u  pE[        U5      u  pgXq:  d  M  UnUnUnM!     [        R                  R                  XU4$ Nr   )r7   r1   r2   SpeedupType_LOCAL)block_sizesmax_speeduplaunch_id_at_max_speedupblock_size_at_max_speedupr)   r   _speedup_values           r   $get_max_estimated_speedup_block_sizerA   `   s`    K  !!,	;JG&'K'0$(2% "- ..Wpppr   c           	         SnUS   nUc  g[        U5      u  pVU(       d  gUR                  5       [        R                  R                  :X  a8  [        UR                  5       5      nSR                  U5      n[        U5      u  pO.[        U5      u  ppSR                  [        UUS:X  a  SOSU5      nS	R                  [        U[        [        5      nU R                  [        R                  R                  X5      nU R                  XU
5        U R!                  XR#                  5       U[        R                  R$                  S
R                  [        5      5        g)a2  Execute the block size rule and generate a rule output if necessary.

Check whether the block size of (each) kernel launch is a multiple of the warp size,
and generate a rule message and a speedup estimate otherwise.
In case of range results, find and report the launch with the largest potential speedup.
z
Block Sizer   NzCThis kernel launch is configured to execute {:d} threads per block.zSome kernel launches of this workload are configured to use a number of threads per block that is not a multiple of {} (e.g., {} {} for launch ID {}).r-   threadthreadsa  Threads are executed in groups of {} threads called warps. {} Consequently, some threads in a warp are masked off and those hardware resources are unused. Try changing the number of threads per block to be a multiple of {} threads. Between 128 and 256 threads per block is a good initial range for experimentation. Use smaller thread blocks rather than one large thread block per multiprocessor if latency affects performance.  This is particularly beneficial to kernels that frequently call __syncthreads(). {}z>Arrange the number of threads per block to be a multiple of {})r*   workload_typer1   IActionWorkloadType_KERNELintr    formatr7   rA   r!   HARDWARE_MODEL_REFmessager2   MsgType_MSG_OPTIMIZATIONspeedupfocus_metricnameSeverity_SEVERITY_LOW)feactionmetrics	rule_namer%   r&   r'   r   workload_specific_partspeedup_typer@   r)   rule_messagemsg_ids                 r   apply_block_size_rulerY   q   s_    I-  	 	$$56 2  !D!DD*0023
!f!m!mnx!y&Fz&R#m 11HI 	;Y//5v&!O	0 		
 f"	  ZZ))BBL\FJJv]3OOF224j'BSBSBiBi  ll  ls  ls  t}  l~  r   c                 N    X-
  U-  S-  n[         R                  R                  U4$ )Nr.   )r1   r2   r3   )r	   num_smsr6   s      r   get_estimated_speedup_grid_sizer\      s-    ".'9C?//1DDDr   c                    SnSnSn[        U R                  5       5       HK  nU R                  U5      n[        Xa5      u  pxX:  d  M(  UnU R	                  5       R                  U5      nUnMM     [
        R                  R                  X#U4$ r9   )r"   r   r    r\   r#   r1   r2   r:   )	grid_size_metricr[   r<   r=   grid_size_at_max_speedupr(   r	   r?   r@   s	            r   #get_max_estimated_speedup_grid_sizer`      s    K  -;;=>$**;7	:9N&'K'7'G'G'I'O'OP['\$'0$ ? ..Wooor   c           
      (  ^ SnUS   n[        US   R                  5       5      mUc  gUR                  5       S:X  a  UR                  5       /nO7[        UR                  5       5       Vs/ s H  odR                  U5      PM     nn[	        U4S jU 5       5      (       Ga*  UR                  5       [        R                  R                  :X  a7  [        US   5      nSR                  XwS:X  a  S	OS
T5      n[        UT5      u  pO*[        UT5      u  ppSR                  TXwS:X  a  S	OS
U5      nSR                  U[        5      nU R                  [        R                  R                  X5      nU R!                  XU
5        U R#                  UUR%                  5       U[        R                  R&                  SR                  T5      5        g[	        U4S jU 5       5      (       Ga(  UR                  5       [        R                  R                  :X  a#  [        US   5      nSR                  UT-  5      nOP[)        U5      nUR+                  U5      nUR-                  5       R                  U5      nSR                  UT-  U5      nSR                  U5      nU R                  [        R                  R                  X5      nU R#                  UUR%                  5       U[        R                  R.                  SR                  ST-  5      5        ggs  snf )a  Execute the grid size rule and generate a rule output if necessary.

Check whether the grid size of (each) kernel launch is at least the number of SMs,
and generate a rule message and a speedup estimate otherwise.
In case of range results, find and report the launch with the largest potential speedup.

Also check if the grid size is less than twice the number of SMs for any launch,
and suggest to use at least two blocks per SM when __syncthreads() is used.
z
Small Gridr	   r
   Nr   c              3   ,   >#    U  H	  oT:  v   M     g 7f)Nr   .0sizer[   s     r   	<genexpr>'apply_grid_size_rule.<locals>.<genexpr>   s     
1jd'>js   zrThe grid for this launch is configured to execute only {:d} {}, which is less than the GPU's {:d} multiprocessors.r-   blockblockszAt least one kernel launch of this workload has a grid which is configured to execute fewer blocks than the GPU's {:d} multiprocessors (e.g., {:d} {} for launch ID {:d}).a#  {} This can underutilize some multiprocessors. If you do not intend to execute this kernel concurrently with other workloads, consider reducing the block size to have at least one block per multiprocessor or increase the size of the grid to fully utilize the available hardware resources. {}zCIncrease the grid size towards the number of multiprocessors ({:d})c              3   2   >#    U  H  oS T-  :  v   M     g7f)   Nr   rc   s     r   rf   rg   	  s     7JDAKJs   z2(compared to the currently executed {:.1f} blocks)zE(compared to the currently executed {:.1f} blocks for launch ID {:d})zIf you execute __syncthreads() to synchronize the threads of a block, it is recommended to have at least two blocks per multiprocessor {} This way, blocks that aren't waiting for __syncthreads() can keep the hardware busy.zIIncrease the grid size towards twice the number of multiprocessors ({:d})rk   )rH   r    r   r"   anyrE   r1   rF   rG   rI   r\   r`   rJ   rK   r2   rL   rM   rN   rO   Severity_SEVERITY_HIGHminindexr#   rP   )rQ   rR   rS   rT   r^   
grid_sizesir	   rU   rV   r@   r)   rW   rX   r(   r[   s                  @r   apply_grid_size_rulerr      s    I{+'BCIIKLG 	%%'1,&,,./
9>?O?]?]?_9`a9`A,,Q/9`
a

1j
111!W__%H%HHJqM*IFFLfq.whG # +J)U\*]'L- 44DgN >L77=vY>xQZ8 #
 &/1CD 	 G--FF`


67
!!#44**0&/		
 
7J7	7	7!W__%H%HHJqM*IDKK' # JI$**95K(88:@@MI''-vi'.A9'M #+ ,262H+I	 	 G--FF`
!!#33**0&W*=		
3 
8] bs   *Lc                    / nU R                  5       S:X  aX  U R                  5       n[        R                  " U5      u  p4US:  =(       a    US:  nU(       a  SnUR	                  XcU45        XQ4$ [        U R                  5       5       HY  nU R                  U5      n[        R                  " U5      u  p4US:  =(       a    US:  nU(       d  MF  UR	                  XcU45        M[     [        U5      S:  U4$ )Nr   r-   )r   r    r/   modfr$   r"   len)num_waves_metricproblematic_launchesr   partial_waveswhole_waveshas_tail_effectr(   s          r   get_problematic_num_wavesr{   .  s    %%'1,$**,	%)YYy%9"'!+@q0@K ''[(QR44-;;=>$**;7	%)YYy%9"'!+@q0@? ''[(QR ? #$q(*>>>r   c                     U S:X  a  [         R                  R                  S4$ SSU-   -  S-  n[         R                  R                  U4$ r,   )r1   r2   r:   r3   )rx   ry   r6   s      r   !get_estimated_speedup_tail_effectr}   E  sM      22A55q;/#5//1DDDr   c                     SnSnSnSnU  H"  u  pVn[        Xg5      u  pX:  d  M  U	nUnUnUnM$     [        R                  R                  XX44$ r9   )r}   r1   r2   r:   )
rw   r<   r=   partial_waves_at_max_speedupwhole_waves_at_max_speedupr)   rx   ry   r?   r@   s
             r   %get_max_estimated_speedup_tail_effectr   M  su    K #$ !"1E-	+<]X&'K'0$+8()4& 2F ..Ws  P  Pr   c                    SnUS   nUS   nUc  gSn[        U5      u  pxU(       Ga  UR                  5       [        R                  R                  :X  al  US   u  pnUR                  5       nUR                  5       n[        X5      u  p[        R                  " XU-  -  5      nSR                  [        U5      U5      nO[        U5      u  pnpUR                  5       R                  U5      nUR                  U5      nUR                  U5      n[        R                  " XU-  -  5      nSR                  U[        U5      U5      nX:  a  gS	R                  UU[        5      nU R                  [        R                  R                   UU5      nU R#                  UX5        U R%                  UUR'                  5       U[        R                  R(                  S
5        gg)zExecute the tail effect rule and generate a rule output if necessary.

Check whether any kernel launch has a partial wave of thread blocks.
In case of range results, report the launch with the largest (local) speedup.
zTail Effectr	   r   N   r   zWThis kernel launch results in {:d} full waves and a partial wave of {:d} thread blocks.zAt least one kernel launch of this workload results in a partial wave (e.g., for launch ID {:d}, {:d} full waves and a partial wave of {:d} thread blocks are executed).a#  A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical occupancy of the kernel. {} Under the assumption of a uniform execution duration of all thread blocks, this partial wave may account for up to {:.1f}% of the total runtime of this kernel. Try launching a grid with no partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for a grid. {}zQDecrease the number of partial waves (the fractional part of the number of waves))r{   rE   r1   rF   rG   r    r}   r/   r0   rI   rH   r   r#   rJ   rK   r2   rL   rM   rN   rO   Severity_SEVERITY_DEFAULT)rQ   rR   rS   rT   r^   rv   speedup_thresholdrz   rw   r?   rx   ry   r	   r   rV   r@   partial_wave_blocksrU   r(   r)   rW   rX   s                         r   apply_tail_effect_ruler   `  s    I{+{+ 	,EFV,W)O!W__%H%HH,@,C)Ak(..0I(..0I*KM*g'L"&))I9R,S"T//5v$&90 # 66JK QLm(88:@@MI(..{;I(..{;I"&))I9R,S"T55;Vs;/1D6 # ,	 &&" 	" G--FFV_`


6<7
!!#77@		
e r   c                    [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       n[        X5      R                  [        5      n[        X2U5        [        X2U5        [        X2U5        g r9   )r1   get_contextrange_by_idxaction_by_idxfrontendr   parserequested_metricsrY   rr   r   )handlectxrR   rQ   rS   s        r   applyr     sl    


f
%Ca ..q1F	B$V4::;LMG"g.W-2w/r   )r/   r1   RequestedMetricsr   r   r   OPTIONALr   r!   rJ   r   r   r   r   r   r*   r7   rA   rY   r\   r`   rr   r{   r}   r   r   r   r   r   r   <module>r      s   2   N N &j6I6I4QVW%{J4G4GuU:;4k:CVCVX\^cd	  	> !"29$Eq"7tEp$g
T?.EP&N
b	0r   