
    gg                         S SK r S SKJrJrJr  \" SS\R
                  SS5      \" SS\R
                  SS5      /rS rS	 rS
 r	S r
S rS rS rg)    N)
ImportanceMetricRequestRequestedMetricsParser&sm__maximum_warps_per_active_cycle_pcttheoretical_occupancyFz1sm__warps_active.avg.pct_of_peak_sustained_activeachieved_occupancyc                      g)NAchievedOccupancy r       5nsight-compute-2025.1.1/sections/AchievedOccupancy.pyget_identifierr   "   s    r   c                      g)NzAchieved Occupancyr   r   r   r   get_namer   %   s    r   c                      g)Nz"Analysis of the Achieved Occupancyr   r   r   r   get_descriptionr   (   s    /r   c                      g)N	Occupancyr   r   r   r   get_section_identifierr   +   s    r   c                      S/$ )NIssueSlotUtilizationr   r   r   r   get_parent_rules_identifiersr   .   s    "##r   c                    US   R                  5       nUS   R                  5       nX#-
  U-  nSnXP;   a0  [        R                  R                  n[	        X   U5      nUS-  nXh4$ [        R                  R
                  nUS-  nXh4$ )a;  Estimate potential speedup from increasing the achieved occupancy.

The performance improvement is approximated as relative proportion of the difference
of theoretical and achieved occupancy.
In case it's available, the performance improvement can be upper-bounded by the
speedup estimate of IssueSlotUtilization.

r   r   "issue_slot_util_speedup_normalizedd   )valueNvRules	IFrontendSpeedupType_GLOBALminSpeedupType_LOCAL)	parent_weightsmetricsr   r   improvement_localparent_speedup_namespeedup_typeimprovement_globalimprovement_percents	            r   get_estimated_speedupr)   2   s     $$;<BBD !56<<>.CG\\>,((;; /1B
 136
 ,, ((::/#5,,r   c           
         [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       n[        X5      R                  [        5      n[        S UR                  5        5       5      (       a  g UR                  S5      nSnSnUR                  5       [         R                  R                  :X  a  US   R                  5       nUS   R                  5       n	X-
  n
X::  a  g SR                  XU5      nUR!                  [         R"                  R$                  U5      n['        XT5      u  pUR)                  XU5        UR+                  XS   R-                  5       U	[         R"                  R.                  S	R                  U5      5        g US   nUS   R                  5       n	/ n[1        UR3                  5       5       HQ  nUR                  U5      nX-
  n
X:  d  M  UR5                  5       R                  U5      nUR7                  UU45        MS     [9        U5      S:X  a  g UR;                  S
S S9  SR                  [9        U5      S:X  a  SR                  US   S   5      O)SR                  SR=                  S US S  5       5      5      U5      nUR!                  [         R"                  R$                  U5      ng )Nr   c              3   (   #    U  H  oS L v   M
     g 7f)Nr   ).0metrics     r   	<genexpr>apply.<locals>.<genexpr>S   s     
9(8fT>(8s   r   a  Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the @url:CUDA Best Practices Guide:https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy@ for more details on optimizing occupancy.
   r   r   zThe difference between calculated theoretical ({:.1f}%) and measured achieved occupancy ({:.1f}%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. {}zGIncrease the achieved occupancy towards the theoretical limit ({:.1f}%)Tc                     U S   $ )N   r   )xs    r   <lambda>apply.<locals>.<lambda>   s    !r   )reversekeya  The large difference between the calculated theoretical occupancy (per launch) and the measured achieved occupancy (of the entire workload) for some launches of this workload (e.g., {}) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. {}r2   zlaunch ID {}zlaunch IDs {}z, c              3   >   #    U  H  n[        US    5      v   M     g7f)r   N)str)r,   launchs     r   r.   r/      s     5nSmc&)nnSms      )r   get_contextrange_by_idxaction_by_idxfrontendr   parserequested_metricsanyvaluesreceive_dict_from_parentworkload_typeIActionWorkloadType_KERNELr   formatmessager   MsgType_MSG_OPTIMIZATIONr)   speedupfocus_metricnameSeverity_SEVERITY_DEFAULTrangenum_instancescorrelation_idsappendlensortjoin)handlectxactionfer#   r"   load_imbalance_adviceoccupancy_difference_thresholdr   r   occupancy_differencerI   msg_idr&   speedup_valuetheoretical_occupancy_metriclow_occupancy_launchesinstance_id	launch_ids                      r   applyrc   M   s   


f
%Ca ..q1F	B$V4::;LMG

9(8
999 	001GHN	5  &("!D!DD '(? @ F F H$%9:@@B4IA &%;P	 	 G--FFP&;N&T#


67
(< = B B DFXZaZkZk  [F  [F  HQ  HX  HX  Yn  Ho  	p (//F'G$$%9:@@B!# !=!K!K!MNK$@$F$F{$S!#8#M  $D8HHJPPQ\]	&--y:O.PQ O %&!+ 	##Dn#EP QWPVGJKaGbfgGg%%&<Q&?&BC$++DII5nSijlklSm5n,no%Q	 	 G--FFPr   )r   RequestedMetricsr   r   r   OPTIONALrA   r   r   r   r   r   r)   rc   r   r   r   <module>rf      su   2  N N :<SU_UhUhjnpuvEG[]g]p]prvx}~  0$-6QQr   