
    ggx                     j    S SK r S SKJrJr  \" SS5      \" SS5      /rS rS rS	 rS
 rS r	S r
S rg)    N)MetricRequestRequestedMetricsParserz2smsp__thread_inst_executed_per_inst_executed.ratiothread_inst_executedz:smsp__thread_inst_executed_pred_on_per_inst_executed.ratiothread_inst_executed_NPOc                      g)NThreadDivergence r	       4nsight-compute-2025.1.1/sections/ThreadDivergence.pyget_identifierr   "   s    r
   c                      g)NzThread Divergencer	   r	   r
   r   get_namer   %   s    r
   c                      g)Nz%Warp and thread control flow analysisr	   r	   r
   r   get_descriptionr   (   s    2r
   c                      g)NWarpStateStatsr	   r	   r
   r   get_section_identifierr   +   s    r
   c                      S/$ )NComputer	   r	   r
   r   get_parent_rules_identifiersr   .   s
    ;r
   c                     [        X5      nSUS-  -
  nSnXP;   a'  [        R                  R                  nX@U   -  S-  nXg4$ [        R                  R                  nUS-  nXg4$ )N       compute_throughput_normalizedd   )minNvRules	IFrontendSpeedupType_GLOBALSpeedupType_LOCAL)parent_weightsr   r   num_threads_usedimprovement_localcompute_throughput_namespeedup_typeimprovement_percents           r   get_estimated_speedupr'   2   s    /J-22=0((;;/AX2YY\__
 ,, ((::/#5,,r
   c           	         [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       n[        X5      R                  [        5      nUR                  S5      nUS   R                  5       nUS   R                  5       n/ nSn	Xi:  d  Xy:  Ga  SR                  U5      n
UR                  US   R                  5       U[         R                  R                  S45        Xv:  aR  U
SR                  U5      -  n
UR                  US   R                  5       U[         R                  R                  S	45        UR!                  [         R                  R"                  U
5      n[%        XVU5      u  pUR'                  XU5        U H#  nUR)                  XS   US
   US   US   5        M%     g g )Nr   r   r   r      a  Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early thread completion, and divergent flow control can significantly lower the number of active threads in a warp per cycle. This workload achieves an average of {0:.1f} threads being active per cycle.z9Increase the number of threads per instruction towards 32aO   This is further reduced to {0:.1f} threads per warp due to predication. The compiler may use predication to avoid an actual branch. Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads execute the instructions. Try to avoid different execution paths within a warp when possible.zGIncrease the number of predicated-on threads per instruction towards 32r         )r   get_contextrange_by_idxaction_by_idxfrontendr   parserequested_metricsreceive_dict_from_parentvalueformatappendnamer   Severity_SEVERITY_LOWSeverity_SEVERITY_HIGHmessageMsgType_MSG_OPTIMIZATIONr'   speedupfocus_metric)handlectxactionfemetricsr!   r   r   fms	thresholdr9   msg_idr%   speedup_valuefms                  r   applyrG   A   s   


f
%Ca ..q1F	B$V4::;LMG00;N"#9:@@B&'ABHHJ
CI'+C+O i  p  p  qE  F

G2388:<PRYRcRcRyRy  |w  x  	y#:  i  p  p  qI  J  JGJJ :;@@BD\^e^o^o  _G  _G  IR  S  TG--FFP&;Nbz&{#


67BOOFqE2a5"Q%A?  ,Pr
   )r   RequestedMetricsr   r   r1   r   r   r   r   r   r'   rG   r	   r
   r   <module>rI      sS   2  B FH^_NPjk 3-@r
   