
    gg5                     x    S SK r S SKJrJrJr  \" SS\R
                  SS5      /rS rS rS r	S r
S	 rS
 rS rg)    N)
ImportanceMetricRequestRequestedMetricsParsersass__inst_executed_per_opcodeFc                      g)NFPInstructions r	       2nsight-compute-2025.1.1/sections/FPInstructions.pyget_identifierr   !   s    r
   c                      g)NzFP32/64 Instructionsr	   r	   r
   r   get_namer   $   s    !r
   c                      g)Nz$Floating-point instruction analysis.r	   r	   r
   r   get_descriptionr   '   s    1r
   c                      g)NInstructionStatsr	   r	   r
   r   get_section_identifierr   *   s    r
   c                      S/$ )NHighPipeUtilizationr	   r	   r
   r   get_parent_rules_identifiersr   -   s    !""r
   c                     X!-   nSX#-  -  nU b!  [         R                  R                  nX@-  nXV4$ [         R                  R                  nUS-  nXV4$ )Ng      ?d   )NvRules	IFrontendSpeedupType_GLOBALSpeedupType_LOCAL)pipeline_utilization_pctfused_instructionsnon_fused_instructionsall_instructionsimprovement_localspeedup_typeimprovement_percents          r   get_estimated_speedupr$   0   sn    
 .B5HI+((;;/J
 ,, ((::/#5,,r
   c                    [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       nUR                  5       [         R                  R                  :w  a  g [        X5      R                  [        5      n[        S UR                  5        5       5      (       a  g UR                  S5      n/ SQ/ SQS.nUS   nUR                  5       nUR                  5       n	U GH  n
[!        5       nXj   n[#        SU5       H<  nU	R%                  U5      R'                  5       nX;   d  M)  UR)                  U5      X'   M>     Sn[#        SS5       H  nX   nX;   d  M  XU   -  nM     SnUS   nX;   a  UX   -  nUS:  d  US:  d  M  XU-   -  S-  nUS	:  d  M  S
R+                  UX5      nUSR+                  U
SU-  U
5      -  nSR+                  U
5      nUR-                  [         R.                  R0                  UU5      nS nSR+                  U
5      nUU;   a  UU   n[3        UUU5      u  nnUR5                  UUU5        UR7                  USU[         R.                  R8                  S5        Uc  GM  U
S:X  a  SnOSnUR7                  UUU[         R.                  R:                  S5        GM     g )Nr   c              3   (   #    U  H  oS L v   M
     g 7f)Nr	   ).0metrics     r   	<genexpr>apply.<locals>.<genexpr>J   s     
9(8fT>(8s   r   )FADDFMULFFMA)DADDDMULDFMA)    @   r      g?zAThis kernel executes {} fused and {} non-fused FP{} instructions.a`   By converting pairs of non-fused instructions to their @url:fused:https://docs.nvidia.com/cuda/floating-point/#cuda-and-floating-point@, higher-throughput equivalent, the achieved FP{} performance could be increased by up to {:.0f}% (relative to its current performance). Check the Source page to identify where this kernel executes FP{} instructions.g      Y@zFP{} Non-Fused Instructionszfp{}_pipeline_utilization_pctzUDecrease the number of non-fused floating-point instructions (FADD, FMUL, DADD, DMUL)r1   z;sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_activez<sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_activezLThe higher the utilization of the pipeline the more severe the issue becomes)r   get_contextrange_by_idxaction_by_idxfrontendworkload_typeIActionWorkloadType_KERNELr   parserequested_metricsanyvaluesreceive_dict_from_parentnum_instancescorrelation_idsdictrange	as_stringupper	as_uint64formatmessager   MsgType_MSG_OPTIMIZATIONr$   speedupfocus_metricSeverity_SEVERITY_HIGHSeverity_SEVERITY_LOW)handlectxactionfemetricsparent_weightsfp_typesinst_per_opcodenum_opcodesopcodesfp_typefp_insts
fp_opcodesiop	non_fusedfusedratiorH   message_titlemsg_idr   parent_weight_namer"   speedup_valuemetric_names                             r   applyre   A   s   


f
%Ca ..q1F	B!D!DD$V4::;LMG

9(8
999 	001FGN ('H >?O!//1K--/G 6&
q%A""1%++-B.88; & 	q!AB~b\)	 
 ]>X\!Eq=EAIe"349Es{]ddejlu ggmgmnuw{  D  xD  FM  hNN !> D DW MG$5$5$N$NPWYfg+/(%D%K%KG%T"%7/=>P/Q,.CD\^cen.o+m

6<?(H)U\UfUfU}U}  @W  X+7"}&c&dOOFK9QSZSdSdSzSz  }K  L[ r
   )r   RequestedMetricsr   r   r   OPTIONALr<   r   r   r   r   r   r$   re   r	   r
   r   <module>rh      sW   2  N N 2D*:M:MtUZ[ 
"2#-"HLr
   