
    gg                         S SK r S SKJrJr  \" SS5      \" SS5      \" SS5      \" S	S
5      \" SS5      \" SS5      \" SS5      \" SS5      /rS rS rS rS rS r	S r
S rg)    N)MetricRequestRequestedMetricsParserz@sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustainedinst_executed_ffma_peakz@sm__sass_thread_inst_executed_op_dfma_pred_on.sum.peak_sustainedinst_executed_dfma_peakzEsmsp__sass_thread_inst_executed_op_fadd_pred_on.sum.per_cycle_elapsedinst_executed_faddzEsmsp__sass_thread_inst_executed_op_fmul_pred_on.sum.per_cycle_elapsedinst_executed_fmulzEsmsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsedinst_executed_ffmazEsmsp__sass_thread_inst_executed_op_dadd_pred_on.sum.per_cycle_elapsedinst_executed_daddzEsmsp__sass_thread_inst_executed_op_dmul_pred_on.sum.per_cycle_elapsedinst_executed_dmulzEsmsp__sass_thread_inst_executed_op_dfma_pred_on.sum.per_cycle_elapsedinst_executed_dfmac                      g)NSOLFPRoofline r       9nsight-compute-2025.1.1/sections/SpeedOfLight_Roofline.pyget_identifierr   (   s    r   c                      g)NRoofline Analysisr   r   r   r   get_namer   +   s    r   c                      g)Nz Floating Point Roofline Analysisr   r   r   r   get_descriptionr   .   s    -r   c                      g)NSpeedOfLight_RooflineChartr   r   r   r   get_section_identifierr   1   s    'r   c                      S/$ )NHighPipeUtilizationr   r   r   r   get_parent_rules_identifiersr   4   s    !""r   c                     XC-  S:  a  [         R                  R                  S4$ X!U-   -  SXC-  -
  -  nSU ;   a$  [         R                  R                  nXPS   -  nXg4$ [         R                  R                  nUS-  nXg4$ )N   r   fp64_pipeline_utilization_pctd   )NvRules	IFrontendSpeedupType_LOCALSpeedupType_GLOBAL)parent_weightsachieved_fp32achieved_fp64	peak_fp32	peak_fp64improvement_localspeedup_typeimprovement_percents           r   get_estimated_speedupr.   7   s    
 q   22A55&-*GH	I!! '.8((;;/A`2aa
 ,, ((::/#5,,r   c                 B   [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       n[        X5      R                  [        5      nUR                  S5      nSUS   R                  5       -  nSUS   R                  5       -  nUS   R                  5       nUS   R                  5       n	US   R                  5       n
X-   SU
-  -   nUS	   R                  5       nUS
   R                  5       nUS   R                  5       nX-   SU-  -   nSnSnX-  nUS:  d  US:X  a  SOSnX-  nUS:  d  US:X  a  SOSnSR                  Xg-  5      nUSR                  USU-  USU-  5      -  nSnUU:  Ga!  UU:  Ga  US-  nUU-  nUR                  [         R                  R                  US5      n[        X[XU5      u  nnUR                  UUU5        US:  a  UR!                  UUS	   R#                  5       U[         R                  R$                  S5        UR!                  UUS
   R#                  5       U[         R                  R$                  S5        UR!                  UUS   R#                  5       U[         R                  R$                  S5        g g UU:  a<  UU:  a6  US-  nUU-  nUR                  [         R                  R                  US5      ng UU-  nUR                  [         R                  R&                  US5      ng )Nr   r      r   r   r   r   r	   r
   r   r   g333333?g333333?g{Gz?g         z
 close to zWThe ratio of peak float (fp32) to double (fp64) performance on this device is {:.0f}:1.zs The workload achieved {}{:.0f}% of this device's fp32 peak performance and {}{:.0f}% of its fp64 peak performance.g      Y@z See the @url:Kernel Profiling Guide:https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline@ for more details on roofline analysis.z If @section:ComputeWorkloadAnalysis:Compute Workload Analysis@ determines that this workload is fp64 bound, consider using 32-bit precision floating point operations to improve its performance.zFP64/32 UtilizationzDecrease fp64 ADD instructionszDecrease fp64 MUL instructionszDecrease fp64 FMA instructionsz If @section:SpeedOfLight:Speed Of Light@ analysis determines that this workload is compute bound, consider using integer arithmetic instead where applicable.zHigh FP Utilizationr   )r"   get_contextrange_by_idxaction_by_idxfrontendr   parserequested_metricsreceive_dict_from_parentvalueformatmessager#   MsgType_MSG_OPTIMIZATIONr.   speedupfocus_metricnameSeverity_SEVERITY_HIGHMsgType_MSG_OK)handlectxactionfemetricsr&   r)   r*   fp32_add_achievedfp32_mul_achievedfp32_fma_achievedr'   fp64_add_achievedfp64_mul_achievedfp64_fma_achievedr(   high_utilization_thresholdlow_utilization_thresholdachieved_fp64_pctfp64_prefixachieved_fp32_pctfp32_prefixr;   message_profiling_guidemsg_idr,   speedup_values                              r   applyrV   L   s   


f
%Ca ..q1F	B$V4::;LMG001FGNG56<<>>IG56<<>>I 45;;= 45;;= 45;;=%9A@Q<QQM 45;;= 45;;= 45;;=%9A@Q<QQM!% $%1)T15F#5M"S_K%1)T15F#5M"S_Kgnnox  pE  FG  E  L  L  MX  Z_  bs  Zs  u@  BG  J[  B[  \  \G w55:KNg:g  X  	X**G--FFQfg&;N[hu~&#m


6<71OOFG,@$A$F$F$HJ[]d]n]n  ^F  ^F  Hh  iOOFG,@$A$F$F$HJ[]d]n]n  ^F  ^F  Hh  iOOFG,@$A$F$F$HJ[]d]n]n  ^F  ^F  Hh  i  
7	7<MPj<j  t  	t**G--FFQfg**G--<<gGZ[r   )r"   RequestedMetricsr   r   r7   r   r   r   r   r   r.   rV   r   r   r   <module>rX      s   2  B TVopTVopY[opY[opY[opY[opY[opY[op	 .(#-*3\r   