
    gg]                     H   S SK r S SKJrJrJr  / \" SS5      P\" SS5      P\" SS\R
                  SS5      P\" S	S\R
                  SS5      P\" S
S\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" SS\R
                  SS5      P\" S S\R
                  SS5      P\" S!S\R
                  SS5      P\" S"S\R
                  SS5      P\" S#S\R
                  SS5      P\" S$S\R
                  SS5      P\" S%S\R
                  SS5      P\" S&S\R
                  SS5      P\" S'S\R
                  SS5      P\" S(S\R
                  SS5      P\" S)S\R
                  SS5      P\" S*S\R
                  SS5      P\" S+S\R
                  SS5      P\" S,S\R
                  SS5      P\" S-S\R
                  SS5      P\" S.S\R
                  SS5      P\" S/S\R
                  SS5      P\" S0S\R
                  SS5      P\" S1S\R
                  SS5      P\" S2S\R
                  SS5      P\" S3S4\R
                  SS5      PrS5 rS6 rS7 r	S8 r
S9 rS: rS; r " S< S=5      r " S> S?\5      r " S@ SA\5      rSB rg)C    N)
ImportanceMetricRequestRequestedMetricsParser*device__attribute_compute_capability_majorcc_major*device__attribute_compute_capability_minorcc_minorz;sm__pipe_alu_cycles_active.avg.pct_of_peak_sustained_activeF;sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active<sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_activez>sm__pipe_shared_cycles_active.avg.pct_of_peak_sustained_activez:sm__pipe_tc_cycles_active.avg.pct_of_peak_sustained_activez>sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_activezAsm__pipe_tensor_cycles_active_v2.avg.pct_of_peak_sustained_activezKsm__pipe_tensor_subpipe_dmma_cycles_active.avg.pct_of_peak_sustained_activezKsm__pipe_tensor_subpipe_hmma_cycles_active.avg.pct_of_peak_sustained_activezKsm__pipe_tensor_subpipe_imma_cycles_active.avg.pct_of_peak_sustained_activezFsm__pipe_tensor_op_dmma_cycles_active.avg.pct_of_peak_sustained_activezFsm__pipe_tensor_op_hmma_cycles_active.avg.pct_of_peak_sustained_activezFsm__pipe_tensor_op_imma_cycles_active.avg.pct_of_peak_sustained_activezIsm__pipe_tensor_op_hmma_cycles_active_v2.avg.pct_of_peak_sustained_activezIsm__pipe_tensor_op_imma_cycles_active_v2.avg.pct_of_peak_sustained_activez;sm__pipe_tma_cycles_active.avg.pct_of_peak_sustained_activez=sm__mem_tensor_cycles_active.avg.pct_of_peak_sustained_activez;sm__inst_executed_pipe_adu.avg.pct_of_peak_sustained_activez;sm__inst_executed_pipe_alu.avg.pct_of_peak_sustained_activez;sm__inst_executed_pipe_cbu.avg.pct_of_peak_sustained_activez;sm__inst_executed_pipe_fma.avg.pct_of_peak_sustained_activezEsm__inst_executed_pipe_fma_type_fp16.avg.pct_of_peak_sustained_activez<sm__inst_executed_pipe_fp16.avg.pct_of_peak_sustained_activez<sm__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_activezDsm__inst_executed_pipe_fp64_op_dmma.avg.pct_of_peak_sustained_activezDsm__inst_executed_pipe_fp64_op_fp64.avg.pct_of_peak_sustained_activez;sm__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_activez:sm__inst_executed_pipe_tc.avg.pct_of_peak_sustained_activezKsm__inst_executed_pipe_tensor_subpipe_dmma.avg.pct_of_peak_sustained_activezKsm__inst_executed_pipe_tensor_subpipe_hmma.avg.pct_of_peak_sustained_activezKsm__inst_executed_pipe_tensor_subpipe_imma.avg.pct_of_peak_sustained_activezFsm__inst_executed_pipe_tensor_op_dmma.avg.pct_of_peak_sustained_activezFsm__inst_executed_pipe_tensor_op_gmma.avg.pct_of_peak_sustained_activezFsm__inst_executed_pipe_tensor_op_hmma.avg.pct_of_peak_sustained_activezFsm__inst_executed_pipe_tensor_op_imma.avg.pct_of_peak_sustained_activezIsm__inst_executed_pipe_tensor_op_hmma_v2.avg.pct_of_peak_sustained_activezIsm__inst_executed_pipe_tensor_op_imma_v2.avg.pct_of_peak_sustained_activez;sm__inst_executed_pipe_tex.avg.pct_of_peak_sustained_activez;sm__inst_executed_pipe_tma.avg.pct_of_peak_sustained_activez<sm__inst_executed_pipe_tmem.avg.pct_of_peak_sustained_activez?sm__inst_executed_pipe_uniform.avg.pct_of_peak_sustained_activez>sm__inst_executed_pipe_workid.avg.pct_of_peak_sustained_activez:sm__inst_executed_pipe_xu.avg.pct_of_peak_sustained_activez'smsp__issue_active.avg.per_cycle_activeissue_activec                      g)NHighPipeUtilization r       7nsight-compute-2025.1.1/sections/HighPipeUtilization.pyget_identifierr   S   s     r   c                      g)NzHigh Pipe Utilizationr   r   r   r   get_namer   V   s    "r   c                      g)Nz)High pipe utilization bottleneck analysisr   r   r   r   get_descriptionr   Y   s    6r   c                      g)NComputeWorkloadAnalysisr   r   r   r   get_section_identifierr   \   s    $r   c                      S/$ )NComputer   r   r   r   get_parent_rules_identifiersr   _   s
    ;r   c                 V    SU S-  -
  n[         R                  R                  nUS-  nX#4$ )N   d   )NvRules	IFrontendSpeedupType_LOCAL)max_utilization_acimprovement_localspeedup_typeimprovement_percents       r   get_estimated_speedupr'   b   s8    /#56$$66L+c1,,r   c                 N   SnS nUS   R                  5       S-  US   R                  5       -   nU  Ho  nUR                  b  XER                  :  a  M!  UR                  b  XER                  :  a  M?  UR                  nX   c  MR  X   R                  5       nXr:  d  Mk  UnUnMq     X24$ )Ng        r   
   r	   )valuecc_mincc_maxmetric)	pipelinesmetricsmax_utilizationmax_pipeccpipemetric_namer*   s           r   get_max_pipeliner5   j   s    OH			"	"	$r	)GJ,?,E,E,G	GB;;"rKK'7;;"rKK'7kk+(..0E&"'  &&r   c                   $    \ rS rSrSS jrS rSrg)Pipeline   Nc                 H    Xl         X l        X0l        US-   U l        XPl        g )Nz!.avg.pct_of_peak_sustained_active)namer+   r,   r-   description)selfr:   r+   r,   r-   r;   s         r   __init__Pipeline.__init__   s$    	BB&r   c                     U R                   $ Nr;   )r<   r/   s     r   r   Pipeline.get_description   s    r   )r,   r+   r;   r-   r:   r@   )__name__
__module____qualname____firstlineno__r=   r   __static_attributes__r   r   r   r7   r7      s    ' r   r7   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )CompositePipeline   c                 4   > [         TU ]  XX4U5        X`l        g r@   )superr=   sub_pipelines)r<   r:   r+   r,   r-   r;   rM   	__class__s          r   r=   CompositePipeline.__init__   s    D*r   c                     U R                   n[        U R                  U5      u  p4Ub  USR                  UR                  5      -  nU$ )Nz'. It's dominated by its {} sub-pipeline)r;   r5   rM   formatr:   )r<   r/   r;   r1   _s        r   r   !CompositePipeline.get_description   sG    &&&t'9'97CDKKHMMZZKr   )rM   rC   rD   rE   rF   r=   r   rG   __classcell__rN   s   @r   rI   rI      s    + r   rI   c                   4   ^  \ rS rSrU 4S jrU 4S jrSrU =r$ )SharedPipeline   c                 *   > [         TU ]  XX4S U5        g r@   )rL   r=   )r<   r:   r+   r,   r-   rM   rN   s         r   r=   SharedPipeline.__init__   s    mLr   c           	         > US   R                  5       S-  US   R                  5       -   nSSSSSSSSS.nSnX#;   a  XCU   -  nX@l        [        TU ]  U5      nU$ )	Nr   r)   r	   zA. It executes 64- and 16-bit floating point and tensor operationsz9. It executes 16-bit floating point and tensor operationsz9. It executes 64-bit floating point and tensor operations)F   H   K   P   Z   r   e   x   z_is the logical sum of several other pipelines which can't achieve full utilization on their own)r*   r;   rL   r   )r<   r/   r2   descriptionsr;   rN   s        r   r   SharedPipeline.get_description   s    Z &&(2-
0C0I0I0KK ULLLLMMM	
 x++K&g-g6r   rA   rT   rV   s   @r   rX   rX      s    M r   rX   c                    [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       n[        X5      R                  [        5      nUS   R                  5       S-  US   R                  5       -   nUR                  US   R                  5       US   R                  5       S.5        SnS	nS
nSn	Sn
US:X  a
  SnSnSnSn	Sn
[        SS S SS5      [        SS S SS5      [        SS S SS5      [        SS S S[        SS S S5      [        SSS S 5      [        S!SS S"5      [        S#SSS$5      [        SS S%U5      [        S!S&S%U5      [        S#S'S'S(5      [        S#S%S%S(5      /5      [        S)SS*S+S,5      [        S-S S US.[        SSS S 5      [        S!SS S"5      [        S#SSS$5      [        SS S%U5      [        S!S&S%U5      [        S#S'S'S(5      [        S#S%S%S(5      /5      [        S/S%S S0S15      [        S2SS*S3S45      1n[        S5S S S65      [        SS S S7S5      [        S8S S S95      [        SS S S:S5      [        S;S S'S<S=5      [        S>S?S S@S=5      [        SS S SAS5      [        SBS?S SCSD5      [        SES?S SFSG5      [        SHS S SISJ5      [        S)SS*SKS,5      [        SSS SLSM5      [        S!SS SNSO5      [        S#SSSPSQ5      [        S#S'S'SRSQ5      [        S#S%S%SRSQ5      [        SSS%S%STSU5      [        SS S%U	SM5      [        S!S&S%U
SO5      [        SVS S SWSX5      [        S/S%S SYS15      [        S2S S SZS[5      [        S\S]S S^5      [        S_SS S`Sa5      [        SbS S Sc5      1nSdnSenS'n[        X5      u  nnUGbL  SfnSgnShnUSi   b  USi   R                  5       nUSj:  a  SknUU:  a  SlnUSm-  nUR                  [         R                  R                   USn5      n[#        U5      u  nnUR%                  UUU5        UR'                  UUR(                  U[         R                  R*                  SoR-                  UR.                  5      5        g SpR-                  UR.                  U5      nUR1                  U5      nUb  USqU-   Sr-   -  nUU:  a3  SsnUSt-  nUR                  [         R                  R2                  UU5        g UU:  a  SunUSv-  nOSwnUSx-  n[        X5      u  nnUbZ  USyR-                  UUR.                  5      -  nUR1                  U5      nUb  USqU-   Sr-   -  nUU-  nUSz:  a  US{-  nOUS|:  a  US}-  nUUU-   U-   -  nUR                  [         R                  R                   UU5      nUR'                  UUR(                  U[         R                  R4                  S~R-                  UR.                  5      5        g g )Nr   r   r)   r	   r
   r   )fp32_pipeline_utilization_pctfp64_pipeline_utilization_pctsm__pipe_tensor_cycles_active%sm__pipe_tensor_op_hmma_cycles_active%sm__pipe_tensor_op_imma_cycles_active%sm__inst_executed_pipe_tensor_op_hmma%sm__inst_executed_pipe_tensor_op_immaY    sm__pipe_tensor_cycles_active_v2(sm__pipe_tensor_op_hmma_cycles_active_v2(sm__pipe_tensor_op_imma_cycles_active_v2(sm__inst_executed_pipe_tensor_op_hmma_v2(sm__inst_executed_pipe_tensor_op_imma_v2ALUsm__pipe_alu_cycles_activez%executes integer and logic operationsFMAsm__pipe_fma_cycles_activezZexecutes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD) operationsFP64sm__pipe_fp64_cycles_activez)executes 64-bit floating point operationsSharedsm__pipe_shared_cycles_activezTensor (FP)r   *sm__pipe_tensor_subpipe_hmma_cycles_activezTensor (INT)*sm__pipe_tensor_subpipe_imma_cycles_activezTensor (DP)*sm__pipe_tensor_subpipe_dmma_cycles_activera   r^   r`   %sm__pipe_tensor_op_dmma_cycles_activeTCrb   sm__pipe_tc_cycles_activezNexecutes Tensor Core (UTCBAR, UTCCP, UTC*MMA, UTCSHIFT and UTC*SWS) operationsTensorz9is the logical aggregation of individual tensor pipelinesTMAsm__pipe_tma_cycles_activez3executes Tensor Memory Accelerator (TMA) operationszTMEM (Tensor Memory)sm__mem_tensor_cycles_activezDincrements for LDT(M), STT(M), UTCCP, UTCMMA and UTCSHIFT operationsADUsm__inst_executed_pipe_adusm__inst_executed_pipe_aluCBUsm__inst_executed_pipe_cbusm__inst_executed_pipe_fmaFP16sm__inst_executed_pipe_fp16z)executes 16-bit floating point operationsz
FMA (FP16)V   $sm__inst_executed_pipe_fma_type_fp16sm__inst_executed_pipe_fp64zFP64 (DMMA)#sm__inst_executed_pipe_fp64_op_dmmazexecutes DMMA operationszFP64 (FP64)#sm__inst_executed_pipe_fp64_op_fp64z2executes non-DMMA 64-bit floating point operationsLSUsm__inst_executed_pipe_lsuz%executes load/store memory operationssm__inst_executed_pipe_tc*sm__inst_executed_pipe_tensor_subpipe_hmmaz0executes 16-bit floating point tensor operations*sm__inst_executed_pipe_tensor_subpipe_immaz*executes 4/8-bit integer tensor operations*sm__inst_executed_pipe_tensor_subpipe_dmmaz0executes 64-bit floating point tensor operations%sm__inst_executed_pipe_tensor_op_dmmazTensor (Warp Group)%sm__inst_executed_pipe_tensor_op_gmmaz%executes warp group tensor operationsTEXsm__inst_executed_pipe_texz#executes texture/surface operationssm__inst_executed_pipe_tmasm__inst_executed_pipe_tmemzIexecutes Tensor Memory (FENCE.VIEW.ASYNC.T, LDT(M) and STT(M)) operationsUniformr_   sm__inst_executed_pipe_uniformWorkIDsm__inst_executed_pipe_workidz"executes UGETNEXTWORKID operationsXUsm__inst_executed_pipe_xu   <   z See the @url:Kernel Profiling Guide:https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-decoder@ or hover over the pipeline name to understand the workloads handled by each pipeline.zx The @section:InstructionStats:Instruction Statistics@ section shows the mix of executed instructions for this workload. r   g?zi Check the @section:WarpStateStats:Warp State Statistics@ section for which reasons cause warps to stall.z|All compute pipelines are under-utilized. Either this workload is very small or it doesn't issue enough warps per scheduler.z Check the @section:LaunchStats:Launch Statistics@ and @section:SchedulerStats:Scheduler Statistics@ sections for further details.zLow Utilizationz@Increase the utilization of the busiest pipeline (currently: {})z{} is the highest-utilized pipeline ({:.1f}%) based on active cycles, taking into account the rates of its different instructions.z It .Balancedz5 It is well-utilized, but should not be a bottleneck.zHigh UtilizationzT The pipeline is well-utilized, but might become a bottleneck if more work is added.zVery High UtilizationzC The pipeline is over-utilized and likely a performance bottleneck.z] Based on the number of executed instructions, the highest utilized pipeline ({:.1f}%) is {}.g333333?zg Comparing the two, the overall pipeline utilization appears to be caused by high-latency instructions.gffffff?zp Comparing the two, the overall pipeline utilization appears to be caused by frequent, low-latency instructions.zGTry to decrease the utilization of the busiest pipeline (currently: {}))r    get_contextrange_by_idxaction_by_idxfrontendr   parserequested_metricsr*   send_dict_to_childrenr7   rX   rI   r5   messager!   MsgType_MSG_OPTIMIZATIONr'   speedupfocus_metricr-   Severity_SEVERITY_HIGHrQ   r:   r   MsgType_MSG_OKSeverity_SEVERITY_DEFAULT) handlectxactionfer/   r2   ri   rj   rk   rl   rm   ac_pipelinesinst_pipelineslow_utilization_thresholdhigh_utilization_threshold bottleneck_utilization_thresholdmax_pipe_acr#   doc_msginst_section_msg	stall_msgr   r   msg_idr%   speedup_value	pipe_infomessage_namemax_pipe_instmax_utilization_instpipe_info_instutilization_diffs                                    r   applyr      s   


f
%Ca ..q1F	B$V4::;LMG			"	"	$r	)GJ,?,E,E,G	GB)01n)o)u)u)w)01o)p)v)v)x  %D!,S),S),S),S)	Rx(J%0Z-0Z-0Z-0Z- 	D$8TV}~D$8T  Ws  	tD$8U  XC  	DxD$8WD$8UVS$8deS$8deS38deDB8]^bB8]^bB8_`bB8_`	
	 	S38S  Vf  	g(D$8U  XSS$8deS$8deS38deDB8]^bB8]^bB8_`bB8_`
	
 	b$8T  WL  	M'S38V  Y_  	`9LF 	D$8TUD$8TV}~D$8TUD$8T  Ws  	tDB8U  XC  	Db$8^  aL  	MD$8U  XC  	Db$8]_yzb$8]  `T  	UD$8TV}~S38S  Vf  	gS$8d  gY  	ZS$8d  gS  	TS38d  gY  	ZbB8_  bT  	UbB8_  bT  	U&bB8_  bI  	JDB8]  `R  	SbB8]  `L  	MD$8TV{|b$8T  WL  	M'D$8U  Xc  	db$8XYS$8WY}~D$8ST5N< !#!#')$ )9(O%[$ a V	>".">288:Lc! H	  99 UG  \  \GZZ 1 1 J JGUfgF*?@R*S'L-JJv|];OO"""!!88RYYZeZjZjk [  b  b  cn  cs  cs  uG  HG#33G<I$6I-33!$>>)RR

7,,;;WlS%(HH#5LuuG#:LddG 9I8a5 4 ,~   F   F  G[  ]j  ]o  ]o   p  pG%2%B%B7%KN%16N#:S#@@ (<>P'P$'#-  $M  M)C/  $V  V7%55	AAG$5$5$N$NPWYef&&&%%??]ddepeueuvA r   )r    RequestedMetricsr   r   r   OPTIONALr   r   r   r   r   r   r'   r5   r7   rI   rX   r   r   r   r   <module>r      s%  2  N N4>
K4>
K4
 OQUWaWjWjlprwx4 OQUWaWjWjlprwx4 PRVXbXkXkmqsxy4 RTXZdZmZmosuz{4 NPTV`ViVikoqvw4 RTXZdZmZmosuz{4 UW[]g]p]prvx}~4 _aegqgzgz  }A  CH  I4 _aegqgzgz  }A  CH  I4 _aegqgzgz  }A  CH  I4 Z\`blbubuw{  ~C  D4  Z\`blbubuw{  ~C  D!4" Z\`blbubuw{  ~C  D#4$ ]_ceoexexz~  AF  G%4& ]_ceoexexz~  AF  G'4( OQUWaWjWjlprwx)4* QSWYcYlYlnrtyz+40 OQUWaWjWjlprwx142 OQUWaWjWjlprwx344 OQUWaWjWjlprwx546 OQUWaWjWjlprwx748 Y[_akatatvz  }B  C94: PRVXbXkXkmqsxy;4< PRVXbXkXkmqsxy=4> XZ^`j`s`suy  |A  B?4@ XZ^`j`s`suy  |A  BA4B OQUWaWjWjlprwxC4D NPTV`ViVikoqvwE4F _aegqgzgz  }A  CH  IG4H _aegqgzgz  }A  CH  II4J _aegqgzgz  }A  CH  IK4L Z\`blbubuw{  ~C  DM4N Z\`blbubuw{  ~C  DO4P Z\`blbubuw{  ~C  DQ4R Z\`blbubuw{  ~C  DS4T ]_ceoexexz~  AF  GU4V ]_ceoexexz~  AF  GW4X OQUWaWjWjlprwxY4Z OQUWaWjWjlprwx[4\ PRVXbXkXkmqsxy]4^ SUY[e[n[nptv{|_4` RTXZdZmZmosuz{a4b NPTV`ViVikoqvwc4f ;^ZM`M`bfhmng4 n!#7%-'*	  	  & :jr   