
    ggT                        S SK r S SKrS SKJrJrJr  / \" SS5      P\" SS5      P\" SS\R                  S5      P\" S	S
\R                  S5      P\" SS\R                  S5      P\" SS\R                  S5      P\" SS\R                  SS5      P\" SS\R                  SS5      P\" SS\R                  S5      P\" SS\R                  SS5      P\" SS\R                  S5      P\" SS\R                  S5      P\" SS\R                  SS5      P\" SS\R                  S5      P\" S S!\R                  S5      P\" S"S#\R                  S5      P\" S$S%\R                  S5      P\" S&S'\R                  S5      P\" S(S)\R                  S5      P\" S*S+\R                  S5      P\" S,S-\R                  S5      P\" S.S/\R                  S5      P\" S0S1\R                  S5      P\" S2S3\R                  S5      P\" S4S5\R                  S5      P\" S6S7\R                  SS5      P\" S8S9\R                  S5      P\" S:S;\R                  SS5      P\" S<S=\R                  S5      P\" S>S?\R                  S5      P\" S@SA\R                  SS5      P\" SBSC\R                  S5      P\" SDSE\R                  S5      P\" SFSG\R                  S5      P\" SHSI\R                  S5      P\" SJSK\R                  S5      P\" SLSM\R                  S5      P\" SNSO\R                  S5      P\" SPSQ\R                  SS5      PrSR rSS r	ST r
SU rSV rSW rSX rg)Y    N)
ImportanceMetricRequestRequestedMetricsParserz'smsp__issue_active.avg.per_cycle_activeissue_activez0smsp__average_warp_latency_per_inst_issued.ratiowarp_cycles_per_issuez@smsp__average_warps_issue_stalled_barrier_per_issue_active.ratiosmsp_average_barrierzIsmsp__average_warps_issue_stalled_branch_resolving_per_issue_active.ratiosmsp_average_branch_resolvingzGsmsp__average_warps_issue_stalled_dispatch_stall_per_issue_active.ratiosmsp_average_dispatch_stallz>smsp__average_warps_issue_stalled_drain_per_issue_active.ratiosmsp_average_drainz=smsp__average_warps_issue_stalled_gmma_per_issue_active.ratiosmsp_average_warpgroup_arriveFzAsmsp__average_warps_issue_stalled_imc_miss_per_issue_active.ratiosmsp_average_imc_misszDsmsp__average_warps_issue_stalled_lg_throttle_per_issue_active.ratiosmsp_average_lg_throttlezHsmsp__average_warps_issue_stalled_long_scoreboard_per_issue_active.ratiosmsp_average_long_scoreboardzKsmsp__average_warps_issue_stalled_math_pipe_throttle_per_issue_active.ratiosmsp_average_math_pipe_throttlez?smsp__average_warps_issue_stalled_membar_per_issue_active.ratiosmsp_average_membarzEsmsp__average_warps_issue_stalled_mio_throttle_per_issue_active.ratiosmsp_average_mio_throttlez=smsp__average_warps_issue_stalled_misc_per_issue_active.ratiosmsp_average_misczGsmsp__average_warps_issue_stalled_no_instruction_per_issue_active.ratiosmsp_average_no_instructionzEsmsp__average_warps_issue_stalled_not_selected_per_issue_active.ratiosmsp_average_not_selectedzIsmsp__average_warps_issue_stalled_short_scoreboard_per_issue_active.ratiosmsp_average_short_scoreboardzAsmsp__average_warps_issue_stalled_sleeping_per_issue_active.ratiosmsp_average_sleepingzEsmsp__average_warps_issue_stalled_tex_throttle_per_issue_active.ratiosmsp_average_tex_throttlez=smsp__average_warps_issue_stalled_wait_per_issue_active.ratiosmsp_average_waitsmsp__pcsamp_sample_countpc_sampling_count(smsp__pcsamp_warps_issue_stalled_barrierpc_sampling_barrier1smsp__pcsamp_warps_issue_stalled_branch_resolvingpc_sampling_branch_resolving/smsp__pcsamp_warps_issue_stalled_dispatch_stallpc_sampling_dispatch_stall&smsp__pcsamp_warps_issue_stalled_drainpc_sampling_drain)smsp__pcsamp_warps_issue_stalled_imc_misspc_sampling_imc_miss,smsp__pcsamp_warps_issue_stalled_lg_throttlepc_sampling_lg_throttle0smsp__pcsamp_warps_issue_stalled_long_scoreboardpc_sampling_long_scoreboard3smsp__pcsamp_warps_issue_stalled_math_pipe_throttlepc_sampling_math_pipe_throttle'smsp__pcsamp_warps_issue_stalled_membarpc_sampling_membar-smsp__pcsamp_warps_issue_stalled_mio_throttlepc_sampling_mio_throttle%smsp__pcsamp_warps_issue_stalled_miscpc_sampling_misc0smsp__pcsamp_warps_issue_stalled_no_instructionspc_sampling_no_instruction-smsp__pcsamp_warps_issue_stalled_not_selectedpc_sampling_not_selected1smsp__pcsamp_warps_issue_stalled_short_scoreboardpc_sampling_short_scoreboard)smsp__pcsamp_warps_issue_stalled_sleepingpc_sampling_sleeping-smsp__pcsamp_warps_issue_stalled_tex_throttlepc_sampling_tex_throttle%smsp__pcsamp_warps_issue_stalled_waitpc_sampling_wait1smsp__pcsamp_warps_issue_stalled_warpgroup_arrivepc_sampling_warpgroup_arrivec                      g)NCPIStall rB       ,nsight-compute-2025.1.1/sections/CPIStall.pyget_identifierrE   M   s    rC   c                      g)Nz
Warp StallrB   rB   rC   rD   get_namerG   P   s    rC   c                      g)NzWarp stall analysisrB   rB   rC   rD   get_descriptionrI   S   s     rC   c                      g)NWarpStateStatsrB   rB   rC   rD   get_section_identifierrL   V   s    rC   c                      S/$ )NIssueSlotUtilizationrB   rB   rC   rD   get_parent_rules_identifiersrO   Y   s    "##rC   c                     X-  nSnX@;   a0  [         R                  R                  n[        X   U5      nUS-  nXW4$ [         R                  R                  nUS-  nXW4$ )N"issue_slot_util_speedup_normalizedd   )NvRules	IFrontendSpeedupType_GLOBALminSpeedupType_LOCAL)parent_weightswarp_cycles_per_stallr   improvement_localparent_speedup_namespeedup_typeimprovement_globalimprovement_percents           rD   get_estimated_speedupr_   \   sv    -E>,((;; !DFWX036
 ,, ((::/#5,,rC   c                 
   [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       n[        X5      R                  [        5      nUR                  S5      n0 SS_SS_SS_S	S
_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_S#S$_S%S&0EnUS'   R                  5       nUS(   R                  5       nS)n	S*n
US+   nU GH(  nS,U 3nXM   c  M  XM   nUR                  5       nUR                  5       UR                  5       :X  d  MH  [        UR                  5       5       GH  nUR                  U5      S:g  =(       a&    UR                  U5      UR                  U5      -  U
:  nUR                  5       S:g  =(       a%    UR                  U5      UR                  5       -  U	:  nU(       d  M  U(       d  M  UR                  U5      nUR                  S-R                  UR                  U5      UR                  5       -  S.-  UR                  U5      UR                  U5      -  S.-  U5      U[         R                   R"                  [         R                   R$                  5        UR'                  U5      nUS :w  d  GM]  UR)                  5       nUR+                  5       nUR                  S/U[         R                   R,                  U[         R                   R$                  5        GM     GM+     / nU GHV  nS0U 3nXM   c  M  XM   R                  5       nS1nS*nUU:  d  M.  US:  d  M6  UUU-  :  d  MA  S2U-  U-  nXl   n[.        R0                  " S3S4US   5      nUS5   nS6R                  UU5      nUS7R                  UU5      -  nU(       a  US8U-   -  n[3        UUU5      u  n n!US'   R5                  5       U[         R                   R6                  S94UUUS::  a  [         R                   R8                  O[         R                   R:                  S;R                  UR=                  S<S85      5      4/n"UR?                  UUUU"U U!45        GMY     [A        US= S>S?9n#U# H  n$U$S   R=                  S<S85      RC                  5       S@-   n%URE                  [         R                   RF                  U$SA   U%5      n&U$SB   U$SC   n!n URI                  U&U U!5        U$SD    H$  n'URK                  U&U'S   U'S5   U'SA   U'SD   5        M&     M     [M        U#5      S:  a+  URE                  [         R                   RN                  SE5        g g )FNr   rN   barrier)a  Warp was stalled waiting for sibling warps at a CTA barrier. A high number of warps waiting at a barrier is commonly caused by diverging code paths before a barrier. This causes some warps to wait a long time until other warps reach the synchronization point. Whenever possible, try to divide up the work into blocks of uniform workloads. If the block size is 512 threads or greater, consider splitting it into smaller groups. This can increase eligible warps without affecting occupancy, unless shared memory becomes a new occupancy limiter. Also, try to identify which barrier instruction causes the most stalls, and optimize the code executed before that synchronization point first.Nbranch_resolving)aF  Warp was stalled waiting for a branch target to be computed, and the warp program counter to be updated. To reduce the number of stalled cycles, consider using fewer jump/branch operations and reduce control flow divergence, e.g. by reducing or coalescing conditionals in your code. See also the related No Instructions state.Ndispatch_stall)zWarp was stalled waiting on a dispatch stall. A warp stalled during dispatch has an instruction ready to issue, but the dispatcher holds back issuing the warp due to other conflicts or events.Ndrain)a  Warp was stalled after EXIT waiting for all outstanding memory operations to complete so that warp's resources can be freed. A high number of stalls due to draining warps typically occurs when a lot of data is written to memory towards the end of a kernel. Make sure the memory access patterns of these store operations are optimal for the target architecture and consider parallelized data reduction, if applicable.Nimc_miss)a  Warp was stalled waiting for an immediate constant cache (IMC) miss. A read from constant memory costs one memory read from device memory only on a cache miss; otherwise, it just costs one read from the constant cache. Immediate constants are encoded into the SASS instruction as 'c[bank][offset]'. Accesses to different addresses by threads within a warp are serialized, thus the cost scales linearly with the number of unique addresses read by all threads within a warp. As such, the constant cache is best when threads in the same warp access only a few distinct locations. If all threads of a warp access the same location, then constant memory can be as fast as a register access.Nlg_throttle)a`  Warp was stalled waiting for the L1 instruction queue for local and global (LG) memory operations to be not full. Typically, this stall occurs only when executing local or global memory instructions extremely frequently. Avoid redundant global memory accesses. Try to avoid using thread-local memory by checking if dynamically indexed arrays are declared in local scope, of if the kernel has excessive register pressure causing by spills. If applicable, consider combining multiple lower-width memory operations into fewer wider memory operations and try interleaving memory operations and math instructions.Nlong_scoreboard)a  Warp was stalled waiting for a scoreboard dependency on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to shared memory.Nmath_pipe_throttle)a[  Warp was stalled waiting for the execution pipe to be available. This stall occurs when all active warps execute their next instruction on a specific, oversubscribed math pipeline. Try to increase the number of active warps to hide the existent latency or try changing the instruction mix to utilize all available pipelines in a more balanced way.Nmembar)zWarp was stalled waiting on a memory barrier. Avoid executing any unnecessary memory barriers and assure that any outstanding memory operations are fully optimized for the target architecture.Nmio_throttle)aw  Warp was stalled waiting for the MIO (memory input/output) instruction queue to be not full. This stall reason is high in cases of extreme utilization of the MIO pipelines, which include special math instructions, dynamic branches, as well as shared memory instructions. When caused by shared memory accesses, trying to use fewer but wider loads can reduce pipeline pressure.Nmisc)z5Warp was stalled for a miscellaneous hardware reason.Nno_instruction)a  Warp was stalled waiting to be selected to fetch an instruction or waiting on an instruction cache miss. A high number of warps not having an instruction fetched is typical for very short kernels with less than one full wave of work in the grid. Excessively jumping across large blocks of assembly code can also lead to more warps stalled for this reason, if this causes misses in the instruction cache. See also the related Branch Resolving state.Nnot_selected)a  Warp was stalled waiting for the micro scheduler to select the warp to issue. Not selected warps are eligible warps that were not picked by the scheduler to issue that cycle as another warp was selected. A high number of not selected warps typically means you have sufficient warps to cover warp latencies and you may consider reducing the number of active warps to possibly increase cache coherence and data locality.Nshort_scoreboard)ao  Warp was stalled waiting for a scoreboard dependency on a MIO (memory input/output) operation (not to L1TEX). The primary reason for a high number of stalls due to short scoreboards is typically memory operations to shared memory. Other reasons include frequent execution of special math instructions (e.g. MUFU) or dynamic branching (e.g. BRX, JMX). Consult the Memory Workload Analysis section to verify if there are shared memory operations and reduce bank conflicts, if reported. Assigning frequently accessed values to variables can assist the compiler in using low-latency registers instead of direct memory accesses.Nsleeping)a  Warp was stalled due to all threads in the warp being in the blocked, yielded, or sleep state. Reduce the number of executed NANOSLEEP instructions, lower the specified time delay, and attempt to group threads in a way that multiple threads in a warp sleep at the same time.Ntex_throttle)a`  Warp was stalled waiting for the L1 instruction queue for texture operations to be not full. This stall reason is high in cases of extreme utilization of the L1TEX pipeline. Try issuing fewer texture fetches, surface loads, surface stores, or decoupled math operations. If applicable, consider combining multiple lower-width memory operations into fewer wider memory operations and try interleaving memory operations and math instructions. Consider converting texture lookups or surface loads into global memory lookups. Texture can accept four threads' requests per cycle, whereas global accepts 32 threads.Nwait)a  Warp was stalled waiting on a fixed latency execution dependency. Typically, this stall reason should be very low and only shows up as a top contributor in already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to lower-latency instructions, e.g. by making use of fast math compiler options.Nwarpgroup_arrive)zNWarp was stalled waiting on a WARPGROUP.ARRIVES or WARPGROUP.WAIT instruction.Nr   r   g?g333333?r   pc_sampling_zlThis line is responsible for {:.1f}% of all warp stalls. {:.1f}% of the stalls for this line are of type {}.rR   zaThis line is responsible for a high number of warp stalls. See markers on SASS lines for details.smsp_average_g?g      Y@z^Warp was stalled     zLOn average, each warp of this workload spends {:.1f} cycles being stalled {}zq This stall type represents about {:.1f}% of the total average of {:.1f} cycles between issuing two instructions. z<Increase the average number of instructions issued per cycle
   z0Decrease the number of cycles spent in {} stalls_c                     U S   $ )Nrv   rB   )stalls    rD   <lambda>apply.<locals>.<lambda>   s    eAhrC   T)keyreversez Stalls            a-  Check the @section:SourceCounters:Warp Stall Sampling (All Samples)@ table for the top stall locations in your source based on sampling data. The @url:Kernel Profiling Guide:https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference@ provides more details on each stall reason.)(rS   get_contextrange_by_idxaction_by_idxfrontendr   parserequested_metricsreceive_dict_from_parentvaluecorrelation_idsnum_instancesrange	as_uint64source_markerformatrT   MarkerKind_SASSMsgType_MSG_WARNINGsource_infoline	file_nameMarkerKind_SOURCEresubr_   nameSeverity_SEVERITY_HIGHSeverity_SEVERITY_DEFAULTSeverity_SEVERITY_LOWreplaceappendsortedtitlemessageMsgType_MSG_OPTIMIZATIONspeedupfocus_metriclenMsgType_MSG_OK)(handlectxactionfemetricsrX   stall_typesr   r   total_sample_count_ratiohigh_stall_ratiosample_count_metric
stall_namestall_metric_namestall_metricr   i!percent_of_local_stalls_condition!percent_of_total_stalls_conditionaddressr   line_numberr   reported_stallsrY   issue_active_thresholdratio_thresholdwarp_cycles_avg
stall_infostall_descriptionstall_extrar   r\   speedup_valuefocus_metricssorted_stallsr{   message_namemsg_idfms(                                           rD   applyr   j   s   


f
%Ca ..q1F	B$V4::;LMG001GHN7 7 	 	7 	 7 	 7 	 7  	 !7& 	 '7, 	  -72 	 378 	 97> 	 ?7D 	 E7J 	 K7P 	 Q7V 	 W7\ 	 ]7b 	 c7h 	 i7Kr >*002L#$;<BBD"!"56!
*:,7%1"5L*::<O))+/B/P/P/RR|99;<A8K8U8UVW8X\]8]  9rbnbxbxyzb{  R  \  \  ]^  _  c_  br  cr58K8U8U8W[\8\  9ata~a~  @A  bB  EX  Eb  Eb  Ed  bd  g  b588=^=^"1";";A">((  *X  *_  *_  `s  `}  `}  ~  `@  CV  C`  C`  Cb  `b  eh  `h  jv  j@  j@  AB  jC  FY  Fc  Fc  de  Ff  jf  il  jl  nx  *y  {B  DK  DU  DU  De  De  gn  gx  gx  gL  gL  M&,&8&8&A&$.*5*:*:*<K(3(=(=(?I,,  .Q  S^  `g  `q  `q  `C  `C  EN  PW  Pa  Pa  Pu  Pu  v = "* O!
+J<8%- ' : @ @ B!$005JQ5NSbf{  T  gT  TU"%::=RRO$0J "';RA O$Q-Kdkk  mB  DU  VG  K  R  R  Sb  dy  z  zG3,,*?Peg|*}'L-(--/w?P?P?g?g  jh  i"$9j  CE  kE7;L;L;f;f  KR  K\  K\  Kr  Kr  tf  tm  tm  nx  n@  n@  AD  FI  nJ  tK  LMM ""J0EwP]_kmz#{|/ "2 ?0FPTUMQx''S1779IEG--FFaR^_&+Aham


6<7(BOOFBqE2a5"Q%A?   =A


7$$33o	p rC   )r   rS   RequestedMetricsr   r   r   OPTIONALr   rE   rG   rI   rL   rO   r_   r   rB   rC   rD   <module>r      s^  2 
  N N,;^L,DF]^,
 TVlnx  oB  oB  DH  I, ]_~  AK  AT  AT  VZ  [, []z  }G  }P  }P  RV  W, RThjtj}j}  @D  E, QSrt~  uH  uH  JN  PU  V, UWnpz  qD  qD  FJ  LQ  R, XZt  wA  wJ  wJ  LP  Q, \^|  I  R  R  TX  Z_  `, _  bC  EO  EX  EX  Z^  _, SUjlvll  BF  G, Y[v  yC  yL  yL  NR  TY  Z,  QSfhrh{h{  ~B  C!," []z  }G  }P  }P  RV  W#,$ Y[v  yC  yL  yL  NR  S%,& ]_~  AK  AT  AT  VZ  [',( UWnpz  qD  qD  FJ  K),* Y[v  yC  yL  yL  NR  S+,, QSfhrh{h{  ~B  C-,2 -/BJDWDWY]^3,4 <>SU_UhUhjno5,6 EGegqgzgz  }A  B7,8 CEacmcvcvx|}9,: :<OQ[QdQdfjk;,< =?UWaWjWjlprwx=,> @B[]g]p]prvw?,@ DFceoexexz~  AF  GA,B GIikuk~k~  AE  FC,D ;=QS]SfSfhlmE,F AC]_i_r_rtxz  AG,H 9;MzObObdhiI,J DFbdndwdwy}~K,L AC]_i_r_rtxyM,N EGegqgzgz  }A  BO,P =?UWaWjWjlpqQ,R AC]_i_r_rtxyS,T 9;MzObObdhiU,V EGegqgzgz  }A  CH  IW, ^!$-CprC   