
    ggC                        S SK Jr  S SKJr  S SKrS SKJrJrJr  \" SS\R                  SS5      \" SS	\R                  SS5      \" S
S\R                  SS5      \" SS\R                  SS5      \" SS\R                  SS5      \" SS\R                  SS5      \" SS\R                  SS5      \" SS\R                  SS5      \" SS5      \" SS5      \" SS5      \" SS5      \" SS\R                  SS5      \" S S!\R                  SS5      \" S"S#5      \" S$S%\R                  SS5      \" S&S'\R                  SS5      /r
\" S(/ S)Q5      r\" S*/ S+QS,S,\R                  R                  S / 4S-9rS. rS/ rS0 rS1 rS2 rS3 rS4 rS5 rS6 rg)7    )
namedtuple)productN)
ImportanceMetricRequestRequestedMetricsParserz?smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.ratiobytes_per_sector_global_loadFzBsmsp__sass_average_data_bytes_per_sector_mem_global_op_ld.max_rate max_bytes_per_sector_global_loadz?smsp__sass_average_data_bytes_per_sector_mem_global_op_st.ratiobytes_per_sector_global_storezBsmsp__sass_average_data_bytes_per_sector_mem_global_op_st.max_rate!max_bytes_per_sector_global_storez>smsp__sass_average_data_bytes_per_sector_mem_local_op_ld.ratiobytes_per_sector_local_loadzAsmsp__sass_average_data_bytes_per_sector_mem_local_op_ld.max_ratemax_bytes_per_sector_local_loadz>smsp__sass_average_data_bytes_per_sector_mem_local_op_st.ratiobytes_per_sector_local_storezAsmsp__sass_average_data_bytes_per_sector_mem_local_op_st.max_rate max_bytes_per_sector_local_storez6l1tex__t_sector_pipe_lsu_mem_global_op_ld_hit_rate.pct"l1tex_global_load_hit_rate_percentz6l1tex__t_sector_pipe_lsu_mem_global_op_st_hit_rate.pct#l1tex_global_store_hit_rate_percentz5l1tex__t_sector_pipe_lsu_mem_local_op_ld_hit_rate.pct!l1tex_local_load_hit_rate_percentz5l1tex__t_sector_pipe_lsu_mem_local_op_st_hit_rate.pct"l1tex_local_store_hit_rate_percentz"lts__t_sector_op_read_hit_rate.pctl2_load_hit_rate_percentz#lts__t_sector_op_write_hit_rate.pctl2_store_hit_rate_percentz3l1tex__throughput.avg.pct_of_peak_sustained_elapsedl1tex_throughput_percentz1lts__throughput.avg.pct_of_peak_sustained_elapsedl2_throughput_percentz6gpu__dram_throughput.avg.pct_of_peak_sustained_elapseddram_throughput_percentFocusMetric)namevalueseverityadvice
RuleResult)messagetitlespeedup_typespeedup_valuefocus_metrics )defaultsc                      g)NMemoryCacheAccessPattern r(       <nsight-compute-2025.1.1/sections/MemoryCacheAccessPattern.pyget_identifierr+   i   s    %r)   c                      g)NzMemory Cache Access Patternr(   r(   r)   r*   get_namer-   m   s    (r)   c                      g)NzLDetection of inefficient memory access patterns in the L1/TEX and L2 caches.r(   r(   r)   r*   get_descriptionr/   q   s    Yr)   c                      g)NMemoryWorkloadAnalysis_Tablesr(   r(   r)   r*   get_section_identifierr2   u   s    *r)   c                      S/$ )NMemoryr(   r(   r)   r*   get_parent_rules_identifiersr5   y   s
    :r)   c                 *    U  SU 3nSU 3nSU 3nX44$ )N_bytes_per_sector_max_bytes_per_sector_r(   )memory_space	operationmetric_extensionbytes_per_sector_namemax_bytes_per_sector_names        r*   !get_bytes_per_sector_metric_namesr?   }   s:    &q4/0@/AB"78H7I J ;;r)   c                 j    [        X5      u  p4X   R                  5       nX   R                  5       nXV4$ )N)r?   r   )metricsr:   r;   r=   r>   bytes_per_sectormax_bytes_per_sectors          r*   get_bytes_per_sector_metricsrD      s?    ),B 55;;="=CCE11r)   c                    [        X1U5      u  pEUS:X  d  US:X  a  [        R                  R                  S4$ SnSnSnSn	U S;   a  US:X  a3  US:X  a-  SUS   R	                  5       S-  -
  nUS   R                  5       nOUS:X  a3  US	:X  a-  SUS
   R	                  5       S-  -
  nUS
   R                  5       nOqUS:X  a3  US:X  a-  SUS   R	                  5       S-  -
  nUS   R                  5       nO8US:X  a2  US	:X  a,  SUS   R	                  5       S-  -
  nUS   R                  5       nU S:X  ae  US:X  a-  SUS   R	                  5       S-  -
  nUS   R                  5       n	O2US	:X  a,  SUS   R	                  5       S-  -
  nUS   R                  5       n	Sn
SnU S:X  a*  US   R	                  5       S-  n
US   R                  5       nOeU S:X  a*  US   R	                  5       S-  n
US   R                  5       nO5U S:X  a/  US   b)  US   R	                  5       S-  n
US   R                  5       nU
(       a  [        R                  R                  nO[        R                  R                  nSXE-  -
  U(       a  UOS-  U(       a  UOS-  U
(       a  U
OS-  S-  n/ n[        U[        X5      S      R                  5       U[        R                  R                  SUS S35      nUR                  U5        U(       aG  [        UX8   R	                  5       [        R                  R                  S5      nUR                  U5        U	(       aG  [        U	X9   R	                  5       [        R                  R                  S5      nUR                  U5        U(       aY  [        UX;   R	                  5       [        R                  R                  SU R                  5        S35      nUR                  U5        XU4$ )a  Speedup Estimation and Focus Metrics for memory access patterns in all caches.

Assuming that at each cache level the bandwidth is independent of the amount
of data moved, the speedup can be estimated as follows:

  s = time_old / time_new
    = (data_old * bandwidth_new) / (data_new * bandwidth_old)
    = (data_old / data_new)
    = (max_bytes_per_sector_old * num_sectors_old)
          / (max_bytes_per_sector_new * num_sectors_new)
    = (num_sectors_new * bytes_per_sector_new / bytes_per_sector_old)
          / num_sectors_new
    = bytes_per_sector_new / bytes_per_sector_old

where we used that the "useful" amount of data moved, remains constant, i.e.
bytes_per_sector_old * num_sectors_old = bytes_per_sector_new * num_sectors_new.
Thus, the maximal speedup is s_max = max_bytes_per_sector / bytes_per_sector.
Using that the maximal improvement = 1 - (1 / s_max), we get

  improvement_percent = (1 - bytes_per_sector / max_bytes_per_sector) * 100

At each cache level, the relevant amount of data is given by the sectors missed
at its respective lower-level cache, introducing new factors of `cache_miss_rate`.
To get a global estimate, we can use the cache's throughput as a weight.
r   N)l2dramglobalload   r   d   storer   localr   r   rG   r   r   l1texr   rF   r   r   zAIncrease the average number of bytes utilized per sector towards .0fz byteszJTry to increase the hit rate in L1TEX to benefit from its higher bandwidthzGTry to increase the hit rate in L2 to benefit from its higher bandwidthzThe higher the z- throughput the more severe the issue becomes)rD   NvRules	IFrontendSpeedupType_LOCALr   r   SpeedupType_GLOBALr   r?   Severity_SEVERITY_HIGHappendSeverity_SEVERITY_DEFAULTSeverity_SEVERITY_LOWupper)cacher:   r;   rA   rB   rC   l1_miss_ratel2_miss_ratel1_hit_rate_namel2_hit_rate_name
throughputthroughput_namer!   improvement_percentr#   bytes_per_sector_focus_metricl1_hit_rate_focus_metricl2_hit_rate_focus_metricthroughput_focus_metrics                      r*   get_speedup_and_focus_metricsre      sO   6 	%WIF + 1 4 9  22A55 LL8#	V(;w'KLRRTWZZZL&'KLQQSX%)w*>w'LMSSUX[[[L&'LMRRTW$f)<w'JKQQSVYYYL&'JKPPRW$g)=w'KLRRTWZZZL&'KLQQSw'ABHHJSPPL&'ABGGI'!w'BCIIKcQQL&'BCHHJ JO78>>@3F
!"<=BBD	$45;;=C
!"9:??A
6/(A B N67==?#E
!";<AAC((;;((:: 
4	4'<Q	0'<Q	0 $:	, 		  M$/1,J1MNSSU00
K
$F	,	%! 67#.%++-77X	$
  	56#.%++-77U	$
  	56"-$**,33ekkm_ - 	#
 	45m;;r)   c                    [         R                  " U 5      nUR                  S5      R                  S5      nUR	                  5       nUR                  5       [         R                  R                  :w  a  g [        X5      R                  [        5      nUR                  5        H  u  pVSU;   d  M  Ub  M    g    / SQnSS/nSS/n	Sn
[        5       [        5       S.[        5       [        5       S.S	.n[        XxU	5       GH  u  pnUS
:X  a	  US   c  M  OUS:X  a  US   b  US   c  M*  [        XMU5      u  nnSUs=:  a  U:  d  MH  O  ML  [        XX5      u  nnnUX   U   R                   ::  a  Ms  UR#                  5        SUR%                  5        SUR%                  5        S3nSnUS
:X  a  US:X  a  US:X  a  SUS   R'                  5       -
  nOhUS:X  a  US:X  a  SUS   R'                  5       -
  nOEUS:X  a  US:X  a  SUS   R'                  5       -
  nO"US:X  a  US:X  a  SUS   R'                  5       -
  nSWS S3nOFUS:X  a@  US:X  a  SUS   R'                  5       -
  nOUS:X  a  SUS   R'                  5       -
  nSWS S3nSU SU SUS:X  a  SOS SUR#                  5        SUS SUS  S!3U-   S"U SU S#3-   n[        UUUUU5      X   U'   GM     [        X5       GH  u  pX   U   R(                  (       d  M  X   U   R                   U
:  d  M5  UR)                  [         R*                  R,                  X   U   R(                  X   U   R$                  5      nUR/                  UX   U   R0                  X   U   R                   5        X   U   R2                   H@  nUR5                  UUR6                  UR&                  UR8                  UR:                  5        MB     GM     g )$Nr   sass)rN   rF   rG   rH   rM   rI   rL   )rI   rL   )rH   rM   rF   r   rG   r    z Access Patternr$   rK   r   r   r   r   zThis applies to the z.1fz% of sectors missed in L1TEX. z% of sectors missed in L2. zThe memory access pattern for zs fromtoz( might not be optimal. On average, only z of the rO   z; bytes transmitted per sector are utilized by each thread. zThis could possibly be caused by a stride between threads. Check the @section:SourceCounters:Source Counters@ section for uncoalesced zs.)rP   get_contextrange_by_idxaction_by_idxfrontendworkload_typeIActionWorkloadType_KERNELr   parserequested_metricsitemsr   r   rD   re   r"   rX   r    r   r   rQ   MsgType_MSG_OPTIMIZATIONspeedupr!   r#   focus_metricr   r   r   )handlectxactionferA   r   metriccache_levelsmemory_spaces
operationsthreshold_speedup_percentrule_resultsrY   spacer;   rB   rC   r!   r"   r#   
rule_titlecache_level_messagerZ   r[   rule_message
message_ids                             r*   applyr   	  sw   


f
%Ca ..q1F	B!D!DD$V4::;LMGT>fn 	 (L 	M
 	J !" L\

 L\
	L $+<
#SiD=12: ;f_12:gFa>b>j )C 	/. 6"666-eIO 7L-  3I > L LL!KKM?!EKKM?!IOO<M;No^J"$}H$f)<#&1U)V)\)\)^#^Lh&9+?#&1V)W)])])_#_Lg%)v*=#&1T)U)[)[)]#]Lg%)w*>#&1U)V)\)\)^#^L*<*<<Z[ $ &&#&1K)L)R)R)T#TL')#&1L)M)S)S)U#UL*<*<<WX $ 1q2&&06d;1U[[]O L$$4S#9', -#	$ &&$gQyk5	5  .8.L	*m $T~ $M>	*222#I.<<@YY!!::#I.66#I.44J
 JJ#I.;;#I.<<
 '-i8FFKKLLOOMM G ?r)   )collectionsr   	itertoolsr   rP   RequestedMetricsr   r   r   OPTIONALrs   r   rQ   rR   r   r+   r-   r/   r2   r5   r?   rD   re   r   r(   r)   r*   <module>r      s  2 #   N N SUsu  vI  vI  KO  QV  WVXz  }G  }P  }P  RV  X]  ^SUt  wA  wJ  wJ  LP  RW  XVX{  ~H  ~Q  ~Q  SW  Y^  _RTqs}  tG  tG  IM  OT  UUWx  {E  {N  {N  PT  V[  \RTrt~  uH  uH  JN  PU  VUWy  |F  |O  |O  QU  W\  ]@, @- ?+ ?, ," -# =" ; @!k< | +
 J"g''991bA
&)Z+<2z<zAr)   