
    3j                     n	   % S SK Jr  S SKrS SKrS SKJrJrJr  SSKJ	r	  S SK
JrJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  S SKJr  S SKJr  S SKrSS/r\" S5      r\" S5      r\R<                  " \5      r  S SK!J"r#  \RN                  RP                  r(S r)0 r*\+\\4   \,S'   S r-SDS\\\\4   /\\\4   4   4S jjr.\." \(R^                  5      SS.S\04S jj5       r1\." \(Rd                  5      SES\04S jj5       r3\." \(Rh                  5      SES\04S jj5       r5\." \(Rl                  5      SES\04S jj5       r7\." \(Rp                  5           SFS\04S  jj5       r9 SDS!\:\0   S"\:\0   S#\:\0   S$\;S\04
S% jjr<\." \(Rz                  \(R|                  \(R~                  \(R                  \(R                  /5      SS.S\04S& jj5       rB\." \(R                  5      S\04S' j5       rDS( rE\." \(R                  \(R                  \(R                  /5      SS.S\04S) jj5       rIS* rJSS+.S\\K\K\0S,4   \K\0S,4   \K\0S,4   \K\0S,4   S-  4      4S- jjrLSS+.S\\K\K\0S,4   \K\0S,4   \K\0S,4   \K\0S,4   S-  4      4S. jjrM\." \(R                  S/S09SS.S\04S1 jj5       rO\." \(R                  S/S09S\04S2 j5       rQS3 rR\." \(R                  \(R                  \(R                  /5      SS.S\04S4 jj5       rV\." \(R                  S/S09S\04S5 j5       rX\." \(R                  S/S09S\04S6 j5       rZSS7.S\04S8 jjr[SS7.S\04S9 jjr\SS7.S\04S: jjr]0 \(R^                  \1_\(Rd                  \3_\(Rh                  \5_\(Rl                  \7_\(Rp                  \9_\(Rz                  \B_\(R|                  \B_\(R~                  \B_\(R                  \B_\(R                  \B_\(R                  \D_\(R                  \I_\(R                  \I_\(R                  \I_\(R                  \V_\(R                  \V_\(R                  \V_\(R                  \O\(R                  \Q\(R                  \X\(R                  \Z0Er*S; r^/ S<Qr_S= r`S> raS\b4S? jrcS@ rd " SA S5      re " SB SC\5      rfg! \$ a+    \%" S S 5       5      (       a  \ RM                  S5        \r# GN0f = f)G    )NoneTypeN)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyTypeVar)Callable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_PJITFunctionc              #   \   #    U  H"  n[        [        R                  US 5      S Lv   M$     g 7fN)getattrtorchversion).0attrs     R/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/utils/flop_counter.py	<genexpr>r"      s$     
]F\d75==$-T9F\s   *,)cudahipxpuz@triton not found; flop counting will not work for triton kernelsc                 \    [        U [        R                  5      (       a  U R                  $ U $ r   )
isinstancer   Tensorshape)is    r!   	get_shaper+   #   s!    !U\\""wwH    flop_registryc                 8   ^  [        T 5      S S.U 4S jj5       nU$ )Nout_valc                 B   > [        [        XU 45      u  pnT" USU0UD6$ )N	out_shape)r   r+   )r0   argskwargsr2   fs       r!   nfshape_wrapper.<locals>.nf+   s.    "*9tW6M"Ni$6)6v66r,   r   r5   r6   s   ` r!   shape_wrapperr9   *   s#    
1X 7 7 Ir,   returnc                 h   ^ ^ S[         [        [        4   S[         [        [        4   4UU 4S jjnU$ )Nflop_formular:   c                    >^  T(       d  [        T 5      m SU 4S jjn[        R                  R                  R	                  UT5        T $ )Nc                    > [        U [        R                  R                  [        45      (       d  [        SU  S[        U 5       35      eU [        ;   a  [        SU  35      eT[        U '   g )Nz|register_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), or JitFunction, got z which is of type zduplicate registrations for )	r'   r   _opsOpOverloadPacket_JITFunction
ValueErrortyper-   RuntimeError)targetr<   s    r!   register=register_flop_formula.<locals>.register_fun.<locals>.register7   sp    v

(C(C\'RSS #H$6tF|nFG G &"%A&#JKK$0M&!r,   )r:   N)r9   r   utils_pytree	tree_map_)r<   rF   get_rawtargetss   ` r!   register_fun+register_flop_formula.<locals>.register_fun3   s7    (6L	1 	%%h8r,   )r   r   r   )rL   rK   rM   s   `` r!   r   r   1   s5    8BF#3 R8H  & r,   )r2   c                R    U u  pVUu  pxXg:w  a  [        SU SU 35      eXX-  S-  U-  $ )zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got  and    AssertionError)	a_shapeb_shaper2   r3   r4   mkk2ns	            r!   mm_floprZ   H   sE    
 DAEBwRSTRUUZ[]Z^_``519q=r,   c                     [        X5      $ )zCount flops for addmm.rZ   
self_shaperT   rU   r2   r4   s        r!   
addmm_flopr_   T   s     7$$r,   c                     U u  pEnUu  pxn	XG:w  a  [        SU SU 35      eXh:w  a  [        SU SU 35      eXE-  U	-  S-  U-  n
U
$ )z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got rP   z0bmm: inner dimensions must match (k == k2), got rQ   rR   )rT   rU   r2   r4   brV   rW   b2rX   rY   flops              r!   bmm_floprd   Y   ss    
 GA!IBAwOPQsRWXZW[\]]wOPQsRWXZW[\]]519q=1DKr,   c                     [        X5      $ )z&Count flops for the baddbmm operation.)rd   r]   s        r!   baddbmm_floprf   h   s    
 G%%r,   c	                     [        X5      $ )zCount flops for _scaled_mm.r\   )
rT   rU   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr2   r4   s
             r!   _scaled_mm_floprn   o   s     7$$r,   x_shapew_shaper2   
transposedc                 |    U S   nU(       a  U OUSS nUtpgn [        U5      [        U5      -  U-  U-  U-  S-  n	U	$ )a  Count flops for convolution.

Note only multiplication is
counted. Computation for bias are ignored.
Flops for a transposed convolution are calculated as
flops = (x_shape[2:] * prod(w_shape) * batch_size).
Args:
    x_shape (list(int)): The input shape before convolution.
    w_shape (list(int)): The filter shape.
    out_shape (list(int)): The output shape after convolution.
    transposed (bool): is the convolution transposed
Returns:
    int: the number of flops
r   rQ   Nr   )
ro   rp   r2   rq   
batch_size
conv_shapec_outc_infilter_sizerc   s
             r!   conv_flop_countrx      s[    ( J''Y;J 'E+ 
d;//*<uDtKaODKr,   c                    [        XXvS9$ )zCount flops for convolution.rq   )rx   )
ro   rp   _bias_stride_padding	_dilationrq   r2   r3   r4   s
             r!   	conv_flopr      s     7YNNr,   c                 0   S nSn U
S   (       a"  [        US   5      nU[        XX(       + 5      -  nU
S   (       aY  [        US   5      nU(       a#  U[        U" U 5      U" U5      U" U5      SS9-  nU$ U[        U" U5      U" U 5      U" U5      SS9-  nU$ )Nc                 4    U S   U S   /[        U SS  5      -   $ )Nr   r   rQ   )list)r)   s    r!   tconv_backward_flop.<locals>.t   s$    a%(#d59o55r,   r   r   Frz   )r+   rx   )grad_out_shapero   rp   r{   r|   r}   r~   rq   _output_padding_groupsoutput_maskr2   r   
flop_countgrad_input_shapegrad_weight_shapes                   r!   conv_backward_flopr      s    6JDL 1~$Yq\2on?OQ_``
1~%il3/!N*;QwZK\I]joppJ
  /!G*a6GK\I]joppJr,   c                 &   U u  p4pVUu  pxpUu  ppX7s=:X  a  U:X  a  O  OX:X  a
  Xj:X  a  X:X  d  [        SU  SU SU 35      eXH:  d  XH-  S:w  a  [        SU SU S35      eSnU[        X4-  XV4X4-  Xi45      -  nU[        X4-  XY4X4-  X45      -  nU$ )z
Count flops for self-attention.

Supports GQA (grouped-query attention) where key/value have fewer heads
than the query. The kernel broadcasts KV heads to match query heads.
z<sdpa_flop_count: query/key/value shapes are incompatible: q=z, k=z, v=r   zsdpa_flop_count: query heads ()) must be a multiple of key/value heads ()rS   rd   )query_shape	key_shapevalue_shapera   h_qs_qd_q_b2h_kvs_k_d2_b3_h3_s3d_vtotal_flopss                   r!   sdpa_flop_countr     s     #AC#Cs$CcOO
szT)D?
 	
 zSZ1_,SE 2  $vQ(
 	
 K8QWc/!'31DEEK8QWc/!'31DEEKr,   c                    [        XU5      $ )Count flops for self-attention.r   )r   r   r   r2   r3   r4   s         r!   	sdpa_flopr   5  s     ;;??r,   c                     SSK Jn  SSKJn  [	        XU45      (       d8  U R
                  R                  S:w  a  U R                  5       R                  5       $ U/U R                  S5      S-
  -  $ )z
If the offsets tensor is fake, then we don't know the actual lengths.
In that case, we can just assume the worst case; each batch has max length.
r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r'   devicerC   difftolistsize)offsetsmax_lenr   r   s       r!   _offsets_to_lengthsr   >  s\    
 9Dg,<=>>7>>CVCVZ`C`||~$$&&9Q!+,,r,   )grad_out.c              #     #    UGb+  [        UR                  5      S:w  a  [        S5      e[        UR                  5      S:w  a  [        S5      eUb%  UR                  U R                  :w  a  [        S5      eU R                  u  pn
UR                  u  pnUR                  u  pnUc  [        S5      eUc  [        S5      eUR                  UR                  :w  a  [        S5      e[        XF5      n[        XW5      n[	        UUS	S
9 H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a'  
Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   z7sdpa_flop_count: expected key.shape to be 3-dimensionalz9sdpa_flop_count: expected value.shape to be 3-dimensionalzDsdpa_flop_count: grad_out.shape must match query.shape when providedz+sdpa_flop_count: cum_seq_q must not be Nonez+sdpa_flop_count: cum_seq_k must not be NonezAsdpa_flop_count: cum_seq_q and cum_seq_k must have the same shapeTstrictr   lenr)   rS   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_r   r   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r!   %_unpack_flash_attention_nested_shapesr   J  sp    $  syy>Q !Z[[u{{q  !\]]HNNekk$A !ghhkkiikk !NOO !NOO??ioo- !dee+I=+I=&)-t&T"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUU 'U 	
++syy%++AUx~~[_
__s   E&E(c              #     #    UGb.  [        UR                  5      S:w  a  [        S5      e[        UR                  5      S:w  a  [        S5      eUb%  UR                  U R                  :w  a  [        S5      eU R                  u    pn
UR                  u    pnUR                  u    pnUc  [        S5      eUc  [        S5      eUR                  UR                  :w  a  [        S5      e[        XF5      n[        XW5      n[	        UUS	S
9 H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a+  
Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   zQ_unpack_efficient_attention_nested_shapes: expected key.shape to be 4-dimensionalzS_unpack_efficient_attention_nested_shapes: expected value.shape to be 4-dimensionalz^_unpack_efficient_attention_nested_shapes: grad_out.shape must match query.shape when providedzH_unpack_efficient_attention_nested_shapes: cu_seqlens_q must not be NonezH_unpack_efficient_attention_nested_shapes: cu_seqlens_k must not be Noneza_unpack_efficient_attention_nested_shapes: cu_seqlens_q and cu_seqlens_k must have the same shapeTr   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r!   )_unpack_efficient_attention_nested_shapesr   ~  s    $  syy>Q !tuuu{{q  !vwwHNNekk$A   "B  C  C131313 !kll !kll!3!33  "Z [ ['C	'C		9TBLE5 #uc2OUC0M #uc2O4<4Hd!=/CUUU C 	
++syy%++AUx~~[_
__s   E)E+T)rK   c          
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   r   r   r   r   r   s        r!   r"   0_flash_attention_forward_flop.<locals>.<genexpr>  &      6;2KK 	<<6;   r   sum)r   r   r   r   r   r   r   r2   r3   r4   sizess              r!   _flash_attention_forward_flopr     s?    " 2E  6;  r,   c           
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   )r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   r   s        r!   r"   4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r   r   r   )r   r   r   biasr   r   r   r   r3   r4   r   s              r!   !_efficient_attention_forward_flopr     s?    " 6!!!!E  6;  r,   c                    Uu  pEpgUu  ppUu  ppU u  nnnnXHs=:X  a  Us=:X  a  U:X  a  O  OX:X  a  UU:X  d  [        S5      eXY:  d  XY-  S:w  a  [        SU SU	 S35      eX{:X  a  UU:X  a  X:X  a  UU:X  d  [        S5      eSnU[        XE-  Xg4XE-  Xz45      -  nU[        XE-  Xo4XE-  X45      -  nU[        XE-  X4XE-  Xo45      -  nU[        XE-  Xj4XE-  X45      -  nU[        XE-  Xv4XE-  Xj45      -  nU$ )Nz<sdpa_backward_flop_count: batch/heads mismatch among tensorsr   z'sdpa_backward_flop_count: query heads (r   r   zJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatibler   )r   r   r   r   ra   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4r   s                        r!   sdpa_backward_flop_countr     sa   "AC#Cs$Cc'Cc3""s"t{sczJ
 	
 zSZ1_5cU ;  $vQ(
 	
 J3#:#*X
 	
 K 8QWc/!'31DEEK 8QWc/!'31DEEK8QWc/!'31DEEK 8QWc/!'31DEEK8QWc/!'31DEEKr,   c                    [        XX#5      $ )z(Count flops for self-attention backward.r   )r   r   r   r   r2   r3   r4   s          r!   sdpa_backward_flopr     s    
 $NXXr,   c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )Nr   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   r   r   r   r   s        r!   r"   1_flash_attention_backward_flop.<locals>.<genexpr><  &      CI?KK 	!iUUCIr   r   )r   r   r   r   out	logsumexpr   r   r   r   r3   r4   shapess                r!   _flash_attention_backward_flopr   !  sB    " 3	F  CI  r,   c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )N)r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   s        r!   r"   5_efficient_attention_backward_flop.<locals>.<genexpr>]  r   r   r   )r   r   r   r   r   r   r   r   r   r   r3   r4   r   s                r!   "_efficient_attention_backward_flopr   B  sB    " 7!!!!	F  CI  r,   r/   c          
      N    [        U UUUUb  UOUUUS9n
[        S U
 5       5      $ )z$Count flops for varlen_attn forward.r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   r   s        r!   r"   ,_varlen_attn_forward_flop.<locals>.<genexpr>y  r   r   r   )r   r   r   cu_seq_qcu_seq_kr   r   r0   r3   r4   r   s              r!   _varlen_attn_forward_flopr  c  sF     2&2(E  6;  r,   c          	          [        XX4XVU5      $ )z(Count flops for varlen_attn_out forward.)r  )r   r   r   r   r  r  r   r   r0   r3   r4   s              r!   _varlen_attn_out_flopr    s     %EXe r,   c
                F    [        UUUU UUUU	S9n[        S U 5       5      $ )z%Count flops for varlen_attn backward.r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   s        r!   r"   -_varlen_attn_backward_flop.<locals>.<genexpr>  s&      CH?KK 	!iUUCHr   r   )r   r   r   r   r   lser  r  r   r   r0   r3   r4   r   s                 r!   _varlen_attn_backward_flopr    sB      2	E  CH  r,   c                 6    [        U [        5      (       d  U 4$ U $ r   )r'   tuple)xs    r!   normalize_tupler    s    atHr,   ) KMBTc                     [        S[        [        [        5      S-
  [        [	        U 5      5      S-
  S-  5      5      n[        U   $ )Nr   r   rQ   r   )maxminr   suffixesstr)numberindexs     r!   get_suffix_strr    s=     3s8}q(3s6{+;a+?A*EFGEE?r,   c                 X    [         R                  U5      nU SU-  -  S nU[         U   -   $ )Ni  z.3f)r  r  )r  suffixr  r   s       r!   convert_num_with_suffixr!    s2    NN6"E%c*E8E?""r,   c                     US:X  a  gX-  S $ )Nr   0%z.2% )numdenoms     r!   convert_to_percent_strr'    s    zk#r,   c                 0   ^  [        T 5      U 4S j5       nU$ )Nc                 >   > [        U 5      u  pT" U6 n[        X25      $ r   )r   r   )r3   	flat_argsspecr   r5   s       r!   r6   )_pytreeify_preserve_structure.<locals>.nf  s#    &t,	mc((r,   r   r8   s   ` r!   _pytreeify_preserve_structurer-    s     
1X) )
 Ir,   c                     ^  \ rS rSrSr    SS\R                  R                  \\R                  R                     -  S-  S\	S\
S\\\4   S-  SS4
U 4S	 jjjrS\	4S
 jrS\\\\\	4   4   4S jrSS jrS rS rS rSrU =r$ )r   i  a  
``FlopCounterMode`` is a context manager that counts the number of flops within its context.

It does this using a ``TorchDispatchMode``.

It also supports hierarchical output by passing a module (or list of
modules) to FlopCounterMode on construction. If you do not need hierarchical
output, you do not need to use it with a module.

Example usage

.. code-block:: python

    mod = ...
    with FlopCounterMode(mod) as flop_counter:
        mod.sum().backward()

Nmodsdepthdisplaycustom_mappingr:   c                 n  > [         TU ]  5         [        S 5      U l        X l        X0l        S U l        Uc  0 nUb  [        R                  " SSS9  0 [        EUR                  5        VVs0 s H%  u  pVU[        USS5      (       a  UO
[        U5      _M'     snnEU l	        [        5       U l        g s  snnf )Nc                       [        [        5      $ r   )r   intr$  r,   r!   <lambda>*FlopCounterMode.__init__.<locals>.<lambda>  s
    +VYJZr,   z<mods argument is not needed anymore, you can stop passing itrQ   )
stacklevel_get_rawF)super__init__r   flop_countsr0  r1  modewarningswarnr-   itemsr   r9   r   mod_tracker)selfr/  r0  r1  r2  rW   v	__class__s          r!   r;  FlopCounterMode.__init__  s     	6ABZ6[
-1	!NMMXefg

WeWkWkWmnWmtqqwq*e44!-:JJWmn
 )? os   +,B1c                 N    [        U R                  S   R                  5       5      $ )NGlobal)r   r<  valuesrB  s    r!   get_total_flopsFlopCounterMode.get_total_flops  s!    4##H-44677r,   c                     U R                   R                  5        VVs0 s H  u  pU[        U5      _M     snn$ s  snnf )zReturn the flop counts as a dictionary of dictionaries.

The outer
dictionary is keyed by module name, and the inner dictionary is keyed by
operation name.

Returns:
    Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
)r<  r@  dict)rB  rW   rC  s      r!   get_flop_countsFlopCounterMode.get_flop_counts  s7     (,'7'7'='='?@'?tq47
'?@@@s   :c                 (  ^ ^
^^ Uc  T R                   nUc  SnSS KnSUl        / SQn/ nT R                  5       m
[	        T
5      mSmU
UUU 4S jn[        T R                  R                  5       5       HB  nUS:X  a  M  UR                  S5      S	-   nXq:  a  M&  U" XgS	-
  5      nUR                  U5        MD     ST R                  ;   a'  T(       d   U H  n	S
U	S   -   U	S'   M     U" SS5      U-   n[        U5      S:X  a  / SQ/nUR                  XCSS9$ )Ni?B r   T)ModuleFLOPz% TotalFc           	        > [        T
R                  U    R                  5       5      nT	UT:  -  m	SU-  n/ nUR                  X0-   [	        UT5      [        UT5      /5        T
R                  U    R                  5        H<  u  pVUR                  US-   [        U5      -   [	        UT5      [        UT5      /5        M>     U$ )N z - )r   r<  rH  appendr!  r'  r@  r  )mod_namer0  r   paddingrH  rW   rC  global_flopsglobal_suffixis_global_subsumedrB  s          r!   process_mod.FlopCounterMode.get_table.<locals>.process_mod8  s     d..x8??ABK+"==EkGFMM"']C&{LA 
 ((288:eOc!f,+A}=*1l;  ; Mr,   rG  .r   rT  )rG  0r#  )leftrightr`  )headerscolalign)r0  tabulatePRESERVE_WHITESPACErJ  r  sortedr<  keyscountextendr   )rB  r0  rc  headerrH  r[  mod	mod_depth
cur_valuesr   rX  rY  rZ  s   `         @@@r!   	get_tableFlopCounterMode.get_table(  s%   =JJE=E 	'+$.++-&|4"	 	, $**//12Ch		#*I $Sa-8JMM*% 3 t'''0Bq>a   !1-6Fv;!+,F  B\ ]]r,   c                     U R                   R                  5         U R                  R                  5         [	        U 5      U l        U R
                  R                  5         U $ r   )r<  clearrA  	__enter___FlopCounterModer=  rI  s    r!   rq  FlopCounterMode.__enter__g  sG     ""$$T*			r,   c                    U R                   c  [        S5      eU R                   R                  " U6 nS U l         U R                  R                  5         U R                  (       a$  [        U R                  U R                  5      5        U$ )Nz<Internal error: FlopCounter.__exit__ called but mode is None)r=  rS   __exit__rA  r1  printrm  r0  )rB  r3   ra   s      r!   ru  FlopCounterMode.__exit__n  sf    99 !_``II%	!!#<<$..,-r,   c                     XR                   ;   a[  U R                   U   nU" U0 UDSU0D6n[        U R                  R                  5       H  nU R                  U   U==   U-  ss'   M     U$ )Nr0   )r-   setrA  parentsr<  )rB  func_packetr   r3   r4   flop_count_funcr   pars           r!   _count_flopsFlopCounterMode._count_flopsx  sm    ,,,"00=O($F&F#FJ4++334  %k2j@2 5
r,   )r0  r1  r<  r-   rA  r=  )NrQ   TNr   )__name__
__module____qualname____firstlineno____doc__r   nnrQ  r   r5  boolrM  r	   r;  rJ  r  rN  rm  rq  ru  r~  __static_attributes____classcell__)rD  s   @r!   r   r     s    * DH 48+((//D$99D@+ + 	+
 !cNT1+
 >B+ +*8 8
Ac4S>&9!: 
A<^~ r,   c                   @    \ rS rSrSrS\SS4S jrS rS rSS	 jr	S
r
g)rr  i  Tcounterr:   Nc                     Xl         g r   r  )rB  r  s     r!   r;  _FlopCounterMode.__init__  s    r,   c                    SSK nUR                  U R                  R                  5      nU    U" U6 nSSS5        UR                  U R                  R                  5      nX@R                  l        WU4$ ! , (       d  f       NG= f)a]  Execute a branch function and capture its FLOP counts without
affecting self.counter.flop_counts

Args:
    branch_fn: The branch function to execute
    operands: Arguments to pass to the branch function

Returns:
    Tuple of (result, flop_counts) where result is the branch output
    and flop_counts is a copy of the FLOP counts after execution
r   N)copyr  r<  )rB  	branch_fnoperandsr  checkpointed_flop_countsresultr<  s          r!   $_execute_with_isolated_flop_counting5_FlopCounterMode._execute_with_isolated_flop_counting  sg     	#'99T\\-E-E#F )F ii 8 89#; {""	 Ts   A33
Bc                    U[         R                  R                  R                  [         R                  R                  R                  1;   nU(       au  SSKJn  SSKJn  U" US   5      n[        X5      (       d1  [        US5      (       a  UR                  nOO[        X5      (       d  M1  U R                  R                  US X45      $ U[         R                  R                  R                  L GaL  Uu  ppU R                  X5      u  pU[         L a  [         $ U R                  X5      u  nnU[         L a  [         $ [#        UR%                  5       5      [#        UR%                  5       5      -  n0 nU H  nUU   nUU   n0 n[#        UR%                  5       5      [#        UR%                  5       5      -  nU H6  nUR'                  US5      nUR'                  US5      n[)        UU5      UU'   M8     UUU'   M     UR+                  5        H.  u  nnU R                  R,                  U   R/                  U5        M0     U$ [         $ )Nr   )
get_kernelr   
kernel_idxfn)r   opshigher_ordertriton_kernel_wrapper_mutation triton_kernel_wrapper_functional*torch._higher_order_ops.triton_kernel_wrapr  triton.runtime.jitr   r'   hasattrr  r  r~  condr  NotImplementedry  rf  getr  r@  r<  update)rB  functypesr3   r4   	is_tritonr  r   kernel_namepredtrue_branchfalse_branchr  true_outtrue_flop_counts	false_outfalse_flop_countsall_mod_keysmerged_flop_counts	outer_keytrue_func_countsfalse_func_countsmerged_func_countsall_func_keysfunc_keytrue_val	false_val
inner_dicts                               r!   _handle_higher_order_ops)_FlopCounterMode._handle_higher_order_ops  s    UYY33RR"YY33TTV V	M6$VL%9:K ::;--"-..K	 !::
 <<,,[$MMUYY++000
 9=5D|)-)R)R*&H >)%%+/+T+T,(I( N*%% /4467#>O>T>T>V:WWL!#)	#3I#> $5i$@!%'" #$4$9$9$; <sCTCYCYC[?\ \ -H/33Ha@H 1 5 5h BI36x3K&x0 !.
 1C"9- * *<)A)A)C%	:((3:::F *D
 O!!r,   c                 d   U(       a  UO0 nU[         R                  R                  R                  R                  [         R                  R                  R
                  R                  [         R                  R                  R
                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                   R                  [         R                  R                  R"                  R                  [         R                  R$                  R&                  R                  1;   a  [(        $ [+        U[         R,                  R.                  5      (       a  U R1                  XX45      $ XR2                  R4                  ;  ac  U[         R                  R$                  R6                  R                  La2  U    UR8                  " U0 UD6nU[(        La  UsS S S 5        $  S S S 5        U" U0 UD6nU R2                  R;                  UR<                  XcU5      $ ! , (       d  f       N== fr   )r   r  atensym_is_contiguousdefaultis_contiguousmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutr  r'   r?   HigherOrderOperatorr  r  r-   r   	decomposer~  _overloadpacket)rB  r  r  r3   r4   rr   s          r!   __torch_dispatch__#_FlopCounterMode.__torch_dispatch__  s9   !r EIINN44<<IINN0088IINN00>>IINN99AAIINN??GGIINN''//IINN++33IINN))11IINN--55IINN1199IINN55==IINN((00IINN,,44IINN&&..IINN))113 3  "!dEJJ::;;00dKK ||111d%))..BWBWB_B_6_NND3F3N* *  D#F#||(()=)=s&QQ s   N!!
N/r  )r$  N)r  r  r  r  supports_higher_order_operatorsr   r;  r  r  r  r  r$  r,   r!   rr  rr    s,    &*# D #(;"z"Rr,   rr  )Fr   )NNNFN)gr  r   loggingr   torch.utils._pytreer   r   r   module_trackerr   typingr	   r
   collections.abcr   r   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r>  __all__r   r   	getLoggerr  logr  r   rA   ImportErroranywarningr  r  r+   r-   rM  __annotations__r9   r   mmr5  rZ   addmmr_   bmmrd   baddbmmrf   
_scaled_mmrn   r   r  rx   convolution_convolutioncudnn_convolution_slow_conv2d_forwardconvolution_overrideabler   convolution_backwardr   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r  r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r  r  r  r  r  r  r!  r  r'  r-  r   rr  r$  r,   r!   <module>r     s~      F F )  $ $ ' # :   5
6T]t_!> yy~~
 !#tCH~ "XxB?O>PRZ[]_a[aRb>b5c . tww/3 	# 	  	 tzz"%# % #% txx C  ! t||$&C & %& t' % 	% (%( 	$#Y$#Y$ Cy$ 	$
 	$L (())..1155	7 8
 cg Oux O8
O t001e e 2eN8 DD@@@@B C EI @WZ @C@	-" 1` eE#s(OU38_eCHouSRUXY]G]]^_1`r 4` eE#s(OU38_eCHouSRUXY]G]]^_4`n t44dC  	 D> t88$G 	 H>"J MMIIIIK L ^b Yps YLY t55tD 	 E@ t994H 	 IR  	L  	>  	@GGWJJ
 	HHh 	LL,	
 	OO_ 	i 	y 	I 	!!9 	y 	1 	00) 	,,i 	,,i 	99;M  	557I!" 	557I#$ 	!!#@%%'H""$B&&(J+0 $# #  
N N`yR( yRK  

]F\
]]]VWLs   =R -R43R4