
    3jWa                        S SK r S SKrS SKJr  S SKJrJrJr  \R                  S\R                  4S j5       r\ R                  4S\ R                  S\ R                  S\ R                  S\ R                  4S	 jjr\R                  S\R                  4S
 j5       r\ R"                  4S\ R                  S\ R                  S\ R                  S\ R                  4S jjr\R                  S 5       r\R                  S\R                  S\R                  S\R                  4S j5       r   SS\ R                  S\ R                  S\S\S\S\\ R                  \ R                  4   4S jjr\R                  S\R                  S\R                  S\R                  4S j5       r\ R"                  S4S\ R                  S\ R                  S\ R                  S\ R                  S\S\ R                  4S jjr\R                  S\R                  S\R                  4S j5       r SS\ R                  S\S\\ R                  \ R                  4   4S jjrg)    N)F8_E4M3_MAXF8_E5M2_MAXceil_div
block_sizec                    [         R                  " S5      nXe-  nU[         R                  " SU5      -   nX:  n	[         R                  " U5      n
[         R                  " X-   U	SS9nUR	                  [         R
                  5      U
-  n[         R                  " [         R                  " X5      U* 5      n[         R                  " X-   XS9  g Nr           maskotherr   )	tl
program_idarangeloadtofloat32maximumminimumstore)x_ptr
output_ptr	scale_ptrlp_max
n_elementsr   pidblock_startoffsetsr   scalexscaledclampeds                 d/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/backends/triton/quantization.pyquantize_fp8_kernel_tlr$      s     --
C"KBIIa44GD GGIE
d#6ATT"**%FjjF3fW=GHHZ!76    r    r   output_typereturnc           	         U[         R                  :X  a  [        nO*U[         R                  :X  a  [        nO[        SU S35      eU R                  5       (       d  U R                  5       n U R                  nU R                  5       nUR                  5       n[         R                  " XRS9nUS:  a  SnOUS:  a  SnOUS:  a  S	nOS
n[        R                  " Xh5      4n	[        U	   " UUUUUUS9  UR                  U5      nU$ )NzUnsupported output_type: z3. Expected torch.float8_e4m3fn or torch.float8_e5m2dtype                     r   )torchfloat8_e4m3fnr   float8_e5m2r   
ValueErroris_contiguous
contiguousshapeflattennumel
empty_liketritoncdivr$   view)
r    r   r&   r   
orig_shapex_flatr   outputr   grids
             r#   quantize_per_tensor_fp8rD   3   s     e)))	))	)'}4gh
 	
 ??LLNJYY[FJf8F E
	f	
	f	

KK
/1D4  [[$FMr%   c                 B   [         R                  " S5      nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " U5      n	[         R                  " X-   USS9n
U
R	                  [         R
                  5      U	-  n[         R                  " X-   XS9  g r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r    dequantizeds               r#   dequantize_fp8_kernel_tlrG   b   s     --
C"KBIIa44GD GGIE
d#6A$$rzz"U*KHHZ!;:r%   c                 |   U R                  5       (       d  U R                  5       n U R                  nU R                  5       nUR	                  5       n[
        R                  " XBS9nUS:  a  SnOUS:  a  SnOUS:  a  SnOSn[        R                  " XW5      4n[        U   " UUUUUS	9  UR                  U5      nU$ )
Nr)   r+   r,   r-   r.   r/   r0   r1   r2   )r7   r8   r9   r:   r;   r3   r<   r=   r>   rG   r?   )	r    r   r&   r@   rA   r   rB   r   rC   s	            r#   dequantize_per_tensor_fp8rI   x   s     ??LLNJYY[FJf8FE
	f	
	f	

KK
/1DT" [[$FMr%   c                     U S-  nUS-  nU S-  nUS-  nUS-  nUS-  n	XB-  U-   n
US-  U-   nU
S-  U	S-  -   U-   nX-  nX-  nX-  U-   $ )zCompute the swizzled offset for a scale at logical position (in_row, in_col).

This implements the cuBLAS blocked layout transformation (to_blocked).
Used by both quantize (write) and dequantize (read) kernels.
r,          r0       )in_rowin_coln_col_blockspadded_scale_cols	row_block	col_blockin_block_rowin_block_col	sub_blockfine_rowcombined_blockintermediate_col
linear_idxout_rowout_cols                  r#   _compute_swizzled_scale_offsetr^      s     #I!I C<LA:L "Ib H-	9N 1}|3  #%258HHJ -G,G&00r%   blocks_per_programhi_firstc           
          [         R                  " SS9n[         R                  " SS9U-  n[         R                  " U5      n[        U5       GH  nUU-   nUU:  d  M  UU-  [         R                  " SU5      -   nUU:  nX-  U-   n[         R                  " U U-   USS9R                  [         R                  5      n[         R                  " U5      n[         R                  " USS9nUS-  nUU-  n[         R                  " US5      nUR                  [         R                  5      n[         R                  " US5      n[        UUUU
5      nX:  a   UU:  a  [         R                  " UU-   U5        UR                  [         R                  5      nUU-  nUS	:  n [         R                  " U S
U5      nUU-  n![         R                  " U SU!5      n![         R                  " SUS-  5      n"U"S-  n#U"S-  S-   n$[         R                  " SU5      n%[         R                  " [         R                  " U%U#SS2S4   :H  U!S5      SS9n&[         R                  " [         R                  " U%U$SS2S4   :H  U!S5      SS9n'U(       a  U&U'n)n(OU'U&n)n([         R                   " SSU(U)/[         R"                  SSS9n*U*S-  R                  [         R$                  5      n+XS-  -  UUS-  -  -   U"-   n,UU-  U#-   U:  n-[         R                  " UU,-   U+U-S9  GM     g)af  Single Triton kernel for NVFP4 quantization with packing and scale swizzling.

Performs all operations in one kernel:
1. Computes block-wise scales
2. Quantizes and packs data to FP4
3. Applies to_blocked swizzle pattern to scales

Optimized with:
- Vectorized processing of multiple blocks per thread
- Efficient packing using interleave operations
- Coalesced memory accesses

Args:
    x_ptr: Input tensor pointer (m x n)
    packed_output_ptr: Output packed FP4 data (m x n//2)
    swizzled_scales_ptr: Output swizzled FP8 block scales (padded_scale_rows x padded_scale_cols)
    per_tensor_scale_ptr: Pointer to global scaling factor tensor
    m: Number of rows in input
    n: Number of columns in input (must be divisible by block_size)
    num_blocks: Number of blocks per row (n // block_size)
    scale_rows: Unpadded scale rows (m)
    scale_cols: Unpadded scale cols (num_blocks)
    padded_scale_rows: Padded scale rows for swizzle
    padded_scale_cols: Padded scale cols for swizzle
    block_size: Size of each quantization block (typically 16)
    blocks_per_program: Number of blocks to process per program
r   axis   r	   r
   g      @      |@rK   g|=      ?   Na  
                {
                    .reg .b8 fp4_byte;
                    .reg .b16 result;
                    cvt.rn.satfinite.e2m1x2.f32 fp4_byte, $1, $2;
                    mov.b16 result, {fp4_byte, 0};
                    mov.u16 $0, result;
                }
                z=h,f,fTasmconstraintsargsr*   is_purepack   r   )r   r   r   ranger   r   r   absmaxr   
float8e4nvr>   r^   r   wheresuminline_asm_elementwiseuint16uint8).r   packed_output_ptrswizzled_scales_ptrper_tensor_scale_ptrmn
num_blocks
scale_rows
scale_colspadded_scale_rowsrR   r   r_   r`   pid_m
pid_n_baseper_tensor_scaleblock_offsetpid_noffs_nr   x_offsr    x_absmax_absblock_scalescaled_block_scalescaled_block_scale_fp8rQ   swizzled_offsscaled_block_scale_fp32total_scalezero_scale_maskdata_scaledpair_idxeven_idxodd_idxindicesf32_evenf32_odd
asm_arg_hi
asm_arg_lopacked_bytes_u16packed_bytesout_offsout_masks.                                                 r#   quantize_nvfp4_kernel_tlr      s   Z MMq!EA&);;J ww34 01\) :Z'"))Az*BBFA:DY'F T=@@LA FF1IEffU+G "C-K "-/?!?!#,>!F &8%:%:2==%I" 77:q1L:ul,=M !ej&8,}<>TU '=&?&?

&K#*-DDK)E1O((?CEK k/K((?CEK yyJ!O4H!|HlQ&G ii:.Gvvbhhw(1d72C'C[RST[\]HffRXXgD1A&A;PQRYZ[G )17J
J)0(J
!88 % *-ii " -t377AL Q'%:?*CChNH
*X5:HHH&1<hOg 2r%   Tr   epsilonpad_16xc                    U R                   nU(       ap  U R                   u  pgUS-   S-  S-  U-
  nUS-   S-  S-  U-
  n	US:  d  U	S:  a:  [        R                  R                  R	                  U SU	SU45      n U R                   nSn
U R                  US   SU
5      n U R                   u  pnX-  nU R                  X5      R                  5       nUnUn[        US5      n[        US5      nUS-  nUS-  n[        R                  " XS-  4[        R                  U R                  S9n[        R                  " UU4[        R                  U R                  S9nX-  nUS	:  a  S
nOUS:  a  SnOSnU[        R                  " UU5      4n[        U   " UUUUUUUUUUUU
UUS9  [!        U5      nUS   S-  US'   UR                  U5      nUU4$ )N   rM   r   r,   rK   rg   r*   devicer1   rd      )r   r_   r`   )r9   r3   nn
functionalpadreshaper8   r   emptyrw   r   zerosr4   r=   r>   r   list)r    r   r   r   r`   r@   rowscolspad_rowspad_colsr   r{   r}   _r|   x_2dr~   r   n_row_blocksrQ   r   rR   packed_outputswizzled_scalestotal_blocksr_   rC   packed_shapes                               r#   quantize_nvfp4r   J  s    J WW
2I"$r)D02I"$r)D0a<8a<##''AxH+EFA JJ 	
		*Q-Z0AwwA1A 99Q?%%'D JJJ,LJ*L$s*$q( KKF5;;qxxPMkk	-.!!xxO >Ld		 v{{:'9:;DT"		-$ 
#L#B'1,L!)),7M/))r%   	tile_sizec           	      d   [         R                  " S5      nX-  nU[         R                  " SU	5      -   nXS-  -  nXS-  -  nXS-  :  n[         R                  " U5      n[         R                  " X-   USS9n[         R                  " SSU/[         R
                  SSS9nUS	-  R                  [         R                  5      R                  [         R                  SS
9R                  [         R                  5      nUS-	  R                  [         R                  5      R                  [         R                  SS
9R                  [         R                  5      nUS-  nUS-  S-   nX-  U-   nX-  U-   nUU-  nUU-  n[        UUXg5      n[        UUXg5      n[         R                  " UU-   UUU:  -  SS9n[         R                  " UU-   UUU:  -  SS9nU
(       aK  UUR                  [         R                  5      -  U-  n UUR                  [         R                  5      -  U-  n!OJUUR                  [         R                  5      -  U-  n UUR                  [         R                  5      -  U-  n!UUU:  -  n"UUU:  -  n#[         R                  " UU-   U U"S9  [         R                  " UU-   U!U#S9  g)a  Dequantizes FP4 packed data using per-block scaling factors.

Args:
    packed_ptr (tl.pointer): Pointer to packed uint8 tensor (m x n//2)
    scale_ptr (tl.pointer): Pointer to swizzled per-block scale tensor
    output_ptr (tl.pointer): Pointer to output tensor (m x n)
    global_scale_ptr (tl.pointer): Pointer to global scale tensor
    n (int): Number of columns in unpacked tensor
    scale_cols (int): Number of scale columns (n // block_size)
    n_col_blocks (int): Number of 4-column blocks in scales
    padded_scale_cols (int): Padded scale columns (n_col_blocks * 4)
    block_size (tl.constexpr): Size of each FP4 quantization block
    tile_size (tl.constexpr): Size of the processing tile (in packed elements)
r   rg   r
   a0  
        {
            .reg .b8 byte0, byte1, byte2, byte3;
            mov.b32 {byte0, byte1, byte2, byte3}, $4;
            cvt.rn.f16x2.e2m1x2 $0, byte0;
            cvt.rn.f16x2.e2m1x2 $1, byte1;
            cvt.rn.f16x2.e2m1x2 $2, byte2;
            cvt.rn.f16x2.e2m1x2 $3, byte3;
        }
        z=r,=r,=r,=r,rTrK   rh   i  )bitcastrM   rd   rf   r   N)r   r   r   r   ru   uint32castrv   float16r   r^   r   r   )$
packed_ptrr   global_scale_ptrr   r|   r   rQ   rR   r   r   r`   r   packed_startpacked_offspacked_row_idxpacked_col_idxpacked_maskglobal_scalepacked_datax_f16x2_packedval_lowval_highout_col_lowout_col_highout_offs_lowout_offs_highblock_col_lowblock_col_highscale_offs_lowscale_offs_high	scale_low
scale_highresult_even
result_oddout_mask_lowout_mask_highs$                                       r#   dequantize_nvfp4_kernel_tlr     s   : --
C ?L1i!88K !!V,N F+N !F+K 77+,L ''*2ANK ..	 $]iiN$ 
&	 &&ryy166rzz46PUUVXV`V`a  
2	##BII.33BJJ3MRRSUS]S]^ 
 !1$K!A%)L!%3L"&5M  :-M!Z/N 4|N 5O
 N"MJ67I
 O#NZ78J bjj!99LHz}}RZZ88<G
	RZZ 88<G
bjj 99LH
 +/2L<!#34MHHZ,&,GHHZ-'-Hr%   qxblock_scalesc                   ^  SnSnT R                   S   nUS-  nX-  n	[        U	S5      n
U
S-  n[        T R                   5      nXS'   [        R                  " XT R
                  S9nU 4S jn[        U   " T UUUUU	U
UUUUS9  U$ )	NrM   r,   r   rg   rK   r   c                 V   > [         R                  " TR                  5       U S   5      4$ )Nr   )r=   r>   r;   )metar   s    r#   rC   dequantize_nvfp4.<locals>.grid2  s"    BHHJ[(9:<<r%   )r   r   r`   )r9   r   r   r3   r   r   r   )r   r   r   r&   r`   r   r   packed_nr|   r   rQ   rR   output_shaperB   rC   s   `              r#   dequantize_nvfp4r     s     JIxx|H1AJ J*L$q( >L[[KF= t$
	 Mr%   c                    [         R                  " SS9n[         R                  " SS9U
-  nSn[        U
5       GH  nX-   nX:  d  M  X-  [         R                  " SU	5      -   nUU:  nX-  U-   n[         R                  " U U-   USS9R                  [         R                  5      n[         R                  " U5      n[         R                  " USS9nUU-  n[         R                  " US5      n[         R                  " US5      n[         R                  " U5      n[         R                  R                  U5      R                  [         R                  5      nUS	-   n[         R                  " US5      n[         R                  " US
5      n[         R                  " US	-
  R                  [         R                  5      5      n[         R                   " US5      n[#        XUU5      nX:  a<  X:  a7  [         R$                  " UU-   UR                  [         R&                  5      5        [         R(                  " US:  SU5      nUU-  n[         R(                  " US:  SU5      n[         R                  " [         R                  " UU5      U* 5      nX-  U-   n [         R$                  " UU -   UR                  [         R*                  5      US9  GM     g)a&  Single Triton kernel for MXFP8 quantization with E8M0 scale swizzling.

Performs:
1. Computes block-wise max and converts to E8M0 (power-of-2) scales
2. Quantizes data to FP8 E4M3
3. Applies to_blocked swizzle pattern to E8M0 scales

Args:
    x_ptr: Input tensor pointer (m x n)
    output_ptr: Output FP8 data (m x n)
    swizzled_scales_ptr: Output swizzled E8M0 block scales
    m: Number of rows in input
    n: Number of columns in input (must be divisible by block_size)
    num_blocks: Number of blocks per row (n // block_size)
    scale_rows: Unpadded scale rows (m)
    scale_cols: Unpadded scale cols (num_blocks)
    padded_scale_cols: Padded scale cols for swizzle
    block_size: Size of each quantization block (32 for MXFP8)
    blocks_per_program: Number of blocks to process per program
r   rb   rd   re   r	   r
   g       8g      G      rK   gKH9rf   r   N)r   r   ro   r   r   r   r   rp   rq   r   r   log2mathceilint32exp2r>   r^   r   rw   rs   rr   )!r   r   ry   r{   r|   r}   r~   r   rR   r   r_   r   r   fp8_maxr   r   r   r   r   r    r   r   scale_ratio
log2_ratioexp_unbiased
exp_biasedr   rQ   r   
safe_scaler   data_clampedr   s!                                    r#   quantize_mxfp8_kernel_tlr   E  sJ   F MMq!EA&);;J "G 01)'"))Az*BBFA:DY'F T=@@LA FF1IEffU+G "G+K**[-@K**[*=K -J77<<
366rxx@L%+J J2JJ4J '':#3"7"7

"CDK 77:q1L:l,=M !e&8,}<jmmBHH>UV +"5sKHJj.K((;#6[IK ::bjjg&FQL y6)HHHZ(*LOOBMM,JQUVs 2r%   pad_32xc                    SnU(       ad  U R                   u  p4US-   S-  S-  U-
  nUS-   S-  S-  U-
  nUS:  d  US:  a.  [        R                  R                  R	                  U SUSU45      n U R                   u  pxX-  n	U R                  5       n Un
U	n[        U
S5      n[        US5      nUS-  nUS-  n[        R                  " Xx4[        R                  U R                  S9n[        R                  " X4[        R                  U R                  S9nXy-  nUS:  a  SnOUS	:  a  S
nOSnU[        R                  " U	U5      4n[        U   " U UUUUU	U
UUUUS9  UR                  [        R                   5      nUU4$ )a&  Quantize tensor to MXFP8 format with block-wise E8M0 scaling.

MXFP8 uses block size 32 with power-of-2 (E8M0) block scales.

Args:
    x: Input tensor (2D, shape M x K)
    pad_32x: If True, pad dimensions to be divisible by 32

Returns:
    Tuple of (quantized_fp8_tensor, block_scales_e8m0)
rL      r   r,   rK   r   r1   rd   r   rg   )r   r_   )r9   r3   r   r   r   r8   r   r   r4   r   r   rw   r=   r>   r   r?   float8_e8m0fnu)r    r   r   r   r   r   r   r{   r|   r}   r~   r   r   rQ   r   rR   rB   r   r   r_   rC   s                        r#   quantize_mxfp8r     s    J WW
2I"$r)D02I"$r)D0a<8a<##''AxH+EFA77DAJ 	
A JJJ,LJ*L$s*$q( [[!u':':188LFkk	.kkxxO >Ld		 v{{:'9:;DT"			- &**5+?+?@O?""r%   )r	   FT)F)r3   r=   triton.languagelanguager   comfy_kitchen.float_utilsr   r   r   jit	constexprr$   r4   Tensorr*   rD   rG   bfloat16rI   r^   r   floatbooltupler   r   r   r   r   rN   r%   r#   <module>r     s        7 7 70 FKEXEX,||,!LL,7<{{,
\\,^ ;
 ; ;, FK^^!||!!LL!7<{{!
\\!H  1  1F FP FP FP llFP FPX R*||R*llR* R* 	R*
 R* 5<<%&R*j vI vI ||vI llvI vIz  %~~**ll* ,,* 	*
 * \\*X bW bW bW bWN M#||M#M# 5<<%&M#r%   