
    3jy@                     X
   S SK r SSKJr  SSKJr  SSKJr  SSKJ	r	  SSK
JrJrJrJr  SSKJrJrJr  SS	KJr  S
r/ SQr\ R,                  4S\ R.                  S\ R.                  S\ R0                  S\ R.                  4S jjr\ R4                  4S\ R.                  S\ R.                  S\ R0                  S\ R.                  4S jjr\ R,                  4S\ R.                  S\ R.                  S\ R0                  S\ R.                  4S jjr   SJS\ R.                  S\ R.                  S\S\S\S\\ R.                  \ R.                  4   4S jjr \ R4                  S4S\ R.                  S\ R.                  S\ R.                  S\ R0                  S\S\ R.                  4S jjr!   SKS\ R.                  S\ R.                  S\ R.                  S \ R.                  S!\ R.                  S"\ R.                  S#\ R.                  S-  S$\ R0                  S-  S%\ R.                  S-  S\ R.                  4S& jjr" SLS\ R.                  S'\S\\ R.                  \ R.                  4   4S( jjr#\ R4                  4S\ R.                  S\ R.                  S\ R0                  S\ R.                  4S) jjr$  SMS\ R.                  S\ R.                  S!\ R.                  S"\ R.                  S#\ R.                  S-  S$\ R0                  S-  S\ R.                  4S* jjr%   SNS\ R.                  S+\ R.                  S,\ R.                  S-\&S.\S/\ R.                  S-  S\\ R.                  \ R.                  \ R.                  4   4S0 jjr'  SOS1\ R.                  S2\ R.                  S3\ R.                  S4\ R.                  S5\ R.                  S6\ R.                  S#\ R.                  S-  S.\S\ R.                  4S7 jjr(  SPS\ R.                  S8\ R.                  S4\ R.                  S9\ R.                  S#\ R.                  S-  S:\&S\ R.                  4S; jjr)S<\ R.                  S=\ R.                  S>\ R.                  S\\ R.                  \ R.                  4   4S? jr*S\ R.                  S>\ R.                  S\ R.                  4S@ jr+S<\ R.                  S=\ R.                  S>\ R.                  S\\ R.                  \ R.                  4   4SA jr,S\ R.                  S>\ R.                  S\ R.                  4SB jr-SC\.\/   SS4SD jr0SE\/SS4SF jr1SE\/SS4SG jr2S\34SH jr4SE\/4SI jr5g)Q    N   )cuda)eager)triton)DTYPE_TO_CODE)BackendErrorBackendNotFoundErrorBackendNotImplementedErrorNoCapableBackendError)from_blockedswap_nibbles
to_blocked)registryz0.1.0)quantize_per_tensor_fp8dequantize_per_tensor_fp8quantize_nvfp4dequantize_nvfp4quantize_mxfp8dequantize_mxfp8quantize_svdquant_w4a4scaled_mm_nvfp4scaled_mm_mxfp8scaled_mm_svdquant_w4a4gemv_awq_w4a16
apply_ropeapply_rope1apply_rope_split_halfapply_rope_split_half1r   r   r   list_backendsset_backend_priorityenable_backenddisable_backendstochastic_rounding_fp8use_backendr   r	   r
   r   xscaleoutput_typereturnc                 h    [         U   n[        R                  R                  R	                  XU5      $ )zQuantize tensor to FP8 format with per-tensor scaling.

Args:
    x: Input tensor
    scale: Scale tensor (scalar)
    output_type: FP8 dtype (float8_e4m3fn or float8_e5m2)

Returns:
    Quantized FP8 tensor
)r   torchopscomfy_kitchenquantize_fp8r%   r&   r'   
dtype_codes       P/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/__init__.pyr   r   A   s+     {+J99""//*EE    c                 h    [         U   n[        R                  R                  R	                  XU5      $ )a  Dequantize tensor from FP8 format with per-tensor scaling.

Args:
    x: Input FP8 tensor (float8_e4m3fn or float8_e5m2)
    scale: Scale tensor (scalar)
    output_type: Target dtype (float32, float16, or bfloat16)

Returns:
    Dequantized tensor in specified output format
)r   r*   r+   r,   dequantize_fp8r.   s       r0   r   r   T   s+     {+J99""11!JGGr1   rngc                 F    XUS.n[         R                  " SUS9nU" S0 UD6$ )zStochastically round tensor to FP8 format.

Args:
    x: Input tensor
    rng: Random uint8 tensor with the same shape as x
    output_type: FP8 dtype (float8_e4m3fn or float8_e5m2)

Returns:
    Stochastically rounded FP8 tensor
)r%   r4   r'   r#   )kwargs )r   get_implementation)r%   r4   r'   r6   impls        r0   r#   r#   g   s-     =F&&'@PD>&>r1   Tper_tensor_scaleepsilonpad_16xhi_firstc                 X    [         R                  R                  R                  XX#U5      $ )a   Quantize tensor to NVFP4 format with block-wise scaling.

Args:
    x: Input tensor (2D)
    per_tensor_scale: Global scale factor
    epsilon: Epsilon for numerical stability
    pad_16x: If True, implicit zero-padding is applied to make dimensions divisible by 16
    hi_first: Nibble packing order. If True (default), the even-indexed element
              is stored in the high nibble of each packed byte. If False, the
              even-indexed element is stored in the low nibble.

Returns:
    Tuple of (quantized_tensor, block_scales)
)r*   r+   r,   r   )r%   r:   r;   r<   r=   s        r0   r   r   {   s$    * 99""11!wYabbr1   qxblock_scalesc                 j    [         U   n[        R                  R                  R	                  XX%U5      $ )a  Dequantize tensor from NVFP4 format with block-wise scaling.

Args:
    qx: Quantized FP4 tensor (packed as uint8)
    per_tensor_scale: Global scale factor
    block_scales: Block scales in swizzled layout (float8_e4m3fn)
    output_type: Target output dtype (float32, float16, or bfloat16)
    hi_first: Nibble packing order. Must match the packing order used
              during quantization. If True (default), the even-indexed
              element is in the high nibble.

Returns:
    Dequantized tensor in specified output format
)r   r*   r+   r,   r   )r?   r:   r@   r'   r=   r/   s         r0   r   r      s.    * {+J99""33B,dlmmr1   abtensor_scale_atensor_scale_bblock_scale_ablock_scale_bbias	out_dtypealphac	                     Uc  [         R                  n[        U   n	[         R                  R                  R                  XX#XEXiU5	      $ )aB  Matrix multiplication with NVFP4 quantized inputs.

Computes: y = (a @ b.T) * (tensor_scale_a * tensor_scale_b) + bias

Args:
    a: Quantized matrix A (M, K//2) in uint8 format
    b: Quantized matrix B (N, K//2) in uint8 format
    tensor_scale_a: Global scale for A
    tensor_scale_b: Global scale for B
    block_scale_a: Block-wise scales for A
    block_scale_b: Block-wise scales for B
    bias: Optional bias vector
    out_dtype: Output dtype (defaults to bfloat16)
    alpha: Output scale (tensor_scale_a * tensor_scale_b)

Returns:
    Result tensor of shape (M, N)
)r*   bfloat16r   r+   r,   r   )
rB   rC   rD   rE   rF   rG   rH   rI   rJ   r/   s
             r0   r   r      sF    : NN	y)J99""22	nd r1   pad_32xc                 T    [         R                  R                  R                  X5      $ )a  Quantize tensor to MXFP8 format with block-wise E8M0 scaling.

MXFP8 uses block size 32 with power-of-2 (E8M0) block scales.

Args:
    x: Input tensor (2D, shape M x K, K must be divisible by 32)
    pad_32x: If True, pad dimensions to be divisible by 32

Returns:
    Tuple of (quantized_fp8_tensor, block_scales_e8m0)
    - quantized_fp8_tensor: FP8 E4M3 data of shape (M, K)
    - block_scales_e8m0: E8M0 scales in swizzled layout
)r*   r+   r,   r   )r%   rM   s     r0   r   r      s    " 99""11!==r1   c                 h    [         U   n[        R                  R                  R	                  XU5      $ )a"  Dequantize tensor from MXFP8 format.

Args:
    qx: Quantized FP8 tensor (float8_e4m3fn)
    block_scales: E8M0 block scales in swizzled layout (float8_e8m0fnu)
    output_type: Target output dtype (float32, float16, or bfloat16)

Returns:
    Dequantized tensor in specified output format
)r   r*   r+   r,   r   )r?   r@   r'   r/   s       r0   r   r      s+     {+J99""33BjQQr1   c                     Uc  [         R                  n[        U   n[         R                  R                  R                  XX#XF5      $ )a  Matrix multiplication with MXFP8 quantized inputs.

Computes: y = a @ b.T + bias

Args:
    a: Quantized FP8 matrix A (M, K)
    b: Quantized FP8 matrix B (N, K)
    block_scale_a: E8M0 block scales for A in swizzled layout
    block_scale_b: E8M0 block scales for B in swizzled layout
    bias: Optional bias vector
    out_dtype: Output dtype (defaults to bfloat16)

Returns:
    Result tensor of shape (M, N)
)r*   rL   r   r+   r,   r   )rB   rC   rF   rG   rH   rI   r/   s          r0   r   r      sA    . NN	y)J99""22	mD r1   smooth	lora_downpad_sizeact_unsignedlora_xc                 X    [         R                  R                  R                  XX#XE5      $ )u  Quantize activations to int4 with smoothing + LoRA down projection.

Args:
    x: (M, K) bf16/fp16 main-path input (caller pre-shifts if unsigned path).
    smooth: (K,) smoothing factor applied before quantization.
    lora_down: (K, R) low-rank down projection weight.
    pad_size: pad M to multiple of this value (default 256).
    act_unsigned: if True, quantize into uint4 [0, 15] (scale=max/15) for u4
        MMA downstream. Caller must ensure x is non-negative — the shift
        constant is a model-topology concern, not part of this op.
    lora_x: (M, K) optional pre-shift activation for LoRA. Defaults to x.
        Pass raw (un-shifted) x when x has been pre-shifted for unsigned path.

Returns:
    (quantized_x uint8 [M_pad, K//2], ascales [K//64, M_pad], lora_act [M_pad, R])

Note: eager returns fp32 lora_act as a high-precision reference. The CUDA
backend returns lora_act in x.dtype because the runtime epilogue consumes it
as bf16/fp16; this avoids an otherwise redundant cast/allocation.
)r*   r+   r,   r   )r%   rQ   rR   rS   rT   rU   s         r0   r   r     s(    8 99""99	9 r1   actwgtascaleswscaleslora_act_inlora_upc           
      Z    [         R                  R                  R                  XX#XEXg5      $ )aF  SVDQuant W4A4 int4 GEMM + LoRA-up + bias.

Computes out = int4_matmul(act, wgt, ascales, wscales) + lora_act_in @ lora_up^T + bias.
The CUDA backend performs int4 MMA + per-group dequant + bias in one
kernel and, when lora_act_in/proj_up layout and dtype allow it, fuses
LoRA-up into the same writeback epilogue with bf16/fp16 tensor-core MMA.
Unsupported combinations fall back to the wrapper's bf16/fp16 addmm_ path.

Args:
    act: (M, K//2) uint8 packed activations from quantize_svdquant_w4a4.
    wgt: (N, K//2) int8 packed weights (natural row-major), or backend
        specific tile-packed storage.
    ascales: (K//64, M) activation scales.
    wscales: (K//64, N) weight scales.
    lora_act_in: (M, R) LoRA activations from quantize step.
    lora_up: (N, R) LoRA up projection weight, or matching tile-packed
        storage for tile-packed weights.
    bias: optional (N,) bias.
    act_unsigned: if True, activations are interpreted as unsigned [0,15] by
        u4.s4 MMA (for post-GELU+shift fc2). Caller pre-shifts.

Returns:
    (M, N) output tensor (same dtype as lora_up).
)r*   r+   r,   r   )rW   rX   rY   rZ   r[   r\   rH   rT   s           r0   r   r   9  s+    D 99""::'K$ r1   qweightwzeros
group_sizec                 X    [         R                  R                  R                  XX#XE5      $ )a|  AWQ W4A16 quantized GEMV (for modulation-style layers called with small batch).

Args:
    x: (..., K) bf16/fp16 input.
    qweight: (N//4, K//2) int32 packed weight.
    wscales: (K//group_size, N) per-group scales.
    wzeros: (K//group_size, N) per-group zero points.
    bias: optional (N,) bias.
    group_size: quantization group size.

Returns:
    (..., N) output tensor.
)r*   r+   r,   r   )r%   r^   rZ   r_   rH   r`   s         r0   r   r   `  s(    * 99""11	GT r1   xqxk	freqs_cisc                 V    [         R                  R                  R                  XU5      $ )a  Apply Rotary Position Embedding (RoPE) to query and key tensors.

Interleaved layout: pair k uses adjacent elements [2k, 2k+1].

Args:
    xq: Query tensor
    xk: Key tensor
    freqs_cis: Precomputed frequency tensor

Returns:
    Tuple of (transformed_query, transformed_key)
)r*   r+   r,   r   rb   rc   rd   s      r0   r   r   z  s!    " 99""--bi@@r1   c                 T    [         R                  R                  R                  X5      $ )zApply Rotary Position Embedding (RoPE) to a single tensor.

Interleaved layout: pair k uses adjacent elements [2k, 2k+1].

Args:
    x: Input tensor
    freqs_cis: Precomputed frequency tensor

Returns:
    Transformed tensor
)r*   r+   r,   r   r%   rd   s     r0   r   r     s     99""..q<<r1   c                 V    [         R                  R                  R                  XU5      $ )a"  Apply Rotary Position Embedding (RoPE) to query and key tensors.

Split-half layout: pair k uses elements [k] and [k + head_dim//2].
Matches the formula:
    t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).to(freqs.dtype)
    t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
    t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)

Args:
    xq: Query tensor
    xk: Key tensor
    freqs_cis: Precomputed frequency tensor shape (..., head_dim//2, 2, 2)

Returns:
    Tuple of (transformed_query, transformed_key)
)r*   r+   r,   r   rf   s      r0   r   r     s!    * 99""88KKr1   c                 T    [         R                  R                  R                  X5      $ )a  Apply Rotary Position Embedding (RoPE) to a single tensor.

Split-half layout: pair k uses elements [k] and [k + head_dim//2].
Matches the formula:
    t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).to(freqs.dtype)
    t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
    t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)

Args:
    x: Input tensor
    freqs_cis: Precomputed frequency tensor shape (..., head_dim//2, 2, 2)

Returns:
    Transformed tensor
)r*   r+   r,   r   rh   s     r0   r   r     s    & 99""99!GGr1   priorityc                 0    [         R                  " U 5        g)zSet the priority order for backend selection.

Args:
    priority: List of backend names in order of preference
             Example: ["cuda", "eager"] to prefer CUDA over Torch
N)r   set_priority)rk   s    r0   r    r      s     (#r1   namec                 0    [         R                  " U 5        g)zoDisable a backend, preventing its use.

Args:
    name: Backend name to disable ("eager", "cuda", or "triton")
N)r   disablern   s    r0   r"   r"     s     Tr1   c                 0    [         R                  " U 5        g)zpRe-enable a previously disabled backend.

Args:
    name: Backend name to enable ("eager", "cuda", or "triton")
N)r   enablerq   s    r0   r!   r!     s     OODr1   c                  ,    [         R                  " 5       $ )a+  Get status information for all backends.

Returns:
    Dictionary mapping backend names to their status:
    {
        "backend_name": {
            "available": bool,
            "disabled": bool,
            "unavailable_reason": str or None,
            "capabilities": list[str]
        }
    }
)r   r   r7   r1   r0   r   r     s     !!##r1   c                 .    [         R                  " U 5      $ )zContext manager to temporarily use a specific backend.

Args:
    name: Backend name to use within the context

Example:
    with comfy_kitchen.use_backend("eager"):
        result = comfy_kitchen.quantize_per_tensor_fp8(x, scale)
)r   r$   rq   s    r0   r$   r$      s     %%r1   )g        FT)NNN)F)NN)   FN)NF)N@   )6r*   backendsr   _cuda_backendr   _eager_backendr   _triton_backendbackends.eager.quantizationr   
exceptionsr   r	   r
   r   float_utilsr   r   r   r   __version____all__float8_e4m3fnTensordtyper   rL   r   r#   floatbooltupler   r   r   r   r   r   intr   r   r   r   r   r   r   liststrr    r"   r!   dictr   r$   r7   r1   r0   <module>r      s8    + . / 6  @ ? #\  %22F||F<<F F \\	F,  %~~H||H<<H H \\	H,  %22||	  \\	. c||cllc c 	c
 c 5<<%&c8  %~~nnlln ,,n 	n
 n \\n@ !%$(!%#||#||# LL# LL	#
 <<# <<# ,,
# {{T!# <<$# \\#P >||>> 5<<%&>.  %~~RR,,R R \\	R0 !%$(|||| << <<	
 ,,
 {{T! \\F "&||LL || 	
  LL4 5<<u||34P !%$	$	$ \\$ \\	$
 $ \\$ ,,
$ $ \\$X !%||\\ \\ LL	
 ,,
  \\4AAA ||A 5<<%&	A(=||=||= \\=$LLL ||L 5<<%&	L0H||H||H \\H6$49 $ $# $   $t $"
&c 
&r1   