
    +jy@                        d dl Z ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZmZmZ dd	lmZ d
Zg dZe j        fde j        de j        de j        de j        fdZe j        fde j        de j        de j        de j        fdZe j        fde j        de j        de j        de j        fdZ	 	 	 dNde j        de j        dedededee j        e j        f         fdZ e j        dfde j        de j        de j        de j        dede j        fdZ!	 	 	 dOde j        d e j        d!e j        d"e j        d#e j        d$e j        d%e j        dz  d&e j        dz  d'e j        dz  de j        fd(Z"	 dPde j        d)edee j        e j        f         fd*Z#e j        fde j        de j        de j        de j        fd+Z$	 	 dQde j        d e j        d#e j        d$e j        d%e j        dz  d&e j        dz  de j        fd,Z%	 	 	 dRde j        d.e j        d/e j        d0e&d1ed2e j        dz  dee j        e j        e j        f         fd3Z'	 	 dSd4e j        d5e j        d6e j        d7e j        d8e j        d9e j        d%e j        dz  d1ede j        fd:Z(	 	 dTde j        d<e j        d7e j        d=e j        d%e j        dz  d>e&de j        fd?Z)d@e j        dAe j        dBe j        dee j        e j        f         fdCZ*de j        dBe j        de j        fdDZ+d@e j        dAe j        dBe j        dee j        e j        f         fdEZ,de j        dBe j        de j        fdFZ-dGe.e/         ddfdHZ0dIe/ddfdJZ1dIe/ddfdKZ2de3fdLZ4dIe/fdMZ5dS )U    N   )cuda)eager)triton)DTYPE_TO_CODE)BackendErrorBackendNotFoundErrorBackendNotImplementedErrorNoCapableBackendError)from_blockedswap_nibbles
to_blocked)registryz0.1.0)quantize_per_tensor_fp8dequantize_per_tensor_fp8quantize_nvfp4dequantize_nvfp4quantize_mxfp8dequantize_mxfp8quantize_svdquant_w4a4scaled_mm_nvfp4scaled_mm_mxfp8scaled_mm_svdquant_w4a4gemv_awq_w4a16
apply_ropeapply_rope1apply_rope_split_halfapply_rope_split_half1r   r   r   list_backendsset_backend_priorityenable_backenddisable_backendstochastic_rounding_fp8use_backendr   r	   r
   r   xscaleoutput_typereturnc                 h    t           |         }t          j        j                            | ||          S )zQuantize tensor to FP8 format with per-tensor scaling.

    Args:
        x: Input tensor
        scale: Scale tensor (scalar)
        output_type: FP8 dtype (float8_e4m3fn or float8_e5m2)

    Returns:
        Quantized FP8 tensor
    )r   torchopscomfy_kitchenquantize_fp8r%   r&   r'   
dtype_codes       Y/home/wildlama/comfy/ComfyUI/.venv/lib/python3.11/site-packages/comfy_kitchen/__init__.pyr   r   A   s+     {+J9"//5*EEE    c                 h    t           |         }t          j        j                            | ||          S )a0  Dequantize tensor from FP8 format with per-tensor scaling.

    Args:
        x: Input FP8 tensor (float8_e4m3fn or float8_e5m2)
        scale: Scale tensor (scalar)
        output_type: Target dtype (float32, float16, or bfloat16)

    Returns:
        Dequantized tensor in specified output format
    )r   r*   r+   r,   dequantize_fp8r.   s       r0   r   r   T   s+     {+J9"11!UJGGGr1   rngc                 J    | ||d}t          j        d|          } |di |S )a  Stochastically round tensor to FP8 format.

    Args:
        x: Input tensor
        rng: Random uint8 tensor with the same shape as x
        output_type: FP8 dtype (float8_e4m3fn or float8_e5m2)

    Returns:
        Stochastically rounded FP8 tensor
    )r%   r4   r'   r#   )kwargs )r   get_implementation)r%   r4   r'   r6   impls        r0   r#   r#   g   s;     S==F&'@PPPD4>>&>>r1           FTper_tensor_scaleepsilonpad_16xhi_firstc                 R    t           j        j                            | ||||          S )aL  Quantize tensor to NVFP4 format with block-wise scaling.

    Args:
        x: Input tensor (2D)
        per_tensor_scale: Global scale factor
        epsilon: Epsilon for numerical stability
        pad_16x: If True, implicit zero-padding is applied to make dimensions divisible by 16
        hi_first: Nibble packing order. If True (default), the even-indexed element
                  is stored in the high nibble of each packed byte. If False, the
                  even-indexed element is stored in the low nibble.

    Returns:
        Tuple of (quantized_tensor, block_scales)
    )r*   r+   r,   r   )r%   r;   r<   r=   r>   s        r0   r   r   {   s(    * 9"11!5EwPWYabbbr1   qxblock_scalesc                 l    t           |         }t          j        j                            | ||||          S )aF  Dequantize tensor from NVFP4 format with block-wise scaling.

    Args:
        qx: Quantized FP4 tensor (packed as uint8)
        per_tensor_scale: Global scale factor
        block_scales: Block scales in swizzled layout (float8_e4m3fn)
        output_type: Target output dtype (float32, float16, or bfloat16)
        hi_first: Nibble packing order. Must match the packing order used
                  during quantization. If True (default), the even-indexed
                  element is in the high nibble.

    Returns:
        Dequantized tensor in specified output format
    )r   r*   r+   r,   r   )r@   r;   rA   r'   r>   r/   s         r0   r   r      s2    * {+J9"33B8H,Xbdlmmmr1   abtensor_scale_atensor_scale_bblock_scale_ablock_scale_bbias	out_dtypealphac	                     |t           j        }t          |         }	t           j        j                            | |||||||	|	  	        S )az  Matrix multiplication with NVFP4 quantized inputs.

    Computes: y = (a @ b.T) * (tensor_scale_a * tensor_scale_b) + bias

    Args:
        a: Quantized matrix A (M, K//2) in uint8 format
        b: Quantized matrix B (N, K//2) in uint8 format
        tensor_scale_a: Global scale for A
        tensor_scale_b: Global scale for B
        block_scale_a: Block-wise scales for A
        block_scale_b: Block-wise scales for B
        bias: Optional bias vector
        out_dtype: Output dtype (defaults to bfloat16)
        alpha: Output scale (tensor_scale_a * tensor_scale_b)

    Returns:
        Result tensor of shape (M, N)
    )r*   bfloat16r   r+   r,   r   )
rC   rD   rE   rF   rG   rH   rI   rJ   rK   r/   s
             r0   r   r      sL    : N	y)J9"22	1nn}dJ  r1   pad_32xc                 L    t           j        j                            | |          S )a  Quantize tensor to MXFP8 format with block-wise E8M0 scaling.

    MXFP8 uses block size 32 with power-of-2 (E8M0) block scales.

    Args:
        x: Input tensor (2D, shape M x K, K must be divisible by 32)
        pad_32x: If True, pad dimensions to be divisible by 32

    Returns:
        Tuple of (quantized_fp8_tensor, block_scales_e8m0)
        - quantized_fp8_tensor: FP8 E4M3 data of shape (M, K)
        - block_scales_e8m0: E8M0 scales in swizzled layout
    )r*   r+   r,   r   )r%   rN   s     r0   r   r      s    " 9"11!W===r1   c                 h    t           |         }t          j        j                            | ||          S )a>  Dequantize tensor from MXFP8 format.

    Args:
        qx: Quantized FP8 tensor (float8_e4m3fn)
        block_scales: E8M0 block scales in swizzled layout (float8_e8m0fnu)
        output_type: Target output dtype (float32, float16, or bfloat16)

    Returns:
        Dequantized tensor in specified output format
    )r   r*   r+   r,   r   )r@   rA   r'   r/   s       r0   r   r      s+     {+J9"33BjQQQr1   c                     |t           j        }t          |         }t           j        j                            | |||||          S )a  Matrix multiplication with MXFP8 quantized inputs.

    Computes: y = a @ b.T + bias

    Args:
        a: Quantized FP8 matrix A (M, K)
        b: Quantized FP8 matrix B (N, K)
        block_scale_a: E8M0 block scales for A in swizzled layout
        block_scale_b: E8M0 block scales for B in swizzled layout
        bias: Optional bias vector
        out_dtype: Output dtype (defaults to bfloat16)

    Returns:
        Result tensor of shape (M, N)
    )r*   rM   r   r+   r,   r   )rC   rD   rG   rH   rI   rJ   r/   s          r0   r   r      sE    . N	y)J9"22	1m]D*  r1      smooth	lora_downpad_sizeact_unsignedlora_xc                 T    t           j        j                            | |||||          S )u7  Quantize activations to int4 with smoothing + LoRA down projection.

    Args:
        x: (M, K) bf16/fp16 main-path input (caller pre-shifts if unsigned path).
        smooth: (K,) smoothing factor applied before quantization.
        lora_down: (K, R) low-rank down projection weight.
        pad_size: pad M to multiple of this value (default 256).
        act_unsigned: if True, quantize into uint4 [0, 15] (scale=max/15) for u4
            MMA downstream. Caller must ensure x is non-negative — the shift
            constant is a model-topology concern, not part of this op.
        lora_x: (M, K) optional pre-shift activation for LoRA. Defaults to x.
            Pass raw (un-shifted) x when x has been pre-shifted for unsigned path.

    Returns:
        (quantized_x uint8 [M_pad, K//2], ascales [K//64, M_pad], lora_act [M_pad, R])

    Note: eager returns fp32 lora_act as a high-precision reference. The CUDA
    backend returns lora_act in x.dtype because the runtime epilogue consumes it
    as bf16/fp16; this avoids an otherwise redundant cast/allocation.
    )r*   r+   r,   r   )r%   rS   rT   rU   rV   rW   s         r0   r   r     s.    8 9"99	69hf  r1   actwgtascaleswscaleslora_act_inlora_upc           
      X    t           j        j                            | |||||||          S )a  SVDQuant W4A4 int4 GEMM + LoRA-up + bias.

    Computes out = int4_matmul(act, wgt, ascales, wscales) + lora_act_in @ lora_up^T + bias.
    The CUDA backend performs int4 MMA + per-group dequant + bias in one
    kernel and, when lora_act_in/proj_up layout and dtype allow it, fuses
    LoRA-up into the same writeback epilogue with bf16/fp16 tensor-core MMA.
    Unsupported combinations fall back to the wrapper's bf16/fp16 addmm_ path.

    Args:
        act: (M, K//2) uint8 packed activations from quantize_svdquant_w4a4.
        wgt: (N, K//2) int8 packed weights (natural row-major), or backend
            specific tile-packed storage.
        ascales: (K//64, M) activation scales.
        wscales: (K//64, N) weight scales.
        lora_act_in: (M, R) LoRA activations from quantize step.
        lora_up: (N, R) LoRA up projection weight, or matching tile-packed
            storage for tile-packed weights.
        bias: optional (N,) bias.
        act_unsigned: if True, activations are interpreted as unsigned [0,15] by
            u4.s4 MMA (for post-GELU+shift fc2). Caller pre-shifts.

    Returns:
        (M, N) output tensor (same dtype as lora_up).
    )r*   r+   r,   r   )rY   rZ   r[   r\   r]   r^   rI   rV   s           r0   r   r   9  s3    D 9"::S'7K$  r1   @   qweightwzeros
group_sizec                 T    t           j        j                            | |||||          S )a  AWQ W4A16 quantized GEMV (for modulation-style layers called with small batch).

    Args:
        x: (..., K) bf16/fp16 input.
        qweight: (N//4, K//2) int32 packed weight.
        wscales: (K//group_size, N) per-group scales.
        wzeros: (K//group_size, N) per-group zero points.
        bias: optional (N,) bias.
        group_size: quantization group size.

    Returns:
        (..., N) output tensor.
    )r*   r+   r,   r   )r%   ra   r\   rb   rI   rc   s         r0   r   r   `  s.    * 9"11	7GVT:  r1   xqxk	freqs_cisc                 N    t           j        j                            | ||          S )a7  Apply Rotary Position Embedding (RoPE) to query and key tensors.

    Interleaved layout: pair k uses adjacent elements [2k, 2k+1].

    Args:
        xq: Query tensor
        xk: Key tensor
        freqs_cis: Precomputed frequency tensor

    Returns:
        Tuple of (transformed_query, transformed_key)
    )r*   r+   r,   r   re   rf   rg   s      r0   r   r   z  s!    " 9"--b"i@@@r1   c                 L    t           j        j                            | |          S )zApply Rotary Position Embedding (RoPE) to a single tensor.

    Interleaved layout: pair k uses adjacent elements [2k, 2k+1].

    Args:
        x: Input tensor
        freqs_cis: Precomputed frequency tensor

    Returns:
        Transformed tensor
    )r*   r+   r,   r   r%   rg   s     r0   r   r     s     9"..q)<<<r1   c                 N    t           j        j                            | ||          S )aR  Apply Rotary Position Embedding (RoPE) to query and key tensors.

    Split-half layout: pair k uses elements [k] and [k + head_dim//2].
    Matches the formula:
        t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).to(freqs.dtype)
        t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
        t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)

    Args:
        xq: Query tensor
        xk: Key tensor
        freqs_cis: Precomputed frequency tensor shape (..., head_dim//2, 2, 2)

    Returns:
        Tuple of (transformed_query, transformed_key)
    )r*   r+   r,   r   ri   s      r0   r   r     s!    * 9"88RKKKr1   c                 L    t           j        j                            | |          S )a  Apply Rotary Position Embedding (RoPE) to a single tensor.

    Split-half layout: pair k uses elements [k] and [k + head_dim//2].
    Matches the formula:
        t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).to(freqs.dtype)
        t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
        t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)

    Args:
        x: Input tensor
        freqs_cis: Precomputed frequency tensor shape (..., head_dim//2, 2, 2)

    Returns:
        Transformed tensor
    )r*   r+   r,   r   rk   s     r0   r   r     s    & 9"99!YGGGr1   priorityc                 .    t          j        |            dS )zSet the priority order for backend selection.

    Args:
        priority: List of backend names in order of preference
                 Example: ["cuda", "eager"] to prefer CUDA over Torch
    N)r   set_priority)rn   s    r0   r    r      s     (#####r1   namec                 .    t          j        |            dS )z{Disable a backend, preventing its use.

    Args:
        name: Backend name to disable ("eager", "cuda", or "triton")
    N)r   disablerq   s    r0   r"   r"     s     Tr1   c                 .    t          j        |            dS )z|Re-enable a previously disabled backend.

    Args:
        name: Backend name to enable ("eager", "cuda", or "triton")
    N)r   enablert   s    r0   r!   r!     s     ODr1   c                  (    t          j                    S )aW  Get status information for all backends.

    Returns:
        Dictionary mapping backend names to their status:
        {
            "backend_name": {
                "available": bool,
                "disabled": bool,
                "unavailable_reason": str or None,
                "capabilities": list[str]
            }
        }
    )r   r   r7   r1   r0   r   r     s     !###r1   c                 *    t          j        |           S )zContext manager to temporarily use a specific backend.

    Args:
        name: Backend name to use within the context

    Example:
        with comfy_kitchen.use_backend("eager"):
            result = comfy_kitchen.quantize_per_tensor_fp8(x, scale)
    )r   r$   rt   s    r0   r$   r$      s     %%%r1   )r:   FT)NNN)F)NN)rR   FN)NF)Nr`   )6r*   backendsr   _cuda_backendr   _eager_backendr   _triton_backendbackends.eager.quantizationr   
exceptionsr   r	   r
   r   float_utilsr   r   r   r   __version____all__float8_e4m3fnTensordtyper   rM   r   r#   floatbooltupler   r   r   r   r   r   intr   r   r   r   r   r   r   liststrr    r"   r!   dictr   r$   r7   r1   r0   <module>r      s    + + + + + + . - - - - - / / / / / / 6 6 6 6 6 6            @ ? ? ? ? ? ? ? ? ?      # # #\  %2F F|F<F F \	F F F F,  %~H H|H<H H \	H H H H,  %2 |	  \	   . c c|clc c 	c
 c 5<%&c c c c8  %~n nnln ,n 	n
 n \n n n n@ !%$(!%# #|#|# L# L	#
 <# <# ,
# {T!# <$# \# # # #P > >|>> 5<%&> > > >.  %~R RR,R R \	R R R R0 !%$( || < <	
 ,
 {T! \   F "& |L | 	
  L4 5<u|34   P !%$ $	$	$ \$ \	$
 $ \$ ,
$ $ \$ $ $ $X !% |\ \ L	
 ,
  \   4AAA |A 5<%&	A A A A(=|=|= \= = = =$LLL |L 5<%&	L L L L0H|H|H \H H H H6$49 $ $ $ $ $# $         $t $ $ $ $"
&c 
& 
& 
& 
& 
& 
&r1   