
    3j<                        S SK r S SKJs  Jr  SrSrSrSrSr	Sr
S\S	\S
\4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jr   S+S\ R                  S\ R                  S\ R                  S\S\S\ R                  S-  S
\\ R                  \ R                  \ R                  4   4S jjr  S,S\ R                  S\ R                  S\ R                  S\ R                  S \ R                  S\ R                  S!\ R                  S-  S\S
\ R                  4S" jjr\ R0                  R3                  S#S$S%9   S+S\ R                  S\ R                  S\ R                  S\S\S\ R                  S-  S
\\ R                  \ R                  \ R                  4   4S& jj5       r\R6                   S+S' j5       r\ R0                  R3                  S(S$S%9  S,S\ R                  S\ R                  S\ R                  S\ R                  S \ R                  S\ R                  S!\ R                  S-  S\S
\ R                  4S) jj5       r\R6                   S,S* j5       rg)-    N@               g      ?abreturnc                     U * U-  * $ )N )r   r	   s     _/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/backends/eager/svdquant.py	_ceil_divr   ?   s    R1W:    valuesc                 T   U R                   S   S-  S:w  a  [        SU R                   S    35      eU SSSS24   R                  [        R                  5      S-  nU SSSS24   R                  [        R                  5      S-  nXS	-  -  R                  [        R
                  5      $ )
a>  Pack (..., K) int4 values into (..., K // 2) int8 (low = even column).

Storage-level codec: handles the full 4-bit field; the caller decides which
quantizer emission range to use. Inputs may contain any value that fits in
a nibble (signed [-8, 7] or unsigned [0, 15]); values outside get masked
via the & 0x0F below.
   r   zlast dim must be even, got .Nr      r   )shape
ValueErrortotorchint32int8)r   lohis      r   _pack_int4_row_majorr   C   s     ||B!q 6v||B7G6HIJJ	QTT				ekk	*T	1B	QTT				ekk	*T	1B'Nuzz**r   packedc                 z   U R                  [        R                  5      nUS-  nUS-	  S-  n[        R                  " US:  US-
  U5      n[        R                  " US:  US-
  U5      n[        R                  " X#/SS9nUR
                  " / U R                  SS QSP76 R                  [        R                  5      $ )u   Inverse of _pack_int4_row_major with signed-nibble interpretation.

Storage-level codec: returns int8 across the full signed nibble range
[-8, 7]. This is wider than the quantizer's emission range [-7, 7] by
design — the codec must accept any bit pattern that could land in the
nibble.
r   r         r   dimN)r   r   r   wherestackreshaper   r   r   x32r   r   stackeds        r   _unpack_int4_row_majorr*   R   s     ))EKK
 C	tB
(d	B	R1Wb2gr	*B	R1Wb2gr	*Bkk2(+G??2FLL"-2r255ejjAAr   c                    U R                  [        R                  5      nUS-  nUS-	  S-  n[        R                  " X#/SS9nUR                  " / U R
                  SS QSP76 R                  [        R                  5      $ )u   Inverse of _pack_int4_row_major with unsigned-nibble interpretation.

Storage-level codec: returns int8 across the full unsigned nibble range
[0, 15] (used by the u4.s4 MMA path — post-GELU+shift fc2 activations).
r   r   r   r"   N)r   r   r   r%   r&   r   r   r'   s        r   _unpack_uint4_row_majorr,   c   sp     ))EKK
 C	tB
(d	Bkk2(+G??2FLL"-2r255ejjAAr   c                    U R                  5       S:w  a  U $ U R                  u  pp4U[        [        -  :w  a  [	        SU 35      eU[        [
        S-  -  :w  a  [	        SU 35      eU R                  XU[        [
        S-  5      R                  SSSSS5      R                  5       R                  U[        -  U[
        S-  -  5      $ )z=Convert kitchen_tile_packed_w4a4 weight to natural (N, K//2).r   z unexpected tile-packed N quads: r   z"unexpected tile-packed byte axis: r      r   )	r#   r   _TILE_PACKED_BLOCK_N_TILE_PACKED_INTERLEAVEr   _INT4_GROUP_SIZEviewpermute
contiguous)r   n_tilesk_groupsn_quadsbytes_per_quads        r    _tile_packed_weight_to_row_majorr9   p   s    zz|q17.Gw&*AAA;G9EFF04D4IJJ=n=MNOO;;7$;=MQR=RgaAq!ZZ\$$&&4D4I(J+r   wscalesc                     U R                  5       S:w  a  U $ U R                  SSS5      R                  5       R                  U R                  S   U R                  S   [
        -  5      $ )zBConvert tile-packed wscales (N/128, K/G, 128) to natural (K/G, N).r.   r   r   r   r#   r3   r4   r2   r   r/   )r:   s    r   _tile_packed_scales_to_naturalr=      s[    {{}??1a#..055a'--*-AA r   lora_upc                     U R                  5       S:w  a  U $ U R                  SSS5      R                  5       R                  U R                  S   [
        -  U R                  S   5      $ )z>Convert tile-packed proj_up (N/128, R, 128) to natural (N, R).r.   r   r   r   r<   )r>   s    r   _tile_packed_lora_up_to_naturalr@      s[    {{}??1a#..055a//q1A r   xsmooth	lora_downpad_sizeact_unsignedlora_xc                 &   U R                  5       S:w  a!  [        S[        U R                  5       35      eU R                  u  pgUR                  S     [        nXx-  S:w  a  [        SU SU 35      e[        Xc5      U-  n	Ub  UOU n
U
R                  5       UR                  5       -  nX-  nUR                  XgU-  U5      nUR                  5       R                  SS9R                  S	S
9nU(       a  [        O[        nU(       a  SO[        * nX-  nUUR                  S5      -  R                  5       R                  UU5      R                  [         R"                  5      nUR%                  Xg5      n['        U5      nX:  aU  X-
  n[(        R*                  " USSSU45      n[(        R*                  " USSSU45      n[(        R*                  " USSSU45      nUR-                  5       R/                  5       R                  U R0                  5      nUUU4$ )u  Quantize activations to int4 with smoothing + separate LoRA down projection.

Args:
    x: (M, K) bf16/fp16 — main-path input (may be shifted by caller for unsigned).
    smooth: (K,) per-channel smoothing factor applied before quantization.
    lora_down: (K, R) low-rank down projection weight. The eager reference
        here runs the matmul in fp32 for numerical stability; the CUDA
        backend takes a lower-precision bf16 matmul path for memory/launch
        savings. Do not use this eager result as a bitwise oracle for the
        CUDA output — it is a high-precision reference, not a backend parity
        target.
    pad_size: pad M to a multiple of this (default 256) to match downstream kernels.
    act_unsigned: if True, quantize into uint4 [0, 15] (scale=max/15) instead of
        signed int4 [-7, 7] (scale=max/7). Selects MMA grid for downstream u4.s4.
    lora_x: (M, K) bf16/fp16 — input for LoRA matmul. Defaults to x. Pass
        separately when caller pre-shifts x (SVDQuant LoRA is mathematically
        defined on pre-quantization, pre-shift activations).

Returns:
    q_x: (M_pad, K // 2) int8 packed (2 int4 per byte, same layout as weight).
    ascales: (K // 64, M_pad) same dtype as x — per-row per-group scales.
    lora_act: (M_pad, R) fp32 LoRA activations.
r   zexpected 2D input, got shape r   r   zK=z not divisible by group_size=r   r"   g|=)min)r#   r   tupler   r1   r   floatr2   absamaxclamp
_UINT4_MAX	_INT4_MAX	unsqueezeroundr   r   r   r&   r   Fpadtr4   dtype)rA   rB   rC   rD   rE   rF   mkgroupm_padlora_srclora_actx_smoothgroupsabsmaxqmaxqminscalesq_valsq_packedrS   ascaless                         r   quantize_svdquant_w4a4re      s   > 	uuw!|8qww8HIJJ77DAOOAEyA~2aS =eWEFFa"X-E  +vH~~)//"33H
 zH]]15j%0FZZ\2&,,,7F
 &:9D19*D]Fv''++224::4FII%**UF^^A!F#F+Hyi55Aq!S>2v1a~.55Aq!S>2hhj##%((1GWh&&r   actwgtrd   lora_act_inbiasc                 ,   [        U5      n[        U5      n[        U5      nU R                  u  pUR                  S   n
U	S-  n[        nUR
                  n[        U5      R                  U5      nUR                  XU-  U5      nUR                  5       R                  S5      nUU-  R                  X5      nU(       a  [        U 5      O
[        U 5      nUR                  U5      R                  XU-  U5      nUR                  5       R                  S5      nUU-  R                  X5      nUUR                  5       -  nUR                  5       UR                  5       R                  5       -  nUUR                  UR
                  5      -   nUb  UU-   nU$ )u  SVDQuant W4A4 int4 GEMM + LoRA up + optional bias (eager reference impl).

Semantic mirror of the CUDA path; used as a high-precision reference and on
non-CUDA devices. Numerical notes:

* Quantization grid and emission range match the CUDA kernel exactly
  (signed absmax/7, unsigned absmax/15 — see module header).
* The LoRA-up branch here accumulates in fp32 for stability. The CUDA
  wrapper takes a bf16 addmm_ path for memory/launch-count savings and
  drops the fp32 lora_act_in precision. This eager output is therefore a
  high-precision reference, not a bit-parity oracle for the CUDA backend.
  Tests that compare the two paths should use tolerances consistent with
  bf16 matmul (~8e-3 abs).

Args:
    act: (M, K // 2) int8 packed activations from quantize_svdquant_w4a4.
        Bit-pattern interpretation depends on act_unsigned (signed [-7,7]
        vs unsigned [0,15]).
    wgt: (N, K // 2) int8 packed weights (kitchen natural row-major) or
        (N/128, K/64, 32, 128) kitchen_tile_packed_w4a4.
    ascales: (K // 64, M) per-row per-group activation scales.
    wscales: (K // 64, N) natural or (N/128, K/64, 128) tile-packed.
    lora_act_in: (M, R) fp32 LoRA down-projection activations.
    lora_up: (N, R) natural or (N/128, R, 128) tile-packed.
    bias: (N,) bias or None.
    act_unsigned: if True, interpret packed activations as unsigned [0, 15]
        (matches the u4.s4 MMA path used for post-GELU+shift fc2 layers).

Returns:
    out: (M, N) in the dtype of wscales/lora_up.
r   r   r   )r9   r=   r@   r   r1   rU   r*   r   r2   rT   rP   r,   rJ   )rf   rg   rd   r:   rh   r>   ri   rE   rV   k_halfnrW   rX   compute_dtypewgt_intwgt_gwscales_bngwgt_fpact_intascales_mngact_fpoutlora_contributions                          r   scaled_mm_svdquant_w4a4rw      sm   R +3
/C,W5G-g6G		IA		!A
AEMMM %S),,];GLLJ.E))+''+Kk!''-F /;%c*@VWZ@[Gjj',,QU
EBG))+''+K#))!/F
688:
C $))+gmmo.?.?.AA
!$$SYY/
/CDjJr   z%comfy_kitchen::quantize_svdquant_w4a4r   )mutates_argsc                 P    SSK Jn  U UUUUUS.nUR                  SUS9nU" S0 UD6$ )Nr   registry)rA   rB   rC   rD   rE   rF   re   kwargsr   comfy_kitchen.registryr{   get_implementation)	rA   rB   rC   rD   rE   rF   r{   r}   impls	            r   _op_quantize_svdquant_w4a4r   ,  sF     0 $F &&'?&OD>&>r   c                    U R                   u  pgUR                   S   n[        Xc5      U-  n	[        R                  " XS-  [        R                  U R
                  S9n
[        R                  " U[        -  XR                  U R
                  S9n[        R                  " X[        R                  U R
                  S9nXU4$ )Nr   r   rU   device)	r   r   r   emptyr   r   r1   rU   float32)rA   rB   rC   rD   rE   rF   rV   rW   rrY   q_xrd   r[   s                r   _op_quantize_svdquant_w4a4_faker   C  s     77DAAa"X-E
++e!V5::ahh
GCkk!//ggahhWG{{55==JH!!r   z&comfy_kitchen::scaled_mm_svdquant_w4a4c           	      L    SSK Jn  XX#XEXgS.n	UR                  SU	S9n
U
" S0 U	D6$ )Nr   rz   )rf   rg   rd   r:   rh   r>   ri   rE   rw   r|   r   r~   )rf   rg   rd   r:   rh   r>   ri   rE   r{   r}   r   s              r   _op_scaled_mm_svdquant_w4a4r   P  s?     0 7"F
 &&'@&PD>&>r   c                     U R                   S   nUR                  5       S:X  a  UR                   S   [        -  OUR                   S   n	[        R                  " XUR
                  U R                  S9$ )Nr   r   r   )r   r#   r/   r   r   rU   r   )
rf   rg   rd   r:   rh   r>   ri   rE   rV   rl   s
             r    _op_scaled_mm_svdquant_w4a4_faker   f  sU     			!A/2wwyA~		!++399Q<A;;q7==DDr   )   FN)NF)r   torch.nn.functionalnn
functionalrR   r1   r/   r0   rO   rN   _GELU_UNSIGNED_SHIFTintr   Tensorr   r*   r,   r9   r=   r@   boolrI   re   rw   library	custom_opr   register_faker   r   r   r   r   r   <module>r      s  ^       	
    + +%,, +B5<< BELL B"
BELL 
BU\\ 
BU\\ ell  ELL U\\ U\\ ell  "&D'||D'LLD' ||D' 	D'
 D' LL4D' 5<<u||34D'\ !%G	G	G \\G \\	G
 G \\G ,,
G G \\Gf @rR
 "&||LL || 	
  LL4 5<<u||34 S, ))CG	" *	" APRS !%		 \\ \\	
  \\ ,,
  \\ T* **NSE +Er   