ó
    ¤3j—<  ã                   óÄ  • S SK r S SKJs  Jr  SrSrSrSrSr	Sr
S\S	\S
\4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jrS\ R                  S
\ R                  4S jr   S+S\ R                  S\ R                  S\ R                  S\S\S\ R                  S-  S
\\ R                  \ R                  \ R                  4   4S jjr  S,S\ R                  S\ R                  S\ R                  S\ R                  S \ R                  S\ R                  S!\ R                  S-  S\S
\ R                  4S" jjr\ R0                  R3                  S#S$S%9   S+S\ R                  S\ R                  S\ R                  S\S\S\ R                  S-  S
\\ R                  \ R                  \ R                  4   4S& jj5       r\R6                   S+S' j5       r\ R0                  R3                  S(S$S%9  S,S\ R                  S\ R                  S\ R                  S\ R                  S \ R                  S\ R                  S!\ R                  S-  S\S
\ R                  4S) jj5       r\R6                   S,S* j5       rg)-é    Né@   é€   é   é   é   g      Æ?ÚaÚbÚreturnc                 ó   • U * U-  * $ )N© )r   r	   s     Ú_/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/backends/eager/svdquant.pyÚ	_ceil_divr   ?   s   € ØˆR1‰Wˆ:Ðó    Úvaluesc                 óT  • U R                   S   S-  S:w  a  [        SU R                   S    35      eU SSSS24   R                  [        R                  5      S-  nU SSSS24   R                  [        R                  5      S-  nXS	-  -  R                  [        R
                  5      $ )
a>  Pack (..., K) int4 values into (..., K // 2) int8 (low = even column).

Storage-level codec: handles the full 4-bit field; the caller decides which
quantizer emission range to use. Inputs may contain any value that fits in
a nibble (signed [-8, 7] or unsigned [0, 15]); values outside get masked
via the & 0x0F below.
éÿÿÿÿé   r   zlast dim must be even, got .Nr   é   r   )ÚshapeÚ
ValueErrorÚtoÚtorchÚint32Úint8)r   ÚloÚhis      r   Ú_pack_int4_row_majorr   C   s¢   € ð ‡||BÑ˜!Ñ˜qÓ ÜÐ6°v·|±|ÀBÑ7GÐ6HÐIÓJÐJØ	QT˜T	Ñ	×	Ñ	œeŸk™kÓ	*¨TÑ	1€BØ	QT˜T	Ñ	×	Ñ	œeŸk™kÓ	*¨TÑ	1€BØ˜‘'‰N×ÑœuŸz™zÓ*Ð*r   Úpackedc                 óz  • U R                  [        R                  5      nUS-  nUS-	  S-  n[        R                  " US:¬  US-
  U5      n[        R                  " US:¬  US-
  U5      n[        R                  " X#/SS9nUR
                  " / U R                  SS QSP76 R                  [        R                  5      $ )u   Inverse of _pack_int4_row_major with signed-nibble interpretation.

Storage-level codec: returns int8 across the full signed nibble range
[-8, 7]. This is wider than the quantizer's emission range [-7, 7] by
design â€” the codec must accept any bit pattern that could land in the
nibble.
r   r   é   é   r   ©ÚdimN)r   r   r   ÚwhereÚstackÚreshaper   r   ©r   Úx32r   r   Ústackeds        r   Ú_unpack_int4_row_majorr*   R   s¤   € ð )‰)”E—K‘KÓ
 €CØ	ˆt‰€BØ
‰(dÑ	€BÜ	ŠR˜1‘W˜b 2™g rÓ	*€BÜ	ŠR˜1‘W˜b 2™g rÓ	*€BÜkŠk˜2˜(¨Ñ+€GØ?Š?Ð2˜FŸL™L¨¨"Ð-Ð2¨rÒ2×5Ñ5´e·j±jÓAÐAr   c                 ó  • U R                  [        R                  5      nUS-  nUS-	  S-  n[        R                  " X#/SS9nUR                  " / U R
                  SS QSP76 R                  [        R                  5      $ )uØ   Inverse of _pack_int4_row_major with unsigned-nibble interpretation.

Storage-level codec: returns int8 across the full unsigned nibble range
[0, 15] (used by the u4.s4 MMA path â€” post-GELU+shift fc2 activations).
r   r   r   r"   N)r   r   r   r%   r&   r   r   r'   s        r   Ú_unpack_uint4_row_majorr,   c   sp   € ð )‰)”E—K‘KÓ
 €CØ	ˆt‰€BØ
‰(dÑ	€BÜkŠk˜2˜(¨Ñ+€GØ?Š?Ð2˜FŸL™L¨¨"Ð-Ð2¨rÒ2×5Ñ5´e·j±jÓAÐAr   c                 ó  • U R                  5       S:w  a  U $ U R                  u  pp4U[        [        -  :w  a  [	        SU 35      eU[        [
        S-  -  :w  a  [	        SU 35      eU R                  XU[        [
        S-  5      R                  SSSSS5      R                  5       R                  U[        -  U[
        S-  -  5      $ )z=Convert kitchen_tile_packed_w4a4 weight to natural (N, K//2).r   z unexpected tile-packed N quads: r   z"unexpected tile-packed byte axis: r   é   r   )	r#   r   Ú_TILE_PACKED_BLOCK_NÚ_TILE_PACKED_INTERLEAVEr   Ú_INT4_GROUP_SIZEÚviewÚpermuteÚ
contiguous)r   Ún_tilesÚk_groupsÚn_quadsÚbytes_per_quads        r   Ú _tile_packed_weight_to_row_majorr9   p   sÆ   € à‡zzƒ|qÓØˆØ17·±Ñ.€GwØÔ&Ô*AÑAÓAÜÐ;¸G¸9ÐEÓFÐFØÔ0Ô4DÈÑ4IÑJÓJÜÐ=¸nÐ=MÐNÓOÐOØ;‰;Ø˜7Ô$;Ô=MÐQRÑ=RóçgˆaAq˜!ÓŸZ™Z›\¯$©$ØÔ&Ñ&¨Ô4DÈÑ4IÑ(Jó+ðr   Úwscalesc                 óÒ   • U R                  5       S:w  a  U $ U R                  SSS5      R                  5       R                  U R                  S   U R                  S   [
        -  5      $ )zBConvert tile-packed wscales (N/128, K/G, 128) to natural (K/G, N).r.   r   r   r   ©r#   r3   r4   r2   r   r/   )r:   s    r   Ú_tile_packed_scales_to_naturalr=   €   s[   € à‡{{ƒ}˜ÓØˆØ?‰?˜1˜a Ó#×.Ñ.Ó0×5Ñ5Ø‰aÑ˜'Ÿ-™-¨Ñ*Ô-AÑAóð r   Úlora_upc                 óÒ   • U R                  5       S:w  a  U $ U R                  SSS5      R                  5       R                  U R                  S   [
        -  U R                  S   5      $ )z>Convert tile-packed proj_up (N/128, R, 128) to natural (N, R).r.   r   r   r   r<   )r>   s    r   Ú_tile_packed_lora_up_to_naturalr@   ‰   s[   € à‡{{ƒ}˜ÓØˆØ?‰?˜1˜a Ó#×.Ñ.Ó0×5Ñ5Ø‰aÑÔ/Ñ/°·±¸qÑ1Aóð r   ÚxÚsmoothÚ	lora_downÚpad_sizeÚact_unsignedÚlora_xc                 ó&  • U R                  5       S:w  a!  [        S[        U R                  5       35      eU R                  u  pgUR                  S     [        nXx-  S:w  a  [        SU SU 35      e[        Xc5      U-  n	Ub  UOU n
U
R                  5       UR                  5       -  nX-  nUR                  XgU-  U5      nUR                  5       R                  SS9R                  S	S
9nU(       a  [        O[        nU(       a  SO[        * nXï-  nUUR                  S5      -  R                  5       R                  UU5      R                  [         R"                  5      nUR%                  Xg5      n['        U5      nX–:”  aU  X–-
  n[(        R*                  " USSSU45      n[(        R*                  " USSSU45      n[(        R*                  " USSSU45      nUR-                  5       R/                  5       R                  U R0                  5      nUUU4$ )u  Quantize activations to int4 with smoothing + separate LoRA down projection.

Args:
    x: (M, K) bf16/fp16 â€” main-path input (may be shifted by caller for unsigned).
    smooth: (K,) per-channel smoothing factor applied before quantization.
    lora_down: (K, R) low-rank down projection weight. The eager reference
        here runs the matmul in fp32 for numerical stability; the CUDA
        backend takes a lower-precision bf16 matmul path for memory/launch
        savings. Do not use this eager result as a bitwise oracle for the
        CUDA output â€” it is a high-precision reference, not a backend parity
        target.
    pad_size: pad M to a multiple of this (default 256) to match downstream kernels.
    act_unsigned: if True, quantize into uint4 [0, 15] (scale=max/15) instead of
        signed int4 [-7, 7] (scale=max/7). Selects MMA grid for downstream u4.s4.
    lora_x: (M, K) bf16/fp16 â€” input for LoRA matmul. Defaults to x. Pass
        separately when caller pre-shifts x (SVDQuant LoRA is mathematically
        defined on pre-quantization, pre-shift activations).

Returns:
    q_x: (M_pad, K // 2) int8 packed (2 int4 per byte, same layout as weight).
    ascales: (K // 64, M_pad) same dtype as x â€” per-row per-group scales.
    lora_act: (M_pad, R) fp32 LoRA activations.
r   zexpected 2D input, got shape r   r   zK=z not divisible by group_size=r   r"   g»½×Ùß|Û=)Úmin)r#   r   Útupler   r1   r   Úfloatr2   ÚabsÚamaxÚclampÚ
_UINT4_MAXÚ	_INT4_MAXÚ	unsqueezeÚroundr   r   r   r&   r   ÚFÚpadÚtr4   Údtype)rA   rB   rC   rD   rE   rF   ÚmÚkÚgroupÚm_padÚlora_srcÚlora_actÚx_smoothÚgroupsÚabsmaxÚqmaxÚqminÚscalesÚq_valsÚq_packedrS   Úascaless                         r   Úquantize_svdquant_w4a4re   ’   s×  € ð> 	‡uuƒw!ƒ|ÜÐ8¼¸q¿w¹w»Ð8HÐIÓJÐJØ7‰7D€AØ‡OOAÒÜ€EØyAƒ~Ü˜2˜a˜SÐ =¸e¸WÐEÓFÐFÜaÓ" XÑ-€Eð  Ñ+‰v°€HØ~‰~Ó )§/¡/Ó"3Ñ3€Hð
 ‰z€HØ]‰]˜1 5™j¨%Ó0€FØZ‰Z‹\×Ñ 2ÐÐ&×,Ñ,°Ð,Ð7€Fö
 &:¬9€DÞ‰1¤9 *€DØ‰]€FØv×'Ñ'¨Ó+Ñ+×2Ñ2Ó4×:Ñ:¸4ÀÓF×IÑIÌ%Ï*É*ÓU€FØ^‰^˜AÓ!€FÜ# FÓ+€HàƒyØ‰iˆÜ—5’5˜ A q¨!¨S >Ó2ˆÜ—’v  1 a¨˜~Ó.ˆÜ—5’5˜ A q¨!¨S >Ó2ˆàh‰h‹j×#Ñ#Ó%×(Ñ(¨¯©Ó1€GØW˜hÐ&Ð&r   ÚactÚwgtrd   Úlora_act_inÚbiasc                 ó,  • [        U5      n[        U5      n[        U5      nU R                  u  p‰UR                  S   n
U	S-  n[        nUR
                  n[        U5      R                  U5      nUR                  X«U-  U5      nUR                  5       R                  S5      nUU-  R                  X«5      nU(       a  [        U 5      O
[        U 5      nUR                  U5      R                  X‹U-  U5      nUR                  5       R                  S5      nUU-  R                  X‹5      nUUR                  5       -  nUR                  5       UR                  5       R                  5       -  nUUR                  UR
                  5      -   nUb  UU-   nU$ )ué  SVDQuant W4A4 int4 GEMM + LoRA up + optional bias (eager reference impl).

Semantic mirror of the CUDA path; used as a high-precision reference and on
non-CUDA devices. Numerical notes:

* Quantization grid and emission range match the CUDA kernel exactly
  (signed absmax/7, unsigned absmax/15 â€” see module header).
* The LoRA-up branch here accumulates in fp32 for stability. The CUDA
  wrapper takes a bf16 addmm_ path for memory/launch-count savings and
  drops the fp32 lora_act_in precision. This eager output is therefore a
  high-precision reference, not a bit-parity oracle for the CUDA backend.
  Tests that compare the two paths should use tolerances consistent with
  bf16 matmul (~8e-3 abs).

Args:
    act: (M, K // 2) int8 packed activations from quantize_svdquant_w4a4.
        Bit-pattern interpretation depends on act_unsigned (signed [-7,7]
        vs unsigned [0,15]).
    wgt: (N, K // 2) int8 packed weights (kitchen natural row-major) or
        (N/128, K/64, 32, 128) kitchen_tile_packed_w4a4.
    ascales: (K // 64, M) per-row per-group activation scales.
    wscales: (K // 64, N) natural or (N/128, K/64, 128) tile-packed.
    lora_act_in: (M, R) fp32 LoRA down-projection activations.
    lora_up: (N, R) natural or (N/128, R, 128) tile-packed.
    bias: (N,) bias or None.
    act_unsigned: if True, interpret packed activations as unsigned [0, 15]
        (matches the u4.s4 MMA path used for post-GELU+shift fc2 layers).

Returns:
    out: (M, N) in the dtype of wscales/lora_up.
r   r   r   )r9   r=   r@   r   r1   rU   r*   r   r2   rT   rP   r,   rJ   )rf   rg   rd   r:   rh   r>   ri   rE   rV   Úk_halfÚnrW   rX   Úcompute_dtypeÚwgt_intÚwgt_gÚwscales_bngÚwgt_fpÚact_intÚascales_mngÚact_fpÚoutÚlora_contributions                          r   Úscaled_mm_svdquant_w4a4rw   Ù   sm  € ôR +¨3Ó
/€CÜ,¨WÓ5€GÜ-¨gÓ6€Gà—	‘	I€AØ	‰	!‰€AØ‰
€AÜ€EØ—M‘M€Mô % SÓ)×,Ñ,¨]Ó;€GØL‰L˜ ™J¨Ó.€EØ—)‘)“+×'Ñ'¨Ó+€KØkÑ!×'Ñ'¨Ó-€Fö /;Ô% cÔ*Ô@VÐWZÓ@[€GØj‰j˜Ó'×,Ñ,¨Q°U±
¸EÓB€GØ—)‘)“+×'Ñ'¨Ó+€KØ˜Ñ#×)Ñ)¨!Ó/€Fà
6—8‘8“:Ñ
€Cð $×)Ñ)Ó+¨g¯m©m«o×.?Ñ.?Ó.AÑAÐØ
Ð!×$Ñ$ S§Y¡YÓ/Ñ
/€CàÑØD‰jˆØ€Jr   z%comfy_kitchen::quantize_svdquant_w4a4r   )Úmutates_argsc                 óP   • SSK Jn  U UUUUUS.nUR                  SUS9nU" S0 UD6$ )Nr   ©Úregistry)rA   rB   rC   rD   rE   rF   re   ©Úkwargsr   ©Úcomfy_kitchen.registryr{   Úget_implementation)	rA   rB   rC   rD   rE   rF   r{   r}   Úimpls	            r   Ú_op_quantize_svdquant_w4a4r‚   ,  sF   € õ 0ð ØØØØ$Øñ€Fð ×&Ñ&Ð'?ÈÐ&ÐO€DÙ‰>&‰>Ðr   c                 ó€  • U R                   u  pgUR                   S   n[        Xc5      U-  n	[        R                  " X—S-  [        R                  U R
                  S9n
[        R                  " U[        -  XR                  U R
                  S9n[        R                  " X˜[        R                  U R
                  S9nX«U4$ )Nr   r   ©rU   Údevice)	r   r   r   Úemptyr   r…   r1   rU   Úfloat32)rA   rB   rC   rD   rE   rF   rV   rW   ÚrrY   Úq_xrd   r[   s                r   Ú_op_quantize_svdquant_w4a4_fakerŠ   C  s   € ð 7‰7D€AØ‰˜Ñ€AÜaÓ" XÑ-€EÜ
+Š+e !™V¬5¯:©:¸a¿h¹hÑ
G€CÜkŠk˜!Ô/Ñ/°¿g¹gÈaÏhÉhÑW€GÜ{Š{˜5¬5¯=©=ÀÇÁÑJ€HØ˜Ð!Ð!r   z&comfy_kitchen::scaled_mm_svdquant_w4a4c           	      óL   • SSK Jn  XX#XEXgS.n	UR                  SU	S9n
U
" S0 U	D6$ )Nr   rz   )rf   rg   rd   r:   rh   r>   ri   rE   rw   r|   r   r~   )rf   rg   rd   r:   rh   r>   ri   rE   r{   r}   r   s              r   Ú_op_scaled_mm_svdquant_w4a4rŒ   P  s?   € õ 0ð ¨7Ø"Øñ€Fð
 ×&Ñ&Ð'@ÈÐ&ÐP€DÙ‰>&‰>Ðr   c                 óæ   • U R                   S   nUR                  5       S:X  a  UR                   S   [        -  OUR                   S   n	[        R                  " X‰UR
                  U R                  S9$ )Nr   r   r„   )r   r#   r/   r   r†   rU   r…   )
rf   rg   rd   r:   rh   r>   ri   rE   rV   rl   s
             r   Ú _op_scaled_mm_svdquant_w4a4_fakerŽ   f  sU   € ð 		‰	!‰€AØ/2¯w©w«y¸A«~ˆ	‰	!‰Ô+Ò+À3Ç9Á9ÈQÁ<€AÜ;Š;q 7§=¡=¸¿¹ÑDÐDr   )é   FN)NF)r   Útorch.nn.functionalÚnnÚ
functionalrR   r1   r/   r0   rO   rN   Ú_GELU_UNSIGNED_SHIFTÚintr   ÚTensorr   r*   r,   r9   r=   r@   ÚboolrI   re   rw   ÚlibraryÚ	custom_opr‚   Úregister_fakerŠ   rŒ   rŽ   r   r   r   Ú<module>rš      s   ðó^ ß Ð àÐ ØÐ ØÐ ð €	Ø€
ØÐ ðð ˜ð  ô ð+ §¡ð +°%·,±,ô +ðB 5§<¡<ð B°E·L±Lô Bð"
B E§L¡Lð 
B°U·\±\ô 
Bð¨U¯\©\ð ¸e¿l¹lô ð ¨E¯L©Lð ¸U¿\¹\ô ð¨U¯\©\ð ¸e¿l¹lô ð ØØ"&ñD'Ø‡||ðD'àL‰LðD'ð |‰|ðD'ð ð	D'ð
 ðD'ð L‰L˜4ÑðD'ð ˆ5<‰<˜Ÿ™ u§|¡|Ð3Ñ4õD'ð\ !%ØñGØ	‰ðGà	‰ðGð \‰\ðGð \‰\ð	Gð
 —‘ðGð \‰\ðGð ,‰,˜Ñ
ðGð ðGð ‡\\õGðf ‡×ÑÐ@ÈrÐÐRð
 ØØ"&ñØ‡||ðàL‰Lðð |‰|ðð ð	ð
 ðð L‰L˜4Ñðð ˆ5<‰<˜Ÿ™ u§|¡|Ð3Ñ4ôó Sðð, ×)Ñ)àCGó	"ó *ð	"ð ‡×ÑÐAÐPRÐÐSð !%ØñØ	‰ðà	‰ðð \‰\ðð \‰\ð	ð
 —‘ðð \‰\ðð ,‰,˜Ñ
ðð ðð ‡\\ôó Tðð* ×*Ñ*àNSóEó +ñEr   