
    3j}X                       S r SSKJr  SSKrSSKrSSKJr  SSKJr  SSK	J
r
  SSKrSSKrSSKJrJrJrJr  \
(       a  SS	KJr  \R(                  " \5      rS
rSrSr1 SkrS$S jrS%S jrS&S jrS'S jrS(S jrSSS.       S)S jjr               S*S jr! S+SSS.           S,S jjjr"S-S jr# S+SSS.         S.S jjjr$          S/S jr% " S S\5      r&        S0S jr'\" \RP                  RR                  RT                  RV                  \&5      S 5       r,S1S  jr-\" \RP                  RR                  R\                  RV                  \&5      S! 5       r/\" \RP                  RR                  R`                  RV                  \&5      S" 5       r1\" \RP                  RR                  Rd                  RV                  \&5      S# 5       r3g)2a~  SVDQuant W4A4 quantization layout for tensor cores.

Each quantized linear stores:
  qweight:       (N, K // 2)  int8        packed W4 residual
  scale=wscales: (K // 64, N) bf16/fp16   per-group weight scales
  proj_down:     (K, R)       bf16/fp16   SVD down projection (V^T)
  proj_up:       (N, R)       bf16/fp16   SVD up projection (U)
  smooth_factor: (K,)         bf16/fp16   input-side smoothing

LoRA-style proj_down / proj_up recover the outlier-heavy singular directions
that pure 4-bit quantization cannot represent; the dispatched kernel fuses
activation quantization + low-rank correction + int4 matmul into a single call.
    )annotationsN)Sequence)	dataclass)TYPE_CHECKING   )BaseLayoutParamsQuantizedLayoutdequantize_argsregister_layout_opQuantizedTensor@      g      ?>   0noofffalsec                   [         R                  " U 5      nUc  U$ UR                  5       R                  5       [        ;  $ N)osgetenvstriplower_FALSE_ENV_VALUES)namedefaultvalues      \/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/tensor/svdquant_w4a4.py_env_enabledr   ,   s4    IIdOE};;= (999    c                 L    [        [        R                  R                  SS 5      $ )Nbackend_override)getattrckregistry_thread_local r    r   _registry_backend_overrider(   3   s    2;;,,.@$GGr    c                   [        SSS9(       d  g[        5       nUb  US:w  a  gU R                  (       a  UR                  (       d  gU R                  UR                  :w  a  g SSKJn  [        USS	5      (       d  gU$ ! [         a     gf = f)
a  Return the CUDA backend module when SVDQuant can safely bypass registry.

ComfyUI currently disables comfy_kitchen's global CUDA backend on PyTorch
CUDA < 13, but this SVDQuant extension is built locally and works on the
cu128/RTX 5090 environment. The QuantizedTensor path can call the CUDA
implementation directly while still respecting an explicit non-CUDA backend
override such as ``with ck.use_backend("eager")``.
"COMFY_KITCHEN_SVDQUANT_DIRECT_CUDAT)r   Ncudar   )r+   _EXT_AVAILABLEF)r   r(   is_cudadevicecomfy_kitchen.backendsr+   	Exceptionr#   )input_tensorqdataoverridecuda_backends       r   _direct_cuda_backendr5   7   s     <dK)+HF 2u}}ell*? <!1599	  s   $A? ?
BBc                    SSK Jn  [        X5      =(       a;    U R                  S:H  =(       a%    [	        [        U R                  SS5      5      (       + $ )Nr   r   TensorCoreSVDQuantW4A4Layout
transposedF)baser   
isinstance_layout_clsboolr#   _params)tensorr   s     r   _is_svdquant_w4a4_qtensorr?   U   sG    % 	6+ 	C"@@	CWV^^\5ABBr    c                    U R                   UR                   :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ r   )shapedtyper.   )abs     r   _same_tensor_metadatarE   _   s9    77aggM!''QWW"4MQXX9MMr    Fvalidatetrustc               h   [        U 5      S:X  a  g[        S U  5       5      (       d  gU S   R                  nU SS  GHq  nUR                  n[        UR                  5      [        UR                  5      :w  a    g[        UR                  UR                  5      (       d    g[        UR                  UR                  5      (       d    gUR                  R                  5       UR                  R                  5       :X  a8  UR                  R                  5       UR                  R                  5       :X  a  M  U(       a  GM  U(       d    g[        R                  " UR                  UR                  5      (       d    g[        R                  " UR                  UR                  5      (       a  GMr    g   g)a  Return True when split SVDQuant projections can reuse activation quantize.

Qwen split Q/K/V checkpoints store separate weight tensors but share the
exact same activation-side SVDQuant parameters: ``smooth_factor`` and
``proj_down``. When that invariant holds, the expensive activation int4
quantization and LoRA-down matmul can run once and feed all projections.

``validate=True`` compares tensor values when pointers differ. Use it once
at module setup or first forward, then cache the result at the caller. Pass
``trust=True`` only after such a cached validation, for temporary casted
copies whose tensor pointers differ but whose source parameters were
already proven identical.
r   Fc              3  8   #    U  H  n[        U5      v   M     g 7fr   )r?   ).0weights     r   	<genexpr>0svdquant_w4a4_can_share_quant.<locals>.<genexpr>x   s     GwV(00ws   r   NT)lenallr=   r<   act_unsignedrE   smooth_factor	proj_downdata_ptrtorchequal)weightsrG   rH   r9   rL   paramss         r   svdquant_w4a4_can_share_quantrY   c   sB   & 7|qGwGGG1:D!"+##$T->->(??$V%9%94;M;MNN$V%5%5t~~FF((*d.@.@.I.I.KK%%'4>>+B+B+DD{{6//1C1CDD{{6++T^^<<' ( r    c               
   [         R                  U5      u  pxpn[        [        UR                  SS5      5      nUb  UR                  XXX+UUS9nO[        R
                  " XXX+UUS9n[         R                  U5      nUS U U4$ )NrQ   Factwgtascaleswscaleslora_act_inlora_upbiasrQ   )r7   get_plain_tensorsr<   r#   r=   scaled_mm_svdquant_w4a4r$   get_out_features_from_storage)q_xr^   lora_act	weight_qtrb   r4   mr2   r_   _smooth
_proj_downproj_uprQ   outout_featuress                  r   '_w4a4_forward_from_quantized_activationro      s     4P3a3abk3l0EG	 1 1>5IJL22 % 3 
 (( %

 0MMeTLr7L  r    )validate_shared_quantassume_shared_quantc               2   [        U5      S:X  a  gUc  S[        U5      -  n[        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      e[        XUS9(       d  [        S	5      eUS   n[        R	                  U5      u  pgpn
[        [        UR                  S
S5      5      nU R                  nU R                  SUS   5      nUR                  S   nU(       a  U[        -   nUnOUnSn[        X5      nUb  UR                  UUU	UUSS9u  nnnO[        R                  " UUU	UUS9u  nnn/ n[        XSS9 H>  u  nn[        UUUUUUUS9u  nnUR!                  UR                  " / USS QUP76 5        M@     [#        U5      $ )a  Run split SVDQuant linears while sharing activation quantize.

This is intentionally a runtime grouping helper, not a fused-QKV storage
format. It preserves split Q/K/V checkpoint tensors and only removes the
repeated per-input work that is identical across those projections.
r   r'   Nr   got  weights but  biasesrF   5SVDQuant weights do not share quantization parametersrQ   FTsmooth	lora_downrQ   lora_xreuse_workspacery   rz   rQ   r{   strict)ri   )rO   
ValueErrorrY   r7   rc   r<   r#   r=   rA   reshape_GELU_UNSIGNED_SHIFTr5   quantize_svdquant_w4a4r$   zipro   appendtuple)r1   rW   biasesrp   rq   firstr2   _wscalesry   rS   _proj_uprQ   
orig_shapex2dri   x_mainr{   r4   rf   r^   rg   outputsrL   rb   out2drn   s                             r   svdquant_w4a4_grouped_linearr      s    7|q~3w<'
6{c'l"4G~]3v;-wOPP(7J PQQAJE3O3a3abg3h0EV~uEFL##J


r:b>
2C		!A++'6L!-!D!D%  "E "
Wh "$!:!:%"
Wh GGD9E(FD,!
| 	u}}Dj"oD|DE	 :
 >r    c               0  ^ U S   R                  5       m[        U4S jU  5       5      (       a  [        S5      eTS;   a,  [        R                  " [        U 5      SS9R                  5       $ [        R                  " [        U 5      US9R                  5       $ )Nr   c              3  H   >#    U  H  oR                  5       T:g  v   M     g 7fr   dim)rK   tr   s     r   rM   '_cat_svdquant_n_axis.<locals>.<genexpr>   s     
+7a557c>7s   "z6cannot fuse mixed natural/tile-packed SVDQuant tensors>         r   )r   anyr   rU   catr   
contiguous)tensorsnatural_dimr   s     @r   _cat_svdquant_n_axisr      ss    
!*..
C

+7
+++QRR
f}yywQ/::<<99U7^5@@BBr    c          
     J   SSK Jn  [        U 5      S:X  a  [        S5      eUc  S[        U 5      -  n[        U5      [        U 5      :w  a$  [        S[        U 5       S[        U5       S	35      e[	        XUS
9(       d  [        S5      eU S   nUR
                  nUR                  S   n[        UR                  5      n/ n	/ n
/ n/ nU  H  n[        R                  U5      u  pnnnUR
                  R                  S   U:w  a  [        S5      e[        UR
                  R                  5      U:w  a  [        S5      eU	R                  [        R                  U5      5        U
R                  U5        UR                  U5        UR                  U5        M     [        U
SS9n[        USS9n[        USS9n[        S U 5       5      (       a  SnO/ n[        XSS9 HD  u  nnUc*  [         R"                  " UUR$                  UR&                  S9nUR                  U5        MF     [         R(                  " [+        U5      SS9R-                  5       n[        R/                  UUR0                  [3        U	5      U4UR4                  UUR6                  US9nU" USU5      U[+        U	5      4$ )a  Fuse split SVDQuant projections into a runtime-only wide projection.

This does not change checkpoint storage. It concatenates the already loaded
split weights along output-N so a caller can execute Q/K/V as one wider
SVDQuant linear and split the output view afterwards.
r   r   r   zexpected at least one weightNr   rs   rt   ru   rF   rv   z<all fused SVDQuant weights must have the same input featuresz?all fused SVDQuant weights must have the same act_unsigned flag)r   c              3  (   #    U  H  oS L v   M
     g 7fr   r'   )rK   rb   s     r   rM   4svdquant_w4a4_fuse_linear_weights.<locals>.<genexpr>-  s     
+FD4<Fs   Tr~   rB   r.   r   )scale
orig_dtyper   rS   rl   rR   rQ   r7   )r9   r   rO   r   rY   r=   r   r<   rQ   r7   rc   r   re   r   rP   r   rU   zerosrB   r.   r   r   r   Paramsr   sumrS   rR   )rW   r   rp   rq   r   r   first_paramsin_featuresrQ   rn   qdatasscalesproj_upsrL   r2   wscalerj   rk   rl   fused_qdatafused_scalefused_proj_up
fused_bias
bias_partsrb   nrX   s                              r   !svdquant_w4a4_fuse_linear_weightsr      s    &
7|q788~3w<'
6{c'l"4G~]3v;-wOPP(7J PQQAJE==L))!,K112LLFFH6R6d6dek6l3w
G>>$$Q';6[\\++,<^__8VVW\]^ef   'v1=K&v1=K(qAM

+F
+++

6=GD!|{{1M,?,?H\H\]d# > YYuZ0a8CCE
)00**%{3(("00! 1 F 	%CVLl r    c                    [        U5      (       d  [        S5      e[        XU5      n[        [        R
                  " U[        U5      SS95      $ )zGExecute a runtime-fused split projection and return split output views.zCfused_weight must be a TensorCoreSVDQuantW4A4Layout QuantizedTensorrw   r   )r?   	TypeError_w4a4_forwardr   rU   split)r1   fused_weightr   output_featuresrm   s        r   "svdquant_w4a4_fused_grouped_linearr   G  sD     %\22]^^
J
?CS%"8bABBr    c                      \ rS rSrSrSrSr\" SS9 " S S\5      5       r	\
    SS	 j5       r\
SS
 j5       r\
    SS j5       r\
SS j5       r\
SS j5       rSrg)r7   iT  uf  SVDQuant W4A4 weight quantization with low-rank correction.

Note:
    Offline-quantized only — `quantize()` raises NotImplementedError because
    SVDQuant factorization requires calibration (smooth_factor, proj_down,
    proj_up) that must be computed from activation statistics. Use the
    DeepCompressor pipeline to produce the pre-quantized tensors.
)   r   FT)frozenc                  d    \ rS rSr% SrS\S'   S\S'   S\S'   SrS\S	'   SrS\S
'   SS jrS r	Sr
g)#TensorCoreSVDQuantW4A4Layout.Paramsij  a  SVDQuant W4A4 parameters.

Inherits `scale` (= wscales), `orig_dtype`, `orig_shape` from
BaseLayoutParams. Adds the three tensors that parameterize the
low-rank correction and input smoothing, plus a logical-transpose flag
used by the aten.t / aten.mm dispatch path.
torch.TensorrS   rl   rR   Fr<   rQ   r8   c                
    / SQ$ )Nr   rS   rl   rR   r'   selfs    r   _tensor_fields2TensorCoreSVDQuantW4A4Layout.Params._tensor_fieldsy  s    EEr    c                    g r   r'   r   s    r   _validate_tensor_fields;TensorCoreSVDQuantW4A4Layout.Params._validate_tensor_fields|  s     r    r'   N)returnz	list[str])__name__
__module____qualname____firstlineno____doc____annotations__rQ   r8   r   r   __static_attributes__r'   r    r   r   r   j  s:    	  ##"d" 
D 	F	r    r   c                    [        S5      e)NzvSVDQuant W4A4 requires offline calibration (DeepCompressor). Load pre-quantized tensors via `from_state_dict` instead.)NotImplementedError)clsr>   kwargss      r   quantize%TensorCoreSVDQuantW4A4Layout.quantize  s     "H
 	
r    c           
     j   UR                   S   nUR                  nUR                  n[        R                  " X5US9n[
        R                  " XbR                  UR                  S9u  pxn	[
        R                  " XqXR                  XR                  SS9SU n
U
R                  5       R                  5       $ )u%  Reconstruct the effective weight W_eff such that plain ``x @ W_eff.T + bias``
reproduces the SVDQuant kernel output to bf16 precision.

Uses the kitchen kernel itself with an identity input rather than
reimplementing dequant in Python. Kitchen weight layout is natural
row-major packed int4 ``(N, K/2)`` — the kernel reads it directly, so
this path stays bit-exact with the actual compute path regardless of
per-group scaling / LoRA composition details. Tile-packed storage is
handled by the same kernel path and produces the same logical weight.
r   r   )ry   rz   N)r\   r]   r^   r_   r`   ra   rb   )r   r.   r   rU   eyer$   r   rR   rS   rd   r   rl   r   r   )r   r2   rX   r   r.   rB   r   rf   r^   rg   w_effs              r   
dequantize'TensorCoreSVDQuantW4A4Layout.dequantize  s     ''*!!ii@!#!:!:,,8H8H"
h ** ..t
 ; wwy##%%r    c                    UR                   nUR                  UR                  UR                  UR                  UR
                  4$ r   )r=   _qdatar   rR   rS   rl   )r   qtensorps      r   rc   .TensorCoreSVDQuantW4A4Layout.get_plain_tensors  s2     OO~~qwwaiiOOr    c                    UR                  5       S:X  a  [        UR                  S   5      [        -  $ [        UR                  S   5      $ )Nr   r   )r   intrA   _TILE_PACKED_BLOCK_N)r   r2   s     r   re   :TensorCoreSVDQuantW4A4Layout.get_out_features_from_storage  s<    99;!u{{1~&)===5;;q>""r    c                b    UUR                   UR                  UR                  UR                  S.$ )zSerialization mapping.

Suffixes compose onto the owning Parameter's key (typically `*.weight`),
producing for example `transformer_blocks.0.attn.to_q.weight`,
`...weight_scale`, `...weight_proj_down`, etc.
) _scalerk   r   _smooth_factorr   )r   r2   rX   s      r   state_dict_tensors/TensorCoreSVDQuantW4A4Layout.state_dict_tensors  s1     ll **$22
 	
r    r'   N)r>   r   r   ztuple[torch.Tensor, Params])r2   r   rX   r   r   r   )r   r   r   zKtuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])r2   r   r   r   )r2   r   rX   r   r   zdict[str, torch.Tensor])r   r   r   r   r   MIN_SM_VERSIONQUANTIZES_INPUTr   r   r   classmethodr   r   rc   re   r   r   r'   r    r   r7   r7   T  s     N Od!  , 

 
%	
 
 & &2 P%P	TP P # #
 
 
r    r7   c                @   [         R                  U5      u  p4pVn[        [        UR                  SS5      5      nU R
                  n	U R                  SU	S   5      n
U
R
                  S   nU(       a  U
[        -   nU
nOU
nSn[        X5      nUb-  UR                  UUUUUSS9u  nnnUR                  XUUUXrUS9nO5[        R                  " UUUUUS	9u  nnn[        R                  " XUUUXrUS9n[         R                  U5      nUSU R                  " / U	SS QUP76 $ )
u,  Compute y = x @ W^T + bias via the int4 kernel.

For layers flagged ``act_unsigned`` (nunchaku convention: post-GELU fc2),
we apply the +0.171875 shift to the main-path activation here at the layer
so it falls into the unsigned [0, 15] quantization grid. LoRA continues to
use the raw un-shifted activation (SVDQuant invariant: LoRA is the residual
between full-precision W and the int4 approximation, computed on the
pre-quantization activation).

Kernel API stays shift-free — shift is a Qwen/Flux model-topology constant,
not a quantize-op parameter.
rQ   Frw   r   NTrx   r[   r}   )r7   rc   r<   r#   r=   rA   r   r   r5   r   rd   r$   re   )r1   rh   rb   r2   r_   ry   rS   rl   rQ   r   r   ri   r   r{   r4   rf   r^   rg   rm   rn   s                       r   r   r     sd   " 2N1_1_`i1j.EFw	 1 1>5IJL##J


r:b>
2C		!A++'6L!-!D!D%  "E "
Wh 22 '% 3 
 "$!:!:%"
Wh (( '%

 0MMeTLr7??:JsO:\::r    c                `   SSK nSSKJn  US   n[        XT5      (       d4  [        R
                  R                  R                  R                  " U0 UD6$ UR                  nUR                  UUR                  S   UR                  S   4UR                  (       + S9nU" UR                  SU5      $ )u   Zero-copy logical transpose — flip the ``transposed`` flag.

Lets ``F.linear(x, W)`` decompose into ``x @ W.t()`` without reordering any
storage; ``mm`` / ``addmm`` handlers below unwind the flag.
r   Nr   r   )r   r8   r7   )dataclassesr9   r   r:   rU   opsatenr   r   r=   replacer   r8   r   )qtargsr   r   r   r1   old
new_paramss           r   _handle_w4a4_tr     s     %7Ll44yy~~''888


C$$NN1%s~~a'89~~% % J
 <..0NPZ[[r    c                R    U R                   R                  (       d  [        S5      eU $ )zDReturn rhs unchanged if it is logically transposed (represents W^T).z\SVDQuant W4A4 GEMM expects the RHS to be W.T (stored W). Use F.linear(x, W) or mm(x, W.t()).)r=   r8   RuntimeError)rhss    r   _resolve_svdquant_rhsr     s(    ;;!!2
 	
 Jr    c                   SSK Jn  US   US   pT[        U5      S:  a  US   OSn[        XS5      (       d2  [        R
                  R                  R                  " [        XEU45      6 $ [        XC5      (       a  UR                  5       nUR                  R                  (       a8  [        R
                  R                  R                  XER                  5       U5      $ [        XEU5      $ )u3   Direct F.linear(input, W, bias) → kitchen kernel.r   r   r      N)r9   r   rO   r:   rU   nn
functionallinearr
   r   r=   r8   r   )r   r   r   r   r1   rL   rb   s          r   _handle_w4a4_linearr  '  s     &7DG&$i!m47Df..xx""))?LRV;W+XYY,00#..0~~  xx"")),8I8I8KTRRt44r    c                    SSK Jn  US   US   pT[        XS5      (       d  [        R                  " [        XE45      6 $ [        XC5      (       a  UR                  5       n[        U5      n[        XESS9$ )uo   Handle ``mm(x, W.t())`` — the decomposition F.linear takes when the weight
is a non-default tensor subclass.
r   r   r   Nrb   )	r9   r   r:   rU   mmr
   r   r   r   )r   r   r   r   rC   rD   s         r   _handle_w4a4_mmr  8  sd    
 &7DGqa))xx!011!%%LLNa AD))r    c                    SSK Jn  US   US   US   pen[        Xc5      (       d  [        R                  " [        XEU45      6 $ [        XS5      (       a  UR                  5       n[        U5      n[        XVUS9$ )z!Handle ``addmm(bias, x, W.t())``.r   r   r   r   r  )	r9   r   r:   rU   addmmr
   r   r   r   )r   r   r   r   rb   rC   rD   s          r   _handle_w4a4_addmmr  H  sn     &a$q'47QDa)){{OTaL9::!%%LLNa AD))r    )r   strr   r<   r   r<   )r   z
str | None)r1   r   r2   r   )r>   r   r   r<   )rC   r   rD   r   r   r<   )rW   Sequence[torch.Tensor]rG   r<   rH   r<   r   r<   )rf   r   r^   r   rg   r   rh   r   rb   torch.Tensor | Noneri   r   r   ztuple[torch.Tensor, int]r   )r1   r   rW   r  r   $Sequence[torch.Tensor | None] | Nonerp   r<   rq   r<   r   tuple[torch.Tensor, ...])r   r  r   r   r   r   )
rW   r  r   r  rp   r<   rq   r<   r   z<tuple[QuantizedTensor, torch.Tensor | None, tuple[int, ...]])
r1   r   r   r   r   r  r   zSequence[int]r   r  )r1   r   rh   r   rb   r  r   r   )r   r   r   r   )4r   
__future__r   loggingr   collections.abcr   r   r   typingr   rU   comfy_kitchenr$   r9   r   r	   r
   r   r   	getLoggerr   logger_INT4_GROUP_SIZEr   r   r   r   r(   r5   r?   rE   rY   ro   r   r   r   r   r7   r   r   r   r   r   r   r   r  r  r  r  r
  r  r'   r    r   <module>r     s   #  	 $ !     X X%			8	$   / :H<N 	-#- - 	-
 
-`!	!! ! 	!
 ! ! !> 48A
 #( %AA#A 1A
  A A AHC 48H #( %H#H0H  	H
 H BHV
C
C
C $
C #	
C
 
Cm
? m
d<;<;<; <; 	<;~ EIINN$$,,.JK\ L\. EIINN))113OP5 Q5  EIINN%%--/KL* M* EIINN((002NO
* P
*r    