
    3j                       S r SSKJr  SSKrSSKJr  SSKJr  SSKrSSK	r
SSKJrJrJrJr  \(       a  SSKJr  \R"                  " \5      rS	r " S
 S\5      r        SS jr\" \R.                  R0                  R2                  R4                  \5      S 5       rSS jr\" \R.                  R0                  R:                  R4                  \5      S 5       r\" \R.                  R0                  R>                  R4                  \5      S 5       r \" \R.                  R0                  RB                  R4                  \5      S 5       r"g)u/  AWQ W4A16 quantization layout for modulation-style linears.

Each quantized linear stores:
  qweight:        (N, K // 2)  int8       packed uint4 (kitchen-native row-major)
                                          bits 0..3 -> column 2j   (uint4 [0, 15])
                                          bits 4..7 -> column 2j+1 (uint4 [0, 15])
  scale=wscales:  (K // G, N)  bf16/fp16  per-group fp scales
  wzeros:         (K // G, N)  bf16/fp16  per-group fp zero points

Dequantization (per element):
    W[n, k] = (qweight[n, k] - 8) * wscales[k // G, n] + wzeros[k // G, n]

Targets the modulation linears (``img_mod.1`` / ``txt_mod.1``) in
Qwen-Image-Edit and similar topologies — small batch, called once per block,
where W4A16 GEMV stays fp16/bf16-accurate while the int4 weights cut both
checkpoint size and resident VRAM by ~4x vs the bf16-dequantized fallback.

Dispatch goes through ``ck.gemv_awq_w4a16``, which has an eager pure-PyTorch
implementation registered as a ``torch.library`` custom op; a CUDA backend
can be added later without changing this layout.
    )annotationsN)	dataclass)TYPE_CHECKING   )BaseLayoutParamsQuantizedLayoutdequantize_argsregister_layout_opQuantizedTensor@   c                      \ rS rSrSrSrSr\" SS9 " S S\5      5       r	\
SS	 j5       r\
SS
 j5       r\
SS j5       r\
SS j5       rSrg)TensorCoreAWQW4A16Layout/   u#  AWQ W4A16 weight quantization with per-group fp scales + zeros.

Note:
    Offline-quantized only — ``quantize()`` raises NotImplementedError.
    Use the upstream AWQ / DeepCompressor calibration pipeline to produce
    the pre-quantized tensors. ``from_state_dict`` is the loading path.
NFT)frozenc                  P    \ rS rSr% SrS\S'   \rS\S'   SrS\S	'   SS
 jr	S r
Srg)TensorCoreAWQW4A16Layout.Params?   u  AWQ W4A16 parameters.

Inherits ``scale`` (= wscales), ``orig_dtype``, ``orig_shape`` from
BaseLayoutParams. Adds ``zeros`` (per-group fp zero points) and
``group_size`` (the K-dim quantization group size — typically 64,
matching the wscales / wzeros leading shape).
torch.Tensorzerosint
group_sizeFbool
transposedc                
    SS/$ )Nscaler    selfs    X/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/tensor/awq_w4a16.py_tensor_fields.TensorCoreAWQW4A16Layout.Params._tensor_fieldsL   s    W%%    c                    g Nr   r   s    r    _validate_tensor_fields7TensorCoreAWQW4A16Layout.Params._validate_tensor_fieldsO   s    r#   r   N)returnz	list[str])__name__
__module____qualname____firstlineno____doc____annotations___DEFAULT_GROUP_SIZEr   r   r!   r&   __static_attributes__r   r#   r    Paramsr   ?   s-    	 -
C- 
D 	&	r#   r1   c                    [        S5      e)NzaAWQ W4A16 requires offline calibration. Load pre-quantized tensors via `from_state_dict` instead.)NotImplementedError)clstensorkwargss      r    quantize!TensorCoreAWQW4A16Layout.quantizeS   s    !5
 	
r#   c                   UR                   u  p4US-  nUR                  nUR                  [        R                  5      nUS-  R                  [        R                  5      nUS-	  S-  R                  [        R                  5      n	[        R
                  " X5[        R                  UR                  S9n
XSS2SSS24'   XSS2SSS24'   UR                  R                  5       R                  S5      nUR                  R                  5       R                  S5      nU
R                  X5U-  U5      R                  UR                  5      nUS	-
  U-  U-   R                  X55      nU$ )
zReconstruct the bf16/fp16 weight matrix W of shape ``(N, K)``.

Used by the fallback path when the dispatch handler can't run a
quantized matmul (e.g. RHS is not transposed, or operand types don't
match). Stays in fp accumulation on the eager backend.
         )dtypedeviceNr   r   g       @)shaper   totorchint32emptyr>   r   t	unsqueezer   view
orig_dtype)r4   qdataparamsnk_halfkgq_i32lohiw_uintscalesr   groupsws                  r    
dequantize#TensorCoreAWQW4A16Layout.dequantizeZ   s$    KK	QJ%dlu{{+zT!%%ekk2QU\\Jq!$Q$wq!$Q$w!++B/!++B/QQ*--f.?.?@slf$u,2218r#   c                `    UR                   nUR                  UR                  UR                  4$ r%   )_params_qdatar   r   )r4   qtensorps      r    get_plain_tensors*TensorCoreAWQW4A16Layout.get_plain_tensorsr   s$    OO~~qww//r#   c                6    UUR                   UR                  S.$ )N) _scale_zeros)r   r   )r4   rI   rJ   s      r    state_dict_tensors+TensorCoreAWQW4A16Layout.state_dict_tensorsw   s     llll
 	
r#   r   )r5   r   )rI   r   rJ   r1   r(   r   )r[   r   )rI   r   rJ   r1   r(   zdict[str, torch.Tensor])r)   r*   r+   r,   r-   MIN_SM_VERSIONQUANTIZES_INPUTr   r   r1   classmethodr7   rV   r]   rc   r0   r   r#   r    r   r   /   s     N Od!  & 
 
  . 0 0 
 
r#   r   c           	         [         R                  U5      u  p4n[        [        UR                  S[
        5      5      n[        R                  " XXEX&S9$ )z.Compute y = x @ W^T + bias via AWQ W4A16 GEMV.r   )biasr   )r   r]   r   getattrrY   r/   ckgemv_awq_w4a16)input_tensor	weight_qtri   rI   wscaleswzerosr   s          r    _awq_forwardrq      sN     6GG	REFWY..>QRSJW r#   c                `   SSK nSSKJn  US   n[        XT5      (       d4  [        R
                  R                  R                  R                  " U0 UD6$ UR                  nUR                  UUR                  S   UR                  S   4UR                  (       + S9nU" UR                  SU5      $ )u=   Zero-copy logical transpose — flip the ``transposed`` flag.r   Nr   r   )
orig_shaper   r   )dataclassesbaser   
isinstancerB   opsatenrE   defaultrY   replacers   r   rZ   )qtargsr6   rt   r   rm   old
new_paramss           r    _handle_awq_tr      s     %7Ll44yy~~''888


C$$NN1%s~~a'89~~% % J
 <..0JJWWr#   c                R    U R                   R                  (       d  [        S5      eU $ )zDReturn rhs unchanged if it is logically transposed (represents W^T).zXAWQ W4A16 GEMM expects the RHS to be W.T (stored W). Use F.linear(x, W) or mm(x, W.t()).)rY   r   RuntimeError)rhss    r    _resolve_awq_rhsr      s(    ;;!!2
 	
 Jr#   c                   SSK Jn  US   US   pT[        U5      S:  a  US   OSn[        XS5      (       d2  [        R
                  R                  R                  " [        XEU45      6 $ [        XC5      (       a  UR                  5       nUR                  R                  (       a8  [        R
                  R                  R                  XER                  5       U5      $ [        XEU5      $ )u-   Direct F.linear(input, W, bias) → AWQ GEMV.r   r   r   r:   N)ru   r   lenrv   rB   nn
functionallinearr	   rV   rY   r   rq   )r{   r|   r6   r   rm   weightri   s          r    _handle_awq_linearr      s     &7DG&$i!m47Df..xx""))?LRV;W+XYY,00#..0~~  xx"")),8I8I8KTRRd33r#   c                    SSK Jn  US   US   pT[        XS5      (       d  [        R                  " [        XE45      6 $ [        XC5      (       a  UR                  5       n[        U5      n[        XESS9$ )uJ   ``mm(x, W.t())`` — F.linear's decomposition for tensor subclass weights.r   r   r   Nri   )	ru   r   rv   rB   mmr	   rV   r   rq   )r{   r|   r6   r   abs         r    _handle_awq_mmr      sd     &7DGqa))xx!011!%%LLNA4((r#   c                    SSK Jn  US   US   US   pen[        Xc5      (       d  [        R                  " [        XEU45      6 $ [        XS5      (       a  UR                  5       n[        U5      n[        XVUS9$ )z``addmm(bias, x, W.t())``.r   r   r   r:   r   )	ru   r   rv   rB   addmmr	   rV   r   rq   )r{   r|   r6   r   ri   r   r   s          r    _handle_awq_addmmr      sn     &a$q'47QDa)){{OTaL9::!%%LLNA4((r#   )rm   r   rn   r   ri   ztorch.Tensor | Noner(   r   )r   r   r(   r   )#r-   
__future__r   loggingrt   r   typingr   rB   comfy_kitchenrk   ru   r   r   r	   r
   r   	getLoggerr)   loggerr/   r   rq   rw   rx   rE   ry   r   r   r   r   r   r   r   r   r   r#   r    <module>r      sC  * #  !     X X%			8	$ N
 N
f  	 EIINN$$,,.FGX HX& EIINN))113KL4 M4  EIINN%%--/GH
) I
) EIINN((002JK
) L
)r#   