
    3jp&                    (   S r SSKJr  SSKrSSKJr  SSKJr  SSKrSSK	r
SSKJrJrJr  SSKJrJrJrJr  \(       a  SS	KJr  \R*                  " \5      r " S
 S\5      r\" \R2                  R4                  R6                  R8                  \5      S 5       r        SS jr\" \R2                  R4                  R>                  R8                  \5      S 5       r \" \R2                  R4                  RB                  R8                  \5      S 5       r"g)z8NVFP4 (E2M1) block quantization layout for tensor cores.    )annotationsN)	dataclass)TYPE_CHECKING)F4_E2M1_MAXF8_E4M3_MAXroundup   )BaseLayoutParamsQuantizedLayoutdequantize_argsregister_layout_opQuantizedTensorc                      \ rS rSrSrSr\" SS9 " S S\5      5       r\	 S     SS	 jj5       r
\	SS
 j5       r\	    SS j5       r\	SS j5       r\	SS j5       r\	SS j5       r\	SS j5       rSrg)TensorCoreNVFP4Layout   a:  NVFP4 E2M1 block quantization with per-tensor and block scaling.
Auto-pads to 16x16 alignment

Note:
    Requires SM >= 10.0 (Blackwell) for hardware-accelerated matmul.
    Shape operations (view, reshape, transpose) are not supported due to
    packed format and block scales - they fall back to dequantization.
)
   r   T)frozenc                  B    \ rS rSr% SrS\S'   SrS\S'   SS jrS	 rS
r	g)TensorCoreNVFP4Layout.Params!   zNVFP4 layout parameters.

Inherits scale, orig_dtype, orig_shape from BaseLayoutParams.
Adds block_scale for per-block scaling factors.
torch.Tensorblock_scaleFbool
transposedc                
    SS/$ )z5Override to include block_scale in tensor operations.scaler    selfs    T/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/tensor/nvfp4.py_tensor_fields+TensorCoreNVFP4Layout.Params._tensor_fields+   s    ]++    c           	         [        U R                  [        R                  5      (       a>  [        R                  U SU R                  R                  [        R                  SS95        g g )Nr   T)dtypenon_blocking)
isinstancer   torchTensorobject__setattr__tofloat32r   s    r!   _validate_tensor_fields4TensorCoreNVFP4Layout.Params._validate_tensor_fields/   sD    $**ell33""4$**--emmbf-2gh 4r$   r   N)returnz	list[str])
__name__
__module____qualname____firstlineno____doc____annotations__r   r"   r/   __static_attributes__r   r$   r!   Paramsr   !   s$    	
 "! 
D 	,	ir$   r9   Nc                X   UR                  5       S:w  a  [        SUR                  5        S35      eUR                  n[        UR                  5      nUb  US:X  a2  [
        R                  " UR                  5       5      [        [        -  -  n[        U[
        R                  5      (       d  [
        R                  " U5      nUR                  UR                  [
        R                  S9nU R!                  U5      nXe:g  n["        R$                  " XUS9u  pU R'                  UUUU	S9n
X4$ )N   zNVFP4 requires 2D tensor, got Drecalculate)devicer&   )pad_16x)r   
orig_dtype
orig_shaper   )dim
ValueErrorr&   tupleshaper)   amaxabsr   r   r(   r*   tensorr-   r>   r.   get_padded_shapeckquantize_nvfp4r9   )clsrH   r   kwargsr@   rA   padded_shapeneeds_paddingqdatar   paramss              r!   quantizeTensorCoreNVFP4Layout.quantize3   s     ::<1=fjjl^1MNN\\
6<<(
=E]2JJvzz|,k0IJE%..LL'EU]]C++J7$2..vmT!!#	  
 }r$   c                n    [         R                  " XR                  UR                  UR                  5      $ N)rJ   dequantize_nvfp4r   r   r@   rL   rP   rQ   s      r!   
dequantize TensorCoreNVFP4Layout.dequantizeT   s'    ""5,,8J8JFL]L]^^r$   c                p    UR                   UR                  R                  UR                  R                  4$ rU   )_qdata_paramsr   r   )rL   qtensors     r!   get_plain_tensors'TensorCoreNVFP4Layout.get_plain_tensorsX   s)     ~~w44goo6Q6QQQr$   c                6    UUR                   UR                  S.$ )u7   Return key suffix → tensor mapping for serialization.) _scale_scale_2)r   r   rW   s      r!   state_dict_tensors(TensorCoreNVFP4Layout.state_dict_tensors^   s!     ((
 	
r$   c                    [        U5      S:w  a  [        S[        U5       S35      eUu  p#[        US5      [        US5      4$ )Nr;   zNVFP4 requires 2D shape, got r<      )lenrC   r   )rL   rA   rowscolss       r!   rI   &TensorCoreNVFP4Layout.get_padded_shapeg   sH    z?a<S_<MQOPP
b!74#455r$   c                >    U R                  U5      nUS   US   S-  4$ )Nr   r	   r;   )rI   )rL   rA   paddeds      r!   get_storage_shape'TensorCoreNVFP4Layout.get_storage_shapen   s)    %%j1q	6!9>**r$   c                    US   US   S-  4$ )zGCompute logical (padded) shape from storage shape by reversing packing.r   r	   r;   r   )rL   storage_shapes     r!   get_logical_shape_from_storage4TensorCoreNVFP4Layout.get_logical_shape_from_storages   s     a -"2Q"677r$   r   rU   )rH   r   r   z!torch.Tensor | float | str | Noner1   ztuple[torch.Tensor, Params])rP   r   rQ   r9   r1   r   )r]   r   r1   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])rP   r   rQ   r9   r1   zdict[str, torch.Tensor])rA   tuple[int, ...]r1   rt   )rq   rt   r1   rt   )r2   r3   r4   r5   r6   MIN_SM_VERSIONr   r
   r9   classmethodrR   rX   r^   rd   rI   rn   rr   r8   r   r$   r!   r   r      s     Ndi! i i"  48 1
 
% @ _ _ R%R	8R R
 
 
 6 6 + + 8 8r$   r   c                   SSK Jn  US   n[        XC5      (       d4  [        R                  R
                  R                  R                  " U0 UD6$ UR                  R                  nUS   US   4n[        R                  UR                  R                  UR                  R                  UUR                  R                  UR                  R                  (       + S9nU" UR                   SU5      $ )z3Handle transpose as a logical no-op for NVFP4.
    r	   r   r   )r   r@   rA   r   r   r   )baser   r(   r)   opsatentdefaultr\   rA   r   r9   r   r@   r   r   r[   )qtargsrM   r   input_tensor	old_shape	new_shape
new_paramss           r!   _handle_nvfp4_transposer   |   s     &7Ll44yy~~''888$$//I1y|,I&--""((''22 ((44#++666 . J <..0GTTr$   c                h    U R                   S   U:w  d  U R                   S   U:w  a  U SU2SU24   $ U $ )z7Slice padded matmul output back to original dimensions.r   r	   N)rE   )resultorig_morig_ns      r!   _slice_to_original_shaper      s@     ||A& FLLOv$=gvgww&''Mr$   c           
        SSK Jn  US   US   pT[        XC5      (       a  [        XS5      (       d  [        R                  " [        U5      6 $ UR                  R                  5       S:w  a  [        R                  " [        U5      6 $ [        UR                  SS5      n[        UR                  SS5      nU(       d  U(       d1  [        R                  S5        [        R                  " [        U5      6 $ [        R                  U5      u  pn
[        R                  U5      u  pnUR                  SUR                  R                  5      n [         R"                  " UUU	UU
UUS	9nUR                  R$                  S   nUR                  R$                  S   n['        UUU5      $ ! [(        [*        4 a?  n[        R-                  S
U S35        [        R                  " [        U5      6 s SnA$ SnAff = f)uB  NVFP4 matrix multiplication: output = a @ b

When b is logically transposed (from a prior .t() call), this works directly
with scaled_mm_nvfp4 since that kernel computes a @ b_phys.T, which equals
a @ b_logical when b_logical = b_phys.T.

This handles the common torch.compile decomposition: linear(x, w) → mm(x, w.t())
r	   r   r   r;   r   FzINVFP4 mm: unsupported transpose configuration, falling back to dequantize	out_dtype)tensor_scale_atensor_scale_bblock_scale_ablock_scale_br   zNVFP4 mm failed:  , falling back to dequantizationN)rx   r   r(   r)   mmr   r[   rB   getattrr\   loggerdebugr   r^   getr@   rJ   scaled_mm_nvfp4rA   r   RuntimeError	TypeErrorwarning)r}   r~   rM   r   aba_transposedb_transposeda_qdatascale_ar   b_qdatascale_br   r   r   r   r   es                      r!   _handle_nvfp4_mmr      s    &7DGq q**z!/M/Mxx.// 	xx||~xx.//199lE:L199lE:L<`axx.//&;&M&Ma&P#Gm&;&M&Ma&P#Gm

;		(<(<=I0##""''
 %%a(%%a('??)$ 0*1#-MNOxx.//0s   
AF# #G234G-'G2-G2c                   SSK Jn  US   US   pT[        U5      S:  a  US   OSn[        XC5      (       a  [        XS5      (       d2  [        R
                  R                  R                  " [        XEU45      6 $ UR                  R                  5       S:w  a2  [        R
                  R                  R                  " [        XEU45      6 $ [        UR                  SS5      n[        UR                  SS5      nU(       d  U(       aG  [        R                  S5        [        R
                  R                  R                  " [        XEU45      6 $ [        R!                  U5      u  pn[        R!                  U5      u  pnUR#                  S	UR                  R$                  5      n [&        R(                  " U	UU
UUUUUS
9nUR                  R*                  S   nUR                  R*                  S   n[-        UUU5      $ ! [.        [0        4 aU  n[        R3                  SU S35        [        R
                  R                  R                  " [        XEU45      6 s SnA$ SnAff = f)zNVFP4 linear: output = input @ weight.T + bias

Uses ck.scaled_mm_nvfp4 for hardware-accelerated NVFP4 matmul.
Output is sliced to original (non-padded) shape.
r	   r   r   r;   Nr   FzMNVFP4 linear: unsupported transpose configuration, falling back to dequantizer   )r   r   r   r   biasr   zNVFP4 scaled_mm failed: r   )rx   r   rh   r(   r)   nn
functionallinearr   r[   rB   r   r\   r   r   r   r^   r   r@   rJ   r   rA   r   r   r   r   )r}   r~   rM   r   r   weightr   input_transposedweight_transposedinput_qdatar   r   weight_qdatar   r   r   r   r   r   r   s                       r!   _handle_nvfp4_linearr      s    &7DG&$i!m47D |55*V:]:]xx""))?LRV;W+XYY  A%xx""))?LRV;W+XYY|33\5IeD,dexx""))?LRV;W+XYY*?*Q*QR^*_'K-+@+R+RSY+Z(L=

;(<(<(G(GHIZ##""''	
 %%003**1-'??)$ Z1!4TUVxx""))?LRV;W+XYYZs    "AG< <I!A
II!I!)r   r   r   intr   r   r1   r   )#r6   
__future__r   loggingdataclassesr   typingr   r)   comfy_kitchenrJ   comfy_kitchen.float_utilsr   r   r   rx   r
   r   r   r   r   	getLoggerr2   r   r   ry   rz   r{   r|   r   r   r   r   r   r   r   r$   r!   <module>r      s   > "  !     G G X X%			8	$a8O a8N EIINN$$,,.CDU EU0  	 EIINN%%--/DE20 F20j EIINN))113HI1Z J1Zr$   