
    3j                    j   S r SSKJr  SSKrSSKJr  SSKJr  SSKrSSK	r
SSKJr  SSKJrJrJrJr  \(       a  SS	KJr  \R&                  " \5      r " S
 S\5      r  S             SS jjrS r\" \R2                  R4                  R6                  R8                  \5      S 5       r\" \R2                  R4                  R<                  R8                  \5      S 5       r\" \R2                  R4                  R@                  R8                  \5      S 5       r!\R2                  R4                  RD                  R8                  \R2                  R4                  RF                  R8                  \R2                  R4                  RH                  R8                  4 H  r%\" \%\5      " \" \%5      5        M     g)z0FP8 (E4M3) quantization layout for tensor cores.    )annotationsN)	dataclass)TYPE_CHECKING)scaled_mm_v2   )BaseLayoutParamsQuantizedLayoutdequantize_argsregister_layout_opQuantizedTensorc                      \ rS rSrSrSr\" SS9 " S S\5      5       r\	S\
R                  4       SS	 jj5       r\	SS
 j5       r\	SS j5       r\	SS j5       rSrg)TensorCoreFP8Layout   a  FP8 E4M3 quantization with per-tensor scaling.

Example:
    >>> x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
    >>> qt = QuantizedTensor.from_float(x, TensorCoreFP8Layout)
    >>> qt.shape
    torch.Size([128, 256])
    >>> dq = qt.dequantize()
    >>> torch.allclose(dq, x, rtol=0.1)
    True

Note:
    Requires SM >= 8.9 (Ada Lovelace) for hardware-accelerated matmul.
)   	   T)frozenc                      \ rS rSrS rSrg)TensorCoreFP8Layout.Params'   c           	         [        U R                  [        R                  5      (       a>  [        R                  U SU R                  R                  [        R                  SS95        g g )NscaleT)dtypenon_blocking)
isinstancer   torchTensorobject__setattr__tofloat32)selfs    R/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/tensor/fp8.py_validate_tensor_fields2TensorCoreFP8Layout.Params._validate_tensor_fields)   sD    $**ell33""4$**--emmbf-2gh 4     N)__name__
__module____qualname____firstlineno__r$   __static_attributes__r'   r&   r#   Paramsr   '   s    	ir&   r-   Nc                   UR                   n[        UR                  5      nUb  US:X  aE  [        R                  " UR                  5       5      [        R                  " U5      R                  -  n[        U[        R                  5      (       d  [        R                  " U5      nUR                  UR                  [        R                  S9n[        R                  " XU5      nXpR!                  X%US94$ )Nrecalculate)devicer   r   
orig_dtype
orig_shape)r   tupleshaper   amaxabsfinfomaxr   r   tensorr    r0   r!   ckquantize_per_tensor_fp8r-   )clsr:   r   r   kwargsr2   r3   qdatas           r#   quantizeTensorCoreFP8Layout.quantize-   s     \\
6<<(
=E]2JJvzz|,u{{5/A/E/EEE%..LL'EU]]C**6%@jjuPZj[[[r&   c                X    [         R                  " XR                  UR                  5      $ N)r;   dequantize_per_tensor_fp8r   r2   r=   r?   paramss      r#   
dequantizeTensorCoreFP8Layout.dequantizeC   s    ++E<<ARARSSr&   c                F    UR                   UR                  R                  4$ rC   )_qdata_paramsr   )r=   qtensors     r#   get_plain_tensors%TensorCoreFP8Layout.get_plain_tensorsG   s    ~~w4444r&   c                     UUR                   S.$ )u7   Return key suffix → tensor mapping for serialization.) _scale)r   rE   s      r#   state_dict_tensors&TensorCoreFP8Layout.state_dict_tensorsK   s     ll
 	
r&   r'   )r:   torch.Tensorr   z!torch.Tensor | float | str | Noner   ztorch.dtypereturnztuple[torch.Tensor, Params])r?   rT   rF   r-   rU   rT   )rL   r   rU   z!tuple[torch.Tensor, torch.Tensor])r?   rT   rF   r-   rU   zdict[str, torch.Tensor])r(   r)   r*   r+   __doc__MIN_SM_VERSIONr   r   r-   classmethodr   float8_e4m3fnr@   rG   rM   rR   r,   r'   r&   r#   r   r      s     Ndi! i i
  48"00	\\ 1\ 	\ 
%\ \* T T 5 5 
 
r&   r   c           	     :    [        U R                  5       UUUUUS9$ )N)scale_ascale_bbias	out_dtype)r   
contiguous)input_qdataweight_qdatar[   r\   r]   r^   s         r#   _fp8_scaled_mmrb   V   s,       r&   c                $   ^ ^ SSK Jm  UU 4S jnU$ )zFactory for shape-changing operations (view, reshape, t, etc.).

These ops work directly on FP8 since it's not packed (1:1 element mapping).
The aten_op is applied to _qdata and the result is wrapped in a new QuantizedTensor.
r   r   c                &  > US   n[        UT5      (       d  T" U0 UD6$ T" UR                  /USS  Q70 UD6n[        R                  UR                  R
                  UR                  R                  [        UR                  5      S9nT" USU5      $ )Nr   r   r1   r   )	r   rJ   r   r-   rK   r   r2   r4   r5   )qtargsr>   input_tensor	new_qdata
new_paramsr   aten_ops         r#   handler(_make_fp8_shape_handler.<locals>.handlerp   s    Aw,88D+F++ L//E$qr(EfE	 )//&&,,#++66Y__- 0 


 y*?LLr&   )baser   )rj   rk   r   s   ` @r#   _make_fp8_shape_handlerrn   h   s     &M  Nr&   c                   SSK Jn  US   US   pT[        U5      S:  a  US   OSn[        XC5      (       a  [        XS5      (       d2  [        R
                  R                  R                  " [        XEU45      6 $ [        R                  U5      u  px[        R                  U5      u  pUR                  SUR                  R                  5      nU	R                  5       n [        X|XXk5      nUR                   [        R"                  [        R$                  4;   aH  [        R'                  X-  UR                  R                  [)        UR*                  5      S9nU" USU5      $ U$ ! [,        [.        4 aU  n[0        R3                  S	U S
35        [        R
                  R                  R                  " [        XEU45      6 s SnA$ SnAff = f)zFP8 linear: output = input @ weight.T + bias

Uses torch._scaled_mm for hardware-accelerated FP8 matmul when both
input and weight are FP8 QuantizedTensors.
r   r   r      Nr^   r1   r   zFP8 _scaled_mm failed: z , falling back to dequantization)rm   r   lenr   r   nn
functionallinearr
   r   rM   getrK   r2   trb   r   rY   float8_e5m2r-   r4   r5   RuntimeError	TypeErrorloggerwarning)re   rf   r>   r   rg   weightr]   r`   r[   ra   r\   r^   weight_toutputoutput_paramses                   r#   _handle_fp8_linearr      s    &7DG&$i!m47D |55*V:]:]xx""))?LRV;W+XYY.@@NK/AA&IL

;(<(<(G(GHI ~~HZwY <<E//1B1BCC/66''//:: . 7 M
 #6+@-PP)$ Z03STUxx""))?LRV;W+XYYZs&   BE"  E" "G2A
G<GGc                   SSK Jn  US   US   pT[        XC5      (       a  [        XS5      (       d  [        R                  " [        U5      6 $ [        R                  U5      u  pg[        R                  U5      u  pUR                  SUR                  R                  5      n
 [        XhXyU
S9$ ! [        [        4 a    [        R                  " [        U5      6 s $ f = f)z)FP8 matrix multiplication: output = a @ br   r   r   r^   )r^   )rm   r   r   r   mmr
   r   rM   ru   rK   r2   rb   rx   ry   )re   rf   r>   r   aba_qdatar[   b_qdatar\   r^   s              r#   _handle_fp8_mmr      s     &7DGqq**z!/M/Mxx.//*<<Q?G*<<Q?G

;		(<(<=I0gIVV)$ 0xx.//0s   !
B, ,,CCc                   SSK Jn  US   US   US   pen[        XS5      (       a  [        Xc5      (       d  [        R                  " [        U5      6 $ [        R                  U5      u  px[        R                  U5      u  pUR                  SUR                  R                  5      n [        XyXXK5      $ ! [        [        4 a    [        R                  " [        U5      6 s $ f = f)z)FP8 addmm: output = bias + input @ weightr   r   r   rp   r^   )rm   r   r   r   addmmr
   r   rM   ru   rK   r2   rb   rx   ry   )re   rf   r>   r   r]   rg   r|   r`   r[   ra   r\   r^   s               r#   _handle_fp8_addmmr      s     &!%a$q'47D|55*V:]:]{{OD122.@@NK/AA&IL

;(<(<(G(GHI3k4[[)$ 3{{OD1223s   &B3 3,C"!C")NN)r`   rT   ra   rT   r[   rT   r\   rT   r]   ztorch.Tensor | Noner^   ztorch.dtype | NonerU   rT   )&rV   
__future__r   loggingdataclassesr   typingr   r   comfy_kitchenr;   comfy_kitchen.scaled_mm_v2r   rm   r   r	   r
   r   r   	getLoggerr(   rz   r   rb   rn   opsatenrt   defaultr   r   r   r   r   viewreshaperv   _aten_opr'   r&   r#   <module>r      s   6 "  !     3 X X%			8	$<
/ <
L !%$(  	
  " $: EIINN))113FG%Z H%ZP EIINN%%--/BC0 D0& EIINN((002EF3 G3. 
IINN	IINN""	IINNH
 x!456Mh6WXr&   