
    +j2                        d dl Z dedefdZd\  ZZ eedz
            ZdZdZd	Zd
Z	dZ
dZdZdededefdZde j        de j        fdZde j        dedede j        fdZde j        dedede j        fdZd Zd Zd Zd(de j        dede j        fdZd(dede j        fdZde j        de j        fdZd(d ede j        fd!Zd"ed#ede j        fd$Zd(defd%Zde j        de j        fd&Zde j        de j        fd'ZdS ))    Nnreturnc                     d| z  dz
  S N    )r   s    \/home/wildlama/comfy/ComfyUI/.venv/lib/python3.11/site-packages/comfy_kitchen/float_utils.py_n_onesr
      s    Fa<    )      r   g      @g      ?g      |@g      ?g      @g      ?   xmultiplec                     | |z   dz
  |z  |z  S )z#Round up x to the nearest multiple.r   r   )r   r   s     r	   roundupr       s    \A(*h66r   c                 z    |                      t          j                                       t          j                  S )N)totorchfloat8_e4m3fnfloat32)r   s    r	   _float8_roundr   %   s'    44#$$''666r   ebitsmbitsc                     | j         t          j        k    sJ d|z   |z   dk    sJ t          |dz
            }t          ||z             }d||z   z  }t          t          |z
  dz
            }dt          |          |z
  z  t          |dz             d|z  z  z  }dd|z
  z  }t
          |z
  t          |z
  z   dz   }	|	t          z  }
t          j        |
t          j                                      t          j	                  }|                     t          j                  } | dz  }| |z  } |                     t          j                  } | |k    }t          j
        t          j        |          | |k               }t          j        t          j        ||                    }| |z   }|                    t          j                  }||
z  }|                    t          j                  }|                     t          j                  }|t          |z
  z	  dz  }|t
          z
  t          z  |z   }||z  }||z  }|t          |z
  z	  }|                    t          j                  }t          j        | |t          j                  } t          j        |||           } t          j        |||           } |t          t"          z   |z
  |z
  z	  }|                    t          j                  }||z  }| |z  } |                     t          j                  S )aU  Convert FP32 numbers to sub-byte floating point numbers with the given
    number of exponent and mantissa bits.

    Input: torch.Tensor of dtype torch.float
    Output: torch.Tensor of dtype torch.uint8, where the bit encoding is stored
    in the least significant bits. e.g.
      fp4: bits 0-3 empty and bits 4-7 in fp4_e2m1 encoding
      fp6: bits 0-1 empty and bits 2-7 in fp6_e2m3 or fp6_e3m2 encoding

    Note: there are no special values (NaN, inf) support in this code. Values
    outside the representable range of Floatx after rounding are clamped to the
    maximum Floatx magnitude (sign is preserved).

    Background 1: last answer in https://stackoverflow.com/questions/8981913/how-to-perform-round-to-even-with-floating-point-numbers  # noqa: E501
    Background 2: Computer Organization and Design, RISC-V edition, Chapter 3.5
    r   r      )dtypel        )r   r   floatr
   	MBITS_F32F32_EXP_BIAStensorint32viewr   logical_andlogical_not
logical_orr   uint8	full_likewhere	EBITS_F32)r   r   r   exp_biasmax_int	sign_maskmagic_adder
max_normal
min_normal
denorm_expdenorm_mask_intdenorm_mask_floatsignsaturate_maskdenormal_masknormal_mask
denormal_xnormal_xmant_odd
val_to_addsign_lps                        r	   _f32_to_floatx_unpackedr=   )   s   " 7ek!!!!u9u!!!! uqy!!Heem$$Geem$I)e+a/00K wu~~01WUQY5G5G1e85TUJ q8|$J 
	 u	 		  !I-O _EKHHHMMem\\
 	
u{Az>D 	
DA 	
u{A OM%e&7&F&FJWWM#E$4]M$R$RSSK &&J--J/!Ju{++J
 vvek""HY./14Hl*y8KGJ
HHI-.H{{5;''H
 	7%+666AM:q11AK1--A y9,u4u<=Gjj%%G
 	!G	GA44r   c                    | j         t          j        k    sJ d|z   |z   dk    sJ d||z   z  }t          |dz
            }t          |          }| |z  }| |z  }|dk    }t          j        |dk    ||z	  dk              }	||z	  }
|
|z
  t
          z   }|                    t          j                  t          z  }||z                      t          j                  }|t          |z
  z  }||z  }d||<   d|z
  t
          z   }|dk    r||z
  t          z  ||	<   nxt          |          D ]R}t          d|z  d|dz   z            D ]6}||z
  }|d|z  z
  |t          z   |z
  z  }||z
  t          z  }||z   |||k    <   7St          j
        |	||          }|                    t          j                  t          |z
  t          z   |z
  z  }||z  }|                    t          j                  S )a  Convert sub-byte floating point numbers with the given number of exponent
    and mantissa bits to FP32.

    Input: torch.Tensor of dtype uint8, where the bit encoding is stored
    in the least significant bits. e.g.
      fp4: bits 0-3 empty and bits 4-7 in fp4_e2m1 encoding
      fp6: bits 0-1 empty and bits 2-7 in fp6_e2m3 or fp6_e3m2 encoding
    Output: torch.Tensor of dtype fp32 with the dequantized value
    r   r   r   )r   r   r'   r
   r$   r    r   r"   r   ranger)   r*   r#   r   )r   r   r   r-   r+   mantissa_maskr<   x_pos	zero_maskr6   exp_biased_lpexp_biased_f32mantissa_lp_int32mantissa_f32resultdenormal_exp_biasedimantissa_cmp
left_shiftsign_f32s                       r	   _floatx_unpacked_to_f32rM      s?    7ek!!!!u9u!!!!eem$Iuqy!!HENNM )mG KE
 
I
 %uqyUe^4IKKM UNM"X-<N#&&u{33y@N .225;??$U):;Ll*F
 F9h,5 zz!4u!< J} u 	e 	eA %a1faAEl ; ; 
e 
e #QY
 ,Q 7Z)=SV[=[\"5
"By!P
 HVXdGd!"3|"CDD
e ],=vFF zz%+&&9u+<y+H5+PQHhF;;u{###r   c                 j    | d         dz  dk    sJ |  d            g | d d         | d         dz  R S )Nr   r   z last dim not divisible by twor   sizes    r	   	down_sizerR      sO    8a<1EEE&T#2#Y&RA&&&r   c                 2    g | d d         | d         dz  R S )NrO   r   r   rP   s    r	   up_sizerT      s%    %T#2#Y%R1%%%r   c                     | |z   dz
  |z  S r   r   )abs     r	   ceil_divrX      s    EAI!r   T
uint8_datahi_firstc                 l   | j         }|d         dz  dk    sJ |                                                     d          } |r:| d d d         dz  | dd d         z                      t          |                    S | dd d         dz  | d d d         z                      t          |                    S )NrO   r   r      r   )shape
contiguousr#   rR   )rY   rZ   r]   s      r	   
pack_uint4r_      s    E9q=A&&((--b11J P33Q31$z!$Q$'77==i>N>NOOO14a4 A%
33Q37==i>N>NOOOr   c                    |                                  sJ | j        }| dz	                      t          j                  }| dz                      t          j                  }|r9t          j        ||gd                              t          |                    }n8t          j        ||gd                              t          |                    }|S )z?Get the original weight from the normalized float weight formatr\      rO   )dim)is_contiguousr]   r   r   r'   stackr#   rT   )rY   rZ   r]   hilounpackeds         r	   unpack_uint4rh     s    ##%%%%%E
/		ek	*	*B
v
	!	!%+	.	.B F;BxR00055gennEE;BxR00055gennEEOr   packedc                 R    | dz  | dz	  z                       t          j                  S )ab  Exchange the high and low nibbles of each byte in a uint8 tensor.

    This converts between hi_first=True and hi_first=False FP4 packing
    without re-quantizing: ``0xAB`` becomes ``0xBA``.

    Args:
        packed: A uint8 tensor of nibble-packed FP4 values.

    Returns:
        A new uint8 tensor with the nibble order reversed in every byte.
    r\   )r   r   r'   )ri   s    r	   swap_nibblesrk     s'     q[Vq[)--ek:::r   flattenc                    | j         \  }}t          |d          }t          |d          }|dz  }|dz  }| }||f||fk    r.t          j        ||f| j        | j                  }| |d|d|f<   |                    |d|d                              dddd          }	|	                    d	dd
d          	                    dd                              d	d
d          }
|r|

                                S |
                    ||          S )aY  
    Rearrange a large matrix by breaking it into blocks and applying the rearrangement pattern.
    See:
        https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout

    Args:
        input_matrix: Input tensor of shape (H, W)
    Returns:
        Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
       r\   )devicer   Nr   r   r      rO          )r]   rX   r   zerosro   r   r#   permutereshape	transposerl   )input_matrixrl   rowscolsn_row_blocksn_col_blockspadded_rowspadded_colspaddedblocks
rearrangeds              r	   
to_blockedr   $  s5    #JD$D#&&LD!$$L $K"KFd|[111+&&$
 
 

  ,uuete| [[sL!<<DDQ1aPPFAr1--771==EEb"bQQJ $!!###k;777r   num_rowsnum_colsc                    t          |d          }t          |d          }|dz  }|dz  }|                     ddd          }|                    dddd                              dd          }|                    ||ddd          }	|	                    ||dd          }
|
                    dddd	          }|                    ||          }|d
|d
|f         S )aO  
    Reverse the cuBLAS tiled layout back to normal (H, W) layout.

    Args:
        blocked_matrix: Swizzled tensor from cuBLAS layout (padded_rows, padded_cols)
        num_rows: Desired output rows (unpadded)
        num_cols: Desired output cols (unpadded)
    Returns:
        Unswizzled tensor of shape (num_rows, num_cols)
    rn   r\   rO   rq   rr   r   r   r   rp   N)rX   ru   rv   rt   )blocked_matrixr   r   rz   r{   r|   r}   step1step2step3step4step5	unblockeds                r	   from_blockedr   I  s     Hc**LHa((L$K"K""2r2..EMM"b!Q''11!Q77EMM,aQ??EMM,c1==EMM!Q1%%Ek;77IYhY		)**r   c                 J    t          | |          }t          |dd          }|S )N)rZ   r   r   )rh   rM   )rV   rZ   a_u8a_f32s       r	   fp4_x2_to_f32r   c  s*    H---D#D!Q//ELr   c                    | j         t          j        k    s
J d            |                     t          j                  }|t
          z	  dz  }|t          t
                    z  }|dt
          dz
  z  k    }||                    t          j                  z   }t          j        |dd          }|                    t          j	                  S )NzInput must be float32   r   r   )
r   r   r   r#   r"   r   r
   r   clampr'   )r   x_int
biased_expmantissaround_ups        r	   f32_to_e8m0r   h  s    7em###%<###FF5;E9$,J wy)))HA)a-01Hhkk%+666JZC00J==%%%r   c                 (   | j         t          j        k    s
J d            |                     t          j                  }|t
          z  }t          j        |dk    t          j        |          |          }|                    t          j	                  S )NzInput must be uint8r   )
r   r   r'   r   r"   r   r)   
zeros_liker#   r   )r   r   rG   s      r	   e8m0_to_f32r   x  ss    7ek!!!#8!!!ek""J9$F[q%*:6*B*BFKKF;;u}%%%r   )T)r   intr
   r*   r   r    F4_E2M1_MAXF4_E2M1_EPSF8_E4M3_MAXF8_E4M3_EPSF8_E5M2_MAXF8_E5M2_EPS	E8M0_BIASr   Tensorr   r=   rM   rR   rT   rX   boolr_   rh   rk   r   r   r   r   r   r   r   r	   <module>r      s   s s      	9wy1}%% 	7s 7c 7c 7 7 7 7
7U\ 7el 7 7 7 7mu| mC m m m m m m`T$u| T$C T$ T$ T$ T$ T$ T$n' ' '
& & &  P P5< P4 P5< P P P P t u|    ; ;%, ; ; ; ;"8 "8d "8el "8 "8 "8 "8J+3 +# +%, + + + +4 t    
&5< &EL & & & & &5< &EL & & & & & &r   