
    3j(                        % S SK r S SKrS SKrS SKrS SKr/ SQrSr  S SKr	\	R                  R                  S    rS r\R                  S:X  a#  \" \S5      r\(       a  \R"                  " \5      rO\" \S5      r\(       a}  S SKr\R&                  " \5       Hc  rS\;   d  M  S\;   d  M  \ R*                  " \5         \R,                  " \R.                  R1                  \\5      \R2                  S	9  SSS5        Me      Sr\R.                  R1                  \R.                  R7                  \5      \R                  S:X  a  S
OS5      r\R.                  R=                  \5      (       d  \R                  S:X  a  SOSr\R.                  R7                  \5      r \R&                  " \ 5       HO  r\RC                  S5      (       d  M  \RE                  \5      (       d  M3  \R.                  R1                  \ \5      rMQ     \R.                  R=                  \5      (       a  \RF                  RI                  S\5      r%\%(       a[  \%RL                  (       aJ  \RF                  RO                  \%5      r\\RP                  S'   \%RL                  RS                  \5        Sr*Sr+OSr*S\ 3r+OSr*S\ 3r+ S SK/J0r0  S SK1J2r2  \*=(       a
    \3" \SS5      r4Sq5\Rl                  S-  \7S'   S\84S jr9S\Rl                  4S jr:S\Rl                  4S jr;\Rx                  4S\Rl                  S\Rl                  S\Rz                  S\Rl                  4S  jjr>\R~                  4S\Rl                  S\Rl                  S\Rz                  S\Rl                  4S! jjr@\Rx                  4S\Rl                  S"\Rl                  S\Rz                  S\Rl                  4S# jjrA   S\S\Rl                  S$\Rl                  S%\BS&\CS'\CS\D\Rl                  \Rl                  4   4S( jjrE\R~                  S4S)\Rl                  S$\Rl                  S*\Rl                  S\Rz                  S'\CS\Rl                  4S+ jjrF S]S\Rl                  S,\CS\D\Rl                  \Rl                  4   4S- jjrG   S^S.\Rl                  S/\Rl                  S0\Rl                  S1\Rl                  S2\Rl                  S3\Rl                  S4\Rl                  S5\Rz                  S6\Rl                  S\Rl                  4S7 jjrHS\Rl                  S8\Rl                  S\Rl                  4S9 jrIS:\Rl                  S;\Rl                  S8\Rl                  S\D\Rl                  \Rl                  4   4S< jrJS\Rl                  S8\Rl                  S\Rl                  4S= jrKS:\Rl                  S;\Rl                  S8\Rl                  S\D\Rl                  \Rl                  4   4S> jrLS?rMS@rN0 rO\P\D\QSA4   \D\Rl                  \Rl                  \Rl                  4   4   \7SB'   SC\Rl                  S\C4SD jrRSC\Rl                  S\84SE jrSSF\Rl                  S\Rl                  4SG jrT    S_S\Rl                  SI\Rl                  SJ\Rl                  SK\8SL\CSM\Rl                  S-  SN\CS\D\Rl                  \Rl                  \Rl                  4   4SO jjrU  S`SP\Rl                  SC\Rl                  SQ\Rl                  SR\Rl                  SS\Rl                  SF\Rl                  S4\Rl                  S-  SL\CS\Rl                  4ST jjrVSHrWS\Rl                  SU\Rl                  SR\Rl                  SV\Rl                  SW\8S\Rl                  4SX jrX  SaS\Rl                  SU\Rl                  SR\Rl                  SV\Rl                  S4\Rl                  S-  SW\8S\Rl                  4SY jjrYS\P4SZ jrZS[ r[\[" 5         g! \ a    \R                  S    r GNf = f! , (       d  f       GM  = f! \ a     GN(f = f! \, a  r-Sr*\." \-5      r+Sr Sr-C-GNSr-C-f\ a  r-Sr*S\- 3r+Sr Sr-C-GNSr-C-ff = f)b    N)
apply_ropeapply_rope1apply_rope_split_halfapply_rope_split_half1dequantize_nvfp4dequantize_per_tensor_fp8gemv_awq_w4a16quantize_mxfp8quantize_nvfp4quantize_per_tensor_fp8quantize_svdquant_w4a4scaled_mm_nvfp4scaled_mm_svdquant_w4a4stochastic_rounding_fp8c                 l    [         R                  " U 5       H  u  p#nU H  nX;   d  M
  Us  s  $    M     g N)oswalk)	start_dirlib_patternroot_dirsfilesfiles         ^/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/backends/cuda/__init__.pyfind_lib_dirr   0   s4    "$'')"4D&K  #5     win32
cublasLt64zlibcublasLt.socublasLtz.so)modez_C.abi3.pydz
_C.abi3.soz.pydz_C.zcomfy_kitchen.backends.cuda._CTFz!Could not create module spec for zExtension file not found: zFailed to load extension: )DTYPE_TO_CODE)roundupHAS_CUBLASLT_cublas_workspacereturnc                      [         R                  R                  [         R                  R                  5       5      R                  S:  a  gg)zAReturn 32 MiB if using hopper, 4 MiB for all other architectures.	   i   i  @ )torchcudaget_device_propertiescurrent_devicemajor r   r   get_cublas_workspace_size_bytesr/   s   s2    zz''

(A(A(CDJJaOr   c                  t    [         c,  [        R                  " [        5       [        R                  SS9q [         $ )zReturns workspace for cublas.r*   dtypedevice)r%   r)   emptyr/   uint8r.   r   r   get_cublas_workspacer6   z   s/      !KK+-U[[
 r   tensorc                 b    U R                   (       a  U R                  5       n U R                  SS9$ )a  Export tensor via DLPack without cross-stream sync.

Works around PyTorch issue where __dlpack__(stream=None) syncs with
the default stream, breaking CUDA graph capture on non-default streams.
See: https://github.com/pytorch/pytorch/pull/163242

Detaches first so nn.Parameter (requires_grad=True) inputs like bias
export without PyTorch's gradient-tracking refusal.

Returns a PyCapsule containing the DLTensor that nanobind can import.
)stream)requires_graddetach
__dlpack__)r7   s    r   _wrap_for_dlpackr>      s,     B''r   xscaleoutput_typec           	         [         U R                     n[         U   nU R                  5       (       d  U R                  5       n [        R
                  " U R                  [        R                  U R                  S9nU R                  5       n[        R                  R                  U R                  5      R                  n[        R                  [        U 5      [        U5      [        U5      UUUU5        UR!                  U5      $ )Nr1   )r"   r2   is_contiguous
contiguousr)   r4   shaper5   r3   numelr*   current_streamcuda_stream_Cr   r>   view)r?   r@   rA   input_dtype_codeoutput_dtype_coderesult_uint8rF   
stream_ptrs           r   r   r      s     %QWW-%k2??LLN;;qwwekk!((KLGGIE**1884@@J& [))r   c           	         UR                  5       S:X  d   S5       e[        U R                     n[        U   n[        R                  " U R
                  X R                  S9nU R                  5       n[        R                  R                  U R                  5      R                  n[        R                  [        U R                  [        R                  5      5      [        U5      [        U5      UUUU5        U$ )N   zScale must be a scalar tensorr1   )rF   r"   r2   r)   r4   rE   r3   r*   rG   rH   rI   r   r>   rJ   r5   )r?   r@   rA   rK   rL   resultrF   rN   s           r   r   r      s     ;;=A>>>$QWW-%k2[[HHEFGGIE**1884@@J  ,-  Mr   rngc                    [         U   nU R                  5       (       d  U R                  5       n UR                  5       (       d  UR                  5       n[        R                  R                  U R                  5      R                  n[        R                  [        U5      [        U 5      UU R                  5       U5        UR                  U5      $ r   )r"   rC   rD   r)   r*   rG   r3   rH   rI   stochastic_round_fp8r>   rF   rJ   )r?   rR   rA   rL   rN   s        r   r   r      s    
 &k2??LLNnn**1884@@J		 88K  r   per_tensor_scaleepsilonpad_16xhi_firstc           
      p   U R                  5       (       d   S5       eU R                  u  pVU(       a  [        US5      n[        US5      nO(XVpUS-  S:X  d
   SU 35       eUS-  S:X  d
   SU 35       e[        R                  " XxS-  4U R
                  [        R                  [        R                  S9n	[        US5      n
[        US-  S	5      n[        R                  " X4U R
                  [        R                  S
9nUR                  5       S:X  a  UR                  S5      n[        R                  R                  U R
                  5      R                  n[        R                  [!        U 5      [!        U5      [!        U	5      [!        U5      UUUU5        UR#                  [        R$                  5      nX4$ )NInput tensor must be contiguous   r   z&num_rows must be divisible by 16, got z&num_cols must be divisible by 16, got    r3   r2   memory_format      r3   r2   rP   )rC   rE   r#   r)   r4   r3   r5   contiguous_formatzerosdimreshaper*   rG   rH   rI   r   r>   rJ   float8_e4m3fn)r?   rU   rV   rW   rX   	orig_rows	orig_colsnum_rowsnum_colsqx
scale_rows
scale_colssx_uint8rN   sxs                  r   r   r      s    ?????77I9b)9b)&("}!V%KH:#VV!"}!V%KH:#VV! 
hA.qxxu{{bgbyby	zB 3'JR+J{{J3AHHEKKXH "+33A6**1884@@J)*"	 
u**	+B6Mr   rk   block_scalesc           	      $   U R                  5       (       d   S5       eU R                  u  pVUS-  n[        R                  " XW4U R                  US9nUR                  5       S:X  a  UR                  S5      nUR                  [        R                  5      n	[        U   n
[        R                  R                  U R                  5      R                  n[        R                  [        U 5      [        U5      [        U	5      [        U5      U
UU5        U$ )NrZ   r\   ra   r   rP   )rC   rE   r)   r4   r3   rd   re   rJ   r5   r"   r*   rG   rH   rI   r   r>   )rk   rU   rp   rA   rX   ri   num_cols_packedrj   outputblock_scales_uint8rL   rN   s               r   r   r     s     @@@ "H"H[[(-bii{SF "+33A6%**5;;7%k2**2995AAJ)*+,  Mr   pad_32xc                    U R                  5       (       d   S5       eU R                  u  p#U(       a  [        US5      n[        US5      nO(X#pTUS-  S:X  d
   SU 35       eUS-  S:X  d
   SU 35       e[        R                  " XE4U R
                  [        R                  [        R                  S9n[        US5      n[        US-  S5      n[        R                  " Xx4U R
                  [        R                  S	9n	[        R                  R                  U R
                  5      R                  n
[        R                  [        U 5      [        U5      [        U	5      UU
5        U	R!                  [        R"                  5      nXk4$ )
NrZ       r   z&num_rows must be divisible by 32, got z&num_cols must be divisible by 32, got r]   r_   r`   ra   )rC   rE   r#   r)   r4   r3   rf   rb   rc   r5   r*   rG   rH   rI   r
   r>   rJ   float8_e8m0fnu)r?   ru   rg   rh   ri   rj   rk   rl   rm   rn   rN   ro   s               r   r
   r
   9  sO    ?????77I9b)9b)&("}!V%KH:#VV!"}!V%KH:#VV!	h)!((%BUBUeje|e|	}B3'JR+J{{J3AHHEKKXH**1884@@J" 
u++	,B6Mr   abtensor_scale_atensor_scale_bblock_scale_ablock_scale_bbias	out_dtypealphac	                 
   Sn	Sn
[        U[        R                  R                  5      (       a  UR                  n[        U[        R                  R                  5      (       a  UR                  nUc  X#-  nO5[        U[        R                  R                  5      (       a  UR                  nUR
                  [        R                  :w  a  UR                  [        R                  5      nUR                  5       S:X  a  UR                  S5      n[        U [        R                  R                  5      (       a  U R                  n [        U[        R                  R                  5      (       a  UR                  n[        U[        R                  R                  5      (       a  UR                  n[        U[        R                  R                  5      (       a  UR                  nU R                  u  pUR                  u  pX:X  d   S5       eSU-  n[        R                  " XXpR                  S9nUS-  S:X  d   S	5       eUR
                  UR
                  :X  d   S
5       eUR
                  [        R                  :X  a  U
S:X  d   S5       e[        S5      eUR
                  [        R                  :X  aX  U
S:X  d   S5       eUc   S5       eUR
                  [        R                  :X  d   S5       eUR!                  5       S:X  d   S5       eO[        SUR
                   35      e[#        US5      n[#        US5      n[#        X-  S5      nUR                  5       S:X  d   S5       eUR%                  5       UU4:X  d   S5       eUR                  5       S:X  d   S5       eUR%                  5       UU4:X  d   S5       eUc  [        R&                  " 5       nO5UR
                  [        R(                  [        R*                  4;   d   S5       eSu  nnUR-                  [        R.                  5      nUR-                  [        R.                  5      n[0        U   n[        R2                  R5                  U R                  5      R6                  nUb  UR!                  5       S:X  a/  [        R                  " SU R                  [        R(                  S9nO5[        U[        R                  R                  5      (       a  UR                  n[8        R;                  [=        U5      [=        U5      [=        U 5      [=        U5      [=        U5      U[=        U5      [=        [?        5       5      U	[=        U5      U5        U$ )NFr[   r   rP   zMatrix dimensions do not matchr\   r1      z-B tensor must have 8 alignment in N dimensionzA and B scale dtype must matchrw   z#MXFP4 only supports block length 32z2MXFP4 is not supported yet for cuBLAS in CUDA 12.9z#NVFP4 only supports block length 16z alpha must be provided for NVFP4zalpha must be float32zalpha must be a scalarzUnsupported scale dtype: r_   r`   zInvalid A scale shapezInvalid B scale shapez*Only fp16 and bfloat16 bias are supported.)TFra   ) 
isinstancer)   nn	Parameterdatar2   float32tord   re   rE   r4   r3   rx   
ValueErrorrf   rF   r#   sizeTensorfloat16bfloat16rJ   r5   r"   r*   rG   rH   rI   cublas_gemm_blockwise_fp4r>   r6   )ry   rz   r{   r|   r}   r~   r   r   r   
accumulateblock_lengthmk_ank_bkout	roundup_m	roundup_n
roundup_sk_transa_transbblock_scale_b_uint8block_scale_a_uint8out_dtype_coderN   s                             r   r   r   ]  s~    JL .%(("4"455',,.%(("4"455',,}/	E588--	.	.

 {{emm#'yy{aa  !UXX''((FF!UXX''((FF-!3!344%**-!3!344%**WWFAWWFA:777: 	
CA
++a)HH
=C q5A:FFF: -"5"55W7WW5e222r!H#HH!MNN			 3 3	3r!H#HH! D"DD {{emm+D-DD+{{}!;#;;!4]5H5H4IJKK3I3I*A.J!#<%<<#Iz#::S<SS:!#<%<<#Iz#::S<SS:|||~zzMMNN
 
 	8 8	8 
 #GW (,,U[[9',,U[[9"9-N**1884@@J |tzz|q({{1QXXU]]C dEHH..//99D  ,-,--/0 Jr   	freqs_cisc           	         U R                  5       (       d  U R                  5       n UR                  5       (       d  UR                  5       n[        R                  " U 5      n[        R                  R                  U R                  5      R                  n[        R                  [        U 5      [        U5      [        U5      S S US5        U$ NFrC   rD   r)   
empty_liker*   rG   r3   rH   rI   r   r>   r?   r   x_outrN   s       r   r   r     s    ??LLN""$$((*	QE**1884@@JMM# Lr   xqxkc           	         U R                   UR                   :w  a  [        X5      [        X5      4$ U R                  5       (       d  U R                  5       n UR                  5       (       d  UR                  5       nUR                  5       (       d  UR                  5       n[        R
                  " U 5      n[        R
                  " U5      n[        R                  R                  U R                  5      R                  n[        R                  [        U 5      [        U5      [        U5      [        U5      [        U5      US5        X44$ r   )rE   r   rC   rD   r)   r   r*   rG   r3   rH   rI   r   r>   r   r   r   xq_outxk_outrN   s         r   r   r     s     
xx2882);r+EEE]]_]]_""$$((*	b!Fb!F**2995AAJMM#   >r   c           	         U R                  5       (       d  U R                  5       n UR                  5       (       d  UR                  5       n[        R                  " U 5      n[        R                  R                  U R                  5      R                  n[        R                  [        U 5      [        U5      [        U5      S S US5        U$ NTr   r   s       r   r   r     s    ??LLN""$$((*	QE**1884@@JMM# Lr   c           	         U R                   UR                   :w  a  [        X5      [        X5      4$ U R                  5       (       d  U R                  5       n UR                  5       (       d  UR                  5       nUR                  5       (       d  UR                  5       n[        R
                  " U 5      n[        R
                  " U5      n[        R                  R                  U R                  5      R                  n[        R                  [        U 5      [        U5      [        U5      [        U5      [        U5      US5        X44$ r   )rE   r   rC   rD   r)   r   r*   rG   r3   rH   rI   r   r>   r   s         r   r   r   &  s    
 
xx288%b46LR6[[[]]_]]_""$$((*	b!Fb!F**2995AAJMM#   >r   @   r_   ._SVDQUANT_WORKSPACE_CACHEwgtc                 (    U R                  5       S:H  $ )Nr`   rd   r   s    r   _is_svdquant_tile_packed_weightr   O  s    779>r   c                     [        U 5      (       a  [        U R                  S   5      [        -  $ [        U R                  S   5      $ )Nr   )r   intrE   _SVDQUANT_W4A4_BLOCK_Nr   s    r   "_svdquant_out_features_from_weightr   S  s9    &s++399Q< #999syy|r   lora_upc                    U R                  5       S:w  a  U $ [        U SS5      nUbi  UR                  U R                  :X  aO  UR                  U R                  :X  a5  UR                  U R                  S   [
        -  U R                  S   4:X  a  U$ U R                  SSS5      R                  U R                  S   [
        -  U R                  S   5      R                  5       nX l	        U$ )a  Return a natural (N, R) view/copy for tile-packed proj_up.

Converter layout is (N/128, R, 128). cuBLAS addmm_ wants a natural
row-major (N, R) tensor. Cache the one-time reorder on the source tensor so
the runtime path keeps cuBLAS speed without paying a per-forward permute.
   _ck_natural_lora_upNr   rP   r\   )
rd   getattrr3   r2   rE   r   permutere   rD   r   )r   cachednaturals      r   !_natural_lora_up_from_tile_packedr   Y  s     {{}W3T:FMMW^^+LLGMM)LLW]]1-0FFVWHXYYooaA&..a117==3Cjl  #*Nr      smooth	lora_downpad_sizeact_unsignedlora_xreuse_workspacec                 R   U R                  5       S:X  a  U R                  5       (       d   S5       eU R                  u  pxUR                  S   n	[        Xs5      n
[        nX-  S:X  d   SU SU 35       e[
        R                  " SS5      R                  5       S	;   a  [        R                  OU R                  n[        R                  R                  U R                  5      R                  nU=(       a(    [
        R                  " S
S5      R                  5       S;  nU(       Ga  U R                  R                  b  U R                  R                  O[        R                  R!                  5       nU[#        U5      U R                  XX4n[$        R'                  U5      nUc  [        R(                  " XS-  [        R*                  U R                  S9[        R(                  " X-  XR                  U R                  S9[        R(                  " XXR                  S94nU[$        U'   Uu  nnnO|[        R(                  " XS-  [        R*                  U R                  S9n[        R(                  " X-  XR                  U R                  S9n[        R(                  " XXR                  S9n[,        R/                  [1        U 5      [1        U5      [1        U5      [1        U5      [1        U5      [1        U5      [3        U5      U5        Ub  UOU nUS:  a^  USU nUR                  UR                  :X  a,  UR                  5       (       a  [        R4                  " UUUS9  OUR7                  UU-  SS9  X:  a  UUS R9                  5         UR;                  [        R<                  5      UU4$ )u#  SVDQuant W4A4 activation quantize + smooth + LoRA-down (CUDA).

Kitchen-native layouts:
  x         (M, K)       bf16/fp16  main-path input (pre-shifted if unsigned)
  smooth    (K,)         bf16/fp16
  lora_down (K, R)       bf16/fp16  natural row-major
  lora_x    (M, K)       bf16/fp16  pre-shift x for LoRA (defaults to x)
  q_x       (M_pad, K/2) int8       two int4 per byte
  ascales   (K/G, M_pad) bf16/fp16  per-row per-group
  lora_act  (M_pad, R)   bf16/fp16 by default (same dtype as x)

act_unsigned=True selects scale=max/15 + clamp [0,15] (u4 bit patterns for
downstream u4.s4 MMA). Caller must ensure x is non-negative — shift is a
model-topology concern kept out of the kernel API. Pass lora_x=raw_x when
x was pre-shifted (LoRA operates on pre-quantization, pre-shift activations).
r\   zx must be 2D contiguousrP   r   K=z! must be divisible by group_size=$COMFY_KITCHEN_SVDQUANT_LORA_ACT_FP32 >   1onyestrue&COMFY_KITCHEN_SVDQUANT_REUSE_WORKSPACEr   >   0noofffalseNr1   )r   T)non_blocking)rd   rC   rE   r#   _SVDQUANT_W4A4_GROUP_SIZEr   getenvlowerr)   r   r2   r*   rG   r3   rH   indexr,   r   r   getr4   r5   rI   svdquant_quantize_w4a4r>   boolmmcopy_zero_rJ   int8)r?   r   r   r   r   r   r   r   r   rm_padglora_act_dtyperN   device_indexkeyr   q_xascaleslora_actlora_srclora_act_rowss                         r   r   r   q  s   2 557a<AOO--H/HH-77DAAA E!A5A:CA3?sCC:&(ii.'eg+',U]]12  **1884@@J% 1"))0#+eg0+1O )*)Cqxx~~IbIbIdS_agg~aS*..s3>E6QXXNAFEJEN88LF
 .4%c*!'Whkk%au{{188L++afe77188L;;u~hhO #!"\	"  +vH1u !(..0]5P5P5R5RHHXym<9 44Hy88EJJ(22r   actr   wscaleslora_act_inc                 "   U R                   S   n[        U5      n	[        R                  " XUR                  U R
                  S9n
[        R                  " SUR                  U R
                  S9n[        R                  " SS5      R                  5       S;   n[        R                  " S5      nUc  [        U5      nOUR                  5       S;   n[        U5      UR                  5       S:H  :H  n[        R                  " S	5      nUR                  U
R                  :H  =(       a'    U=(       a    USL =(       d    UR                  5       S;   n[        R                  R                  U R
                  5      R                  n[        R                  [!        U R#                  [        R$                  5      5      [!        UR#                  [        R$                  5      5      [!        U5      [!        U5      [!        U5      [!        U5      [!        Ub  UOU5      [!        U
5      UUUUU5        U(       db  UR                  U
R                  :X  a  UOUR'                  U
R                  5      n[)        U5      nU
R+                  UUR-                  5       5        U
$ )
a  SVDQuant W4A4 int4 GEMM + LoRA-up + bias (CUDA).

int4 MMA + per-group dequant + bias run in one kernel. By default, when
lora_act_in already has the output dtype and proj_up's layout matches the
weight layout, LoRA-up is fused into the CUDA epilogue with bf16/fp16
tensor-core MMA. Set COMFY_KITCHEN_SVDQUANT_FUSE_LORA_UP=0 to force the
older Python/cuBLAS addmm_ epilogue for comparison.

Kitchen-native layouts:
  act         (M, K/2)   int8       two int4 per byte (signed or unsigned)
  wgt         (N, K/2)   int8       signed int4 weight, natural row-major
    or        (N/128, K/64, 32, 128) int8 kitchen_tile_packed_w4a4
  ascales     (K/G, M)   bf16/fp16  per-row per-group
  wscales     (K/G, N)   bf16/fp16  per-col per-group
    or        (N/128, K/G, 128) bf16/fp16 for tile-packed wgt
  lora_act_in (M, R)     bf16/fp16 or fp32
  lora_up     (N, R) or (N/128, R, 128) bf16/fp16
  bias        (N,) or (N/128, 128) bf16/fp16  (optional)
  out         (M, N)     bf16/fp16  (= lora_up.dtype)

act_unsigned: if True, A fragments go through u4.s4 MMA instead of s4.s4.
Set COMFY_KITCHEN_SVDQUANT_FAST_ACCUM=1 to use the experimental packed
half2/bfloat162 accumulator instead of the default fp32 accumulator.
r   r1   !COMFY_KITCHEN_SVDQUANT_FAST_ACCUMr   >   r   r   r   r   #COMFY_KITCHEN_SVDQUANT_SHARED_SCALENr   #COMFY_KITCHEN_SVDQUANT_FUSE_LORA_UP)rE   r   r)   r4   r2   r3   r   r   r   r   rd   r*   rG   rH   rI   svdquant_scaled_mm_w4a4r>   rJ   r5   r   r   addmm_t)r   r   r   r   r   r   r   r   r   r   r   r4   
fast_accumshared_scale_envshared_scalelora_up_layout_matches_wgtfuse_lora_env	fuse_lorarN   	lora_bf16
lora_up_mms                        r   r   r     s   D 			!A*3/A
++a'--


CCKKszzBE>CIIK P J yy!FG6s;'--/3MM',!1CD  IICDMSYY& 	[&	[d"Ym&9&9&;?Y&Y  **3::6BBJ%++./%++./!!%!!1u=   $/#4#4		#AK{~~VYV_V_G`	6w?


9jlln-Jr   qweightwzeros
group_sizec                    UR                   u  pVUS-  nUnUR                  n	UR                  [        R                  5      n
U
S-  R                  [        R
                  5      nU
S-	  S-  R                  [        R
                  5      n[        R                  " X/SS9R                  XW5      R                  U	5      nUR                  XWU-  U5      S-
  UR                  5       R                  S5      -  UR                  5       R                  S5      -   R                  XW5      nU R                  UR                  5       5      $ )u!  Large-M fallback: dequantize qweight to bf16 then bf16 cuBLAS matmul.

qweight (N, K/2) int8 packed uint4 → W (N, K) bf16 via group dequant, then
`out = x @ W.T`. Same algebra as eager.gemv_awq_w4a16 — used for M values
where the in-kitchen MMA kernel is slower than cuBLAS bf16 GEMM.
r\      r`   r9   r   g       @)rE   r2   r   r)   int32r   stackre   rJ   r   	unsqueezematmul)r?   r  r   r  r  r   k_halfr   r   compute_dtypex32lohinibblesws                  r   _awq_w4a16_dequant_then_matmulr  "  s     IA
AAMMM
**U[[
!C
)

	#B!8s
		uzz	*Bkk2(+33A9<<]KG	aa	#c	)WYY[-B-B2-FF
((*

r
"	#
d1j  88ACCE?r   c           	         U R                   nU R                  SUS   5      nUR                   u  pUR                   S   n
X-  S:w  a  [        SU	 SU 35      eUR                   S   S-  U	:w  a  [        SUR                   S    SU	 35      eU[        :  a5  [	        UR                  5       R                  UR                  5      XX55      nO[        R                  " XUR                  U R                  S	9n[        R                  R                  U R                  5      R                  n[        R                  [!        UR                  5       R                  UR                  5      5      [!        UR#                  [        R$                  5      5      [!        U5      [!        U5      [!        U5      UU5        Ub  UR'                  U5        UR                  " / US
S QU
P76 $ )u  AWQ W4A16 matmul: int4 weight @ fp activation (CUDA, kitchen-native).

Tiered routing:
  M ≤ 8                    naive 1-thread-per-output kernel (GEMV-style)
  8 < M ≤ 512              fused int4 x bf16/fp16 MMA kernel — dequant
                           into shmem, mma.m16n8k16.f32 along K, no
                           intermediate bf16 W workspace
  M > 512                  dequant + cuBLAS bf16 matmul fallback

bias is applied externally (`out.add_`), mirroring
scaled_mm_svdquant_w4a4's epilogue contract.

Layouts (match comfy_kitchen.backends.eager.awq):
  x        (M, K)   bf16/fp16  row-major activation. Leading dims are
                               flattened.
  qweight  (N, K/2) int8       two unsigned int4 per byte
  wscales  (K/G, N) bf16/fp16  per-group, per-output-col scale
  wzeros   (K/G, N) bf16/fp16  per-group, per-output-col zero
  bias     (N,)     bf16/fp16  optional (= wscales.dtype)
  out      (..., N) bf16/fp16  same dtype as wscales
r9   r   r   z not divisible by group_size=rP   r\   zqweight K//2=z inconsistent with x K=r1   N)rE   re   r   _AWQ_W4A16_MMA_M_LIMITr  rD   r   r2   r)   r4   r3   r*   rG   rH   rI   	awq_w4a16r>   rJ   r5   add_)r?   r  r   r  r   r  
orig_shapex2dr   r   r   out2drN   s                r   r	   r	   ?  s   : J
))B
2
'C99DAaA~2aS =j\JKK}}Q!q =q)9(::QRSQTUVV!!.NN.&
 AahhGZZ..qxx8DD

S^^-00?@W\\%++67W%V$U#	
 

4==-*Sb/-1--r   c                     SSK Jn JnJnJn  [        S15      nU" U" [        [        R                  [        R                  [        R                  15      S9U" [        [        R                  15      S9U" [        [        R                  [        R                  15      S9S.US9U" U" [        [        R                  [        R                  15      S9U" [        [        R                  15      S9U" [        [        R                  [        R                  [        R                  15      S9S.US9U" U" [        [        R                  [        R                  [        R                  15      S9U" [        [        R                  15      S9U" [        [        R                  [        R                  15      S9S.US9U" U" [        [        R                  [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  15      S9S
.US9U" SU" [        [        R                  [        R                  [        R                  15      U" S5      4S	90US9U" U" [        [        R                  15      U" S5      U " SSS94S	9U" [        [        R                  15      S9U" [        [        R                  15      S9U" [        [        R                  [        R                  [        R                  15      S9S.US9U" U" [        [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  [        R                  15      U" S5      4S	9S.US9U" U" [        [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  [        R                  15      U" S5      4S	9S.US9U" U" [        [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  [        R                  15      U" S5      4S	9S.US9U" U" [        [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  [        R                  15      U" S5      4S	9S.US9U" U" [        [        R                  [        R                  15      U" S5      U " SSS94S	9U" [        [        R                  [        R                  15      S9U" [        [        R                  [        R                  15      U" S5      4S	9S.USS9U" U" [        [        R                  15      U" S5      4S	9U" [        [        R                  15      S9U" [        [        R                  [        R                  15      S9U" [        [        R                  [        R                  15      S9U" [        [        R                  [        R                  [        R                  15      S9U" [        [        R                  [        R                  15      S9S.USS9U" U" [        [        R                  [        R                  15      S9U" [        [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  15      U" S5      4S	9U" [        [        R                  [        R                  15      U" S5      4S	9S.USS9S.n[        (       Ga  U" U" [        [        R                  15      U" S5      U " SSS94S	9U" [        [        R                  15      U" S5      U " SSS94S	9U" [        [        R                  15      S9U" [        [        R                  15      S9U" [        [        R                  15      S9U" [        [        R                  15      S9U" [        [        R                  [        R                  15      S9S.USS9US'   U$ )Nr   )DivisibleBy	ExactDimsFunctionConstraintsParamConstraintr*   )dtypes)r?   r@   rA   )paramsdefault_devices)r?   rR   rA   r\   )r  shape_rules)r?   rU   r?   r[   )rd   factor)rk   rU   rp   rA   r`      )r?   r   )r   r   r   rP   r   )r?   r   r   )r   r   )r  r   min_compute_capability)r   r   r   r   r   r   )r?   r  r   r  )r   r   r   r   r
   r   r   r   r   r   r   r   r	   )ry   rz   r{   r|   r}   r~   r   )
   r   r   )comfy_kitchen.constraintsr  r  r  r  	frozensetr)   r   r   r   rf   float8_e5m2r5   r   _CUBLASLT_AVAILABLE)r  r  r  r  cuda_devicesconstraintss         r   _build_constraintsr,  |  s     fX&L $7$$emmU]]ENN%ST )$emm_5  /$e&9&95;L;L%MN 
 )$
 &9$$e&9&95;L;L%MN )$emm_5  /$emmU]]ENN%ST 
 )&
 $7$$emmU]]ENN%ST 'i.FG.$e&9&95;L;L%MN  )$
 .$$emmU]]ENN%ST!*1 %4$emm_5% )
 ._$emmU]]ENN%ST!*1 )
 0%$ekk]3!*1{q/L M %4$emm_5% !0$e&9&9%:;!  /$emmU]]ENN%ST  )!
$ +$$emmU^^%DE!*1 -$emmU]]ENN%ST!*1	 )
 *%$emmU^^%DE!*1 &$emmU^^%DE!*1 -$emmU]]ENN%ST!*1 )
" #6$$emmU^^%DE!*1 -$emmU]]ENN%ST!*1	 )#
 "5%$emmU^^%DE!*1 &$emmU^^%DE!*1 -$emmU]]ENN%ST!*1 )"
" #6$$emmU^^%DE!*1{q/L M *$emmU^^%DE -$emmU^^%DE!*1 )#)#
" $7&i.ET]^_T`Sbc&i.EF*)U]]ENN<[2\]*)U]]ENN<[2\].$emmU]]ENN%ST  +)U]]ENN<[2\]	 )#)$
 .$$emmU^^%DE +$ejj\2!*1 +$emmU^^%DE!*1 *$emmU^^%DE!*1" )#)'
W@KD )<$$ekk]3!*1{q/L M %$ekk]3!*1{q/L M #2$emm_5# #2$emm_5# "1$e&9&9%:;" "1$e&9&9%:;" -$emmU^^%DE+2 )#*7*
%&< r   c                     SSK Jn   [        (       d  U R                  S[        5        g[
        R                  R                  5       (       d  U R                  SS5        gU R                  S[        [        [        S9[        5       S9  g)z/Register CUDA backend with the global registry.r   registryr*   Nz!CUDA not available on this system)fromlist)namemodulecapabilities)comfy_kitchen.registryr/  _EXT_AVAILABLEmark_unavailable
_EXT_ERRORr)   r*   is_availableregister
__import____name____all__r,  r.  s    r   	_registerr=  j  sj    />!!&*5::""$$!!&*MN(W5')  r   )g        FT)F)NNN)r   FNFr   )Nr   )\
contextlibimportlib.util	importlibr   sysr)   r<  _dll_handlenvidia.cu13nvidiacu13__path__nvidia_cu13_path	Exceptionr   platformlib_diradd_dll_directoryctypeslistdirfilenamesuppressCDLLpathjoinRTLD_GLOBALrI   dirname__file___module_pathexistsext	directory
startswithendswithutilspec_from_file_location_specloadermodule_from_specmodulesexec_moduler5  r7  ImportErrorestr)comfy_kitchen.backends.eager.quantizationr"   comfy_kitchen.float_utilsr#   r   r)  r%   r   __annotations__r   r/   r6   r>   rf   r2   r   r   r   r   floatr   tupler   r   r
   r   r   r   r   r   r   r   r   dictobjectr   r   r   r   r   r  r  r	   r,  r=  r.   r   r   <module>rm     s     	 
 $ 	-!;;//2 ||w/>..w7K/1ABJJw/)ex.?#,,Y7BGGLL($C&J\J\] 87 0"	B77<< 9CLL\cLc=iuvL77>>,''/fUGGOOH-	

9-H""5))h.?.?.D.D!ww||Ix@ . 
ww~~l##66,l
 U\\007B<>CKK89LL$$R(!NJ"N<\NKJ1,@
 D -$K^U)K )- 5<<$& - ell (U\\ (& FKEXEX*||*!LL*7<{{*
\\*6 FK^^||!LL7<{{
\\8  %22!||!	! ! \\	!6 2||2ll2 2 	2
 2 5<<%&2r  %~~  ll  ,,  	 
   \\ J !||!! 5<<%&!V !||||||| LL| LL	|
 <<| <<| ,,| {{| <<| \\|~5<< ELL U\\ ,,,38<<
5<<%&<ell u||  , || 5<<%&	H   gi 4fck 2E%,,V[VbVb:b4c cd i $ ELL S u||  8 "&!R3||R3LLR3 ||R3 	R3
 R3 LL4R3 R3 5<<u||34R3x !%O	O	O \\O \\	O
 O \\O ,,
O O \\Or  ||\\ \\ LL	
  \\D !%:.||:.\\:. \\:. LL	:.
 ,,
:. :. \\:.zkD k\& 
a"  - >>!,-( 87 		@  NQJ	B N-aS1J	Bs   _. 5` 1-` "` *` <6`
2` ?C`* `* B=`* `* "`* .`` `` 

`		` `'&`'*a0a	a	aa