
    3j                       % S r SSKJr  SSKrSSKJr  SSKJr  SSKrSSK	J
r
  SSKJrJrJrJr  S	S
KJr  S	SKJr  \
R(                  " \5      r\" SS9 " S S5      5       r\R0                  S$S%S jj5       r\R4                  R6                  S$S&S jj5       rS$S%S jjr\R0                  S'S j5       r\" 5       r S\!S'   S(S jr"S)S jr#S*S+S jjr$          S,S jr%          S-S jr&S.S jr'S/S jr(              S0S jr)                    S1S jr*SS\RV                  S4               S2S jjr,          S3S  jr-          S3S! jr.S4S" jr/ S*           S5S# jjr0g)6uG  DeepGEMM integration: fused grouped GEMM kernels from `kernels-community/deep-gemm`.

Provides:
- `deepgemm_bf16_experts_forward`: BF16 M-grouped experts forward.
- `deepgemm_fp8_fp4_linear`: end-to-end FP8/FP4 linear (BF16 in, BF16 out).
- `deepgemm_fp8_fp4_experts_forward`: FP8 (or FP4 on SM100+) M-grouped experts forward.
- `deepgemm_fp8_fp4_megamoe_experts_forward`: FP8×FP4 Mega MoE forward (SM100+).

Requirements: CUDA, Hopper (SM90+), CUDA runtime ≥ 12.3, kernels-community/deep-gemm
≥ 2.5 (Mega MoE symbols required). Mega MoE additionally needs SM100+ at call time.
    )annotationsN)Callable)	dataclass   )logging)get_cuda_runtime_versionis_kernels_availableis_torchdynamo_compilingresolve_internal_import   )lazy_load_kernel)to_localT)frozenc                      \ rS rSr% SrS\S'   S\S'   S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S\S'   S\S'   S\S'   Srg)DeepGEMM3   z>Curated entry points exposed by `kernels-community/deep-gemm`.r   fp8_fp4_matmulgrouped_fp8_fp4_matmul_ntgrouped_fp8_fp4_matmul_nngrouped_bf16_matmul_ntgrouped_bf16_matmul_nnper_token_cast_to_fp8!transform_sf_into_required_layouttransform_weights_for_mega_moeget_symm_buffer_for_mega_moefp8_fp4_mega_moeintm_alignment N)__name__
__module____qualname____firstlineno____doc____annotations____static_attributes__r       \/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/integrations/deepgemm.pyr   r   3   sI    H''''$$$$##'//$,,"** r'   r   c                *   [        5       (       d  [        5       (       d  [        S5      e[        R                  R                  5       (       d  [        S5      e[        R                  R                  5       u  pU (       a  SOSnX;  a  U (       a  SOSn[        SU SU U S	35      e[        5       u  pVUS
:X  a  SOSnXV4U:  a#  [        SU U SUS    S	US    SU S	U S	35      e[        S5      nUc  [        S5      e[        USS5      n	[        USS5      n
[        USS5      n[        USS5      n[        USS5      n[        USS9n[        USS5      n[        USS5      n[        USS5      n[        USS5      n[        US S5      nSU	4SU
4SU4SU4SU4SU4SU4SU4SU4SU4S U44 VVs/ s H  u  nnUb  M  UPM     nnnU(       a  [        S!S"R                  U5       S#35      e[        U	U
UUUUUUUU[        U" 5       5      S$9$ s  snnf )%zLoad DeepGEMM once; raise `ImportError` if env or any required symbol is missing.

`requires_sm100` raises a Blackwell-specific error for callers (FP4 / Mega MoE)
that won't work on Hopper, instead of the generic SM90+ message.
zYDeepGEMM kernel requires the `kernels` package. Install it with `pip install -U kernels`.z9DeepGEMM kernel requires CUDA, but CUDA is not available.)
   )	   r*   zBlackwell (SM100)z"Hopper (SM90) or Blackwell (SM100)zDeepGEMM requires z; current device is SM.r*   )   r+   )r-      zDeepGEMM on SMu    requires CUDA runtime ≥ r   r   z, found z	deep-gemmNuc   Failed to load `kernels-community/deep-gemm` — check that a build matches the current torch/CUDA.fp8_fp4_gemm_nt$m_grouped_fp8_fp4_gemm_nt_contiguous$m_grouped_fp8_fp4_gemm_nn_contiguous!m_grouped_bf16_gemm_nt_contiguous!m_grouped_bf16_gemm_nn_contiguouszutils.per_token_cast_to_fp8)chained_pathr   r   r   &get_mk_alignment_for_contiguous_layoutr   z-DeepGEMM kernel is missing required symbols: z, z'. Update with `pip install -U kernels`.)r   r   r   r   r   r   r   r   r   r   r   )r
   r	   ImportErrortorchcudais_availableget_device_capabilityr   r   getattrr   joinr   r   )requires_sm100majorminorallowedarch
cuda_major
cuda_minormin_cudakernelr   r   r   r   r   r   r   r   r   get_mk_alignmentr   nameattrmissings                          r(   _load_deepgemm_kernelrJ   J   s    $%%#%%k  zz&&((YZZzz779 *%w*8&>bD 24&8NugV[U\\]^__ ":!;
#rk7w#h. w.I(ST+VWX`abXcWd e#Aj\4 
 k*F~q
 	
 V%6=N '0VX\ ] '0VX\ ]$V-PRVW$V-PRVW3FIfg(/8[]a(b%%,V5UW[%\"#*63QSW#X v'OQUVv'94@
 /35NO35NO02HI02HI*,AB02ST-/MN+-IJ57GH!12

JD$  	
  " ;DIIg<N;OOvw
 	
 %";";553*K'E%A)(*+ +s   2
H Hc                    [        U S9ng Nr=   )rJ   )r=   _s     r(   _populate_deepgemm_kernelrO      s    ^<Ar'   c                D    [        5       (       a	  [        U S9  [        U S9$ rL   )r
   rO   rJ   rM   s    r(   load_deepgemm_kernelrQ      s    !!!@ ??r'   c                L    [         R                  R                  U 5      S   S:  $ )z``True`` for Blackwell (SM100+). Cached: device capability is fixed for the
process lifetime and this gets hit on every linear/expert forward.
r   r*   )r7   r8   r:   devices    r(   	_is_sm100rU      s#    
 ::++F3A6"<<r'   zset[int]_DEEPGEMM_VISITED_DEVICESc                
   U R                   b  U R                   O[        R                  R                  5       n[        R                  U5        [        [        5      S::  a  gSnUS:X  a  [        US-   5      e[        US-   5      e)u6  Reject DeepGEMM calls that span multiple CUDA devices in the same process
(e.g. ``device_map="auto"`` across N GPUs). DeepGEMM loads each kernel via
``cuKernelGetFunction``, which binds the resulting ``CUfunction`` handle to
the CUDA context that was current at load time. Driving the same cached
handle from a different device's context launches it against the wrong
module/context and produces garbage. Distributed setups (torchrun + TP/EP)
don't trip this because each process owns exactly one device's context.

The fix is a build-time choice on the DeepGEMM side: compiling with
``DG_JIT_USE_RUNTIME_API=1`` swaps the loader for the runtime API
(context-free ``cudaKernel_t``) and lifts the restriction — but it has to
be baked into the wheel, setting the env var at Python runtime won't change
the loader the cached ``.so`` already uses. Until the kernels-community build
we ship picks that up, we reject single-process multi-device by default.

Raised as :class:`ImportError` from the per-linear path so :func:`fp8_linear`
falls back to Triton (which loads through the runtime API and has no such
binding); raised as :class:`RuntimeError` from the experts path where there's
no fallback — the user explicitly chose ``experts_implementation="deepgemm"``
and must switch to ``"grouped_mm"`` / ``"eager"`` or run distributed.
Nr   zDeepGEMM caches each kernel's `CUfunction` against the CUDA context it was first loaded under, so driving it from a different device in the same process produces garbage. Run distributed (TP/EP) so each process owns one device, linearz:or fall back to the Triton kernel (handled automatically).z.or pick `experts_implementation='grouped_mm'`.)	indexr7   r8   current_devicerV   addlenr6   RuntimeError)rT   contextidxmsgs       r(   _assert_single_devicera      s{    , !,,2&,,

8Q8Q8SC!!#&
$%*	M 
 (# \\]]
sMM
NNr'   c                    U R                  [        R                  5      nUS-   R                  S5      R                  [        R                  5      $ )u  Round each fp32 SF up to the nearest power of 2 (zero mantissa).

Mirrors `deep_gemm.utils.math.ceil_to_ue8m0`. On SM100 the kernel's
`pack_fp32_into_ue8m0` cleanly extracts the biased exponent only when the
mantissa is already zero — its inner shifts (`>> 15`, `>> 7`, `<< 1`)
otherwise leak mantissa bits into adjacent UE8M0 byte slots and silently
corrupt the SF. SM90 consumes raw fp32 SFs without going through this path.
i i  )viewr7   int32bitwise_and_float)sfint_views     r(   _ceil_to_ue8m0ri      s<     wwu{{#H&445EFKKEKKXXr'   c                   [        U R                  5      nU R                  [        R                  :X  a  Ub8  U R                  S5      U:  a#  XR                  S5      -  nU R                  USS9n U(       a.  U R                  5       R                  [        R                  5      n OAU R                  5       n O0U R                  [        R                  :X  a  U(       a  [        U 5      n U R                  5       S;  a  [        SU R                  5        S35      eU R                  S5      nU R                  S5      nSU R                  5       -  nU* U-  * U-  nU R                  5       S:X  a  S	U4OXW-  S	U4n[!        U R#                  5       5      U:X  a  U $ [        R$                  " U R&                  XR                  U R                  S
9n	U	R)                  U 5        U	$ )uT  Lay out `sf` as DeepGEMM's `check_sf_layout` expects: MN-major
(`stride(-2) == 1`) and TMA-aligned (`stride(-1) == align(mn, 16/esize)`).

Inputs come in three flavors:
  - `float8_e8m0fnu` on SM100: raw UE8M0 bytes — pack 4 K-bytes → int32
    (last dim /4) for the kernel's `(INT, 1, gran_k)` path.
  - `float8_e8m0fnu` on SM90: SM90 dispatch only accepts FP32 SFs, so cast
    UE8M0 → FP32 (exact upcast — UE8M0 is the biased-exponent half of a
    pow-of-2 FP32, so `.float()` rebuilds the original FP32 scale exactly).
  - `float32`: per-token / per-block SFs from `per_token_cast_to_fp8` or
    on-disk weights — round to UE8M0 on SM100 (see `_ceil_to_ue8m0`).
  - `int32`: already-packed UE8M0 — pass through.

When `expected_mn` is set and the SF's M-dim is smaller (block-quantized
UE8M0, e.g. DSv4-Flash compressor weights with `(N/128, K/128)` SFs), we
repeat the SF on the M-axis to per-row before packing — the `(INT, 1, gran_k)`
DeepGEMM kernel branch is the only UE8M0 path on SM100; for `gran_mn > 1`
the kernel only handles FP32 SFs and would otherwise reject our INT SF here.
dim)r   r.   z"DeepGEMM SF must be 2D or 3D, got D   r   r   dtyperT   )rU   rT   rr   r7   float8_e8m0fnusizerepeat_interleave
contiguousrc   rd   rf   float32ri   rm   
ValueErrorelement_sizetuplestrideempty_stridedshapecopy_)
rg   expected_mnis_sm100gran_mnmnkfalign_to
aligned_mntarget_stridesouts
             r(   _coerce_sf_for_kernelr      sr   ( #H	xx5'''"rwwr{['@!WWR[0G%%g2%6B%%ekk2BB	U]]	"xB	vvxv=bffhZqIJJ	B	BR__&&H3(?#h.J(*Aa_BOQPZ;[NRYY[^+	


bhhhhryy
YCIIbMJr'   c                   U R                   [        R                  :X  a  SSSS.$ Uc  [        S5      e[	        U5      nUS;  a  [        SU S35      eUR                   [        R
                  :X  a  U(       a  SSSS.$ S	SS
.$ )u#  Pick the `per_token_cast_to_fp8` kwargs from weight dtype + SF dtype + arch.

Cases mirror the kernel's recipes:
  - FP4 weights (`int8`): gran_k=32 packed-UE8M0 SF. SM100+ only.
  - FP8 weights + UE8M0 SF on SM100: gran_k=128 packed-UE8M0 SF (DSv4).
  - FP8 weights + UE8M0 SF on SM90: gran_k=128 FP32 SF — the SM90 dispatch in
    `layout.hpp` only matches FP32 SFs, so we keep act SFs as FP32 (and float
    the weight SF in `_coerce_sf_for_kernel`; UE8M0 → FP32 is an exact upcast).
  - FP8 weights + float SF: gran_k=128 float SF (DSv3).
T    	use_ue8m0gran_kuse_packed_ue8m0z]DeepGEMM requires block-wise quantized FP8 weights, but the experts have no `block_size` set.))   r   )r   r   u?   DeepGEMM requires `block_size` ∈ {(128, 128), (1, 128)}, got r,   r   F)r   r   )rr   r7   int8rx   rz   rs   )weightweight_scale_inv
block_sizer   s       r(   _select_fp8_cast_kwargsr     s     ||uzz!!RTJJk
 	
 z"J//\]g\hhijkk!5!55(!SdKK#..r'   c                   U R                   nU R                  S5      n[        R                  " U R	                  5       USUS-
  S9R                  5       nXb-   S-
  U-  U-  nU[        XQ5      US-
  -  -   nXv-
  n	[        R                  R                  R                  U	R                  S5      S5      n
[        R                  " XTS9X   -   nU(       a   UR                  S5      R	                  5       nOP[        R                  " U4SU[        R                  S9n[        R                  " X:  U R	                  5       S5      X'   XU4$ )av  Build the TMA-aligned grouped layout DeepGEMM expects.

Returns `(sorted_to_padded, grouped_layout, total_padded_rows)`:
  - `grouped_layout` is per-row expert id (Hopper, with `-1` for padding /
    sentinels) or a cumsum of aligned per-expert counts (Blackwell).
  - EP sentinels (values == `num_experts`) are routed past the last expert
    block so DeepGEMM skips them.
r   r   )binsminmax)r   r   rS   ro   rT   rr   )rT   rt   r7   histcr   longr   nn
functionalpadcumsumarangefullrd   where)expert_ids_sortednum_experts	alignmentuse_psum_layoutrT   
num_tokenstokens_per_expertaligned_tokens_per_experttotal_padded_rowspadding_per_expertcumulative_paddingsorted_to_paddedgrouped_layouts                r(   !_build_deepgemm_contiguous_layoutr   :  s9    %%F"''*J$5$9$9$;+STZehiZijooq"3"?!"C	!QU^ ^"S%AYQR]%SS 3F,,001C1J1J11MvV||J>ASAff299!<@@B%6$8"VSXS^S^_+0;;7H7VXiXmXmXoqs+t(->>>r'   c                    [         R                  " U/U R                  SS Q7U R                  U R                  S.6nXU'   U$ )z;Pad a sorted tensor into the TMA-aligned contiguous layout.r   Nr   )r7   emptyr}   rT   rr   )xr   r   paddeds       r(   _pad_for_deepgemmr   \  s@    [[*YQWWQR[YQRQXQXYF Mr'   c                
    X   $ Nr   )x_paddedr   s     r(   &_unpad_from_deepgemm_contiguous_layoutr   c  s    %%r'   c                *   UR                  S5      nUR                  S5      nUR                  S5      n[        R                  " U5      u  pX
U-     nX   n[	        XXE5      u  pnX:  R                  S5      nU	R                  US-
  S9  UUU	UU
UUU4$ )zSort tokens by expert id and build the M-grouped padded layout.

Returns `(sorted_hidden_states_g, sample_weights_g, expert_ids_g,
          sentinel_mask, perm, sorted_to_padded, grouped_layout,
          total_padded_rows)`.
ro   r   )r   )rt   reshaper7   sortr   	unsqueezeclamp_)hidden_statestop_k_indextop_k_weightsr   r   r   	num_top_k
expert_idssample_weightsexpert_ids_gpermsorted_hidden_states_gsample_weights_gr   r   r   sentinel_masks                    r(   _dispatch_routed_inputr   j  s       $I$$R(J"**2.N J/L*9+<=%+
 ;\;;7&7 "0;;B?MK!O,	 	r'   c	                   [        X5      n	XR                  U	R                  5      R                  S5      -  n
U
R	                  US5        [
        R                  " U5      n[
        R                  " UR                  S5      U	R                  S9X'   X   R                  XVU5      R                  SS9R                  U5      $ )uR   Unpad → weighted multiply → mask sentinels → restore order → top-k reduce.ro   g        r   rS   r   rl   )r   torr   r   masked_fill_r7   
empty_liker   rt   rT   rc   sum)
out_paddedsorted_weightsr   r   r   r   r   
hidden_dim	out_dtyper   weightedinv_perms               r(   _combine_routed_outputr     s     1
NC&&syy1;;B??H --%H\\$))A,szzBHN"":*EIIaIPSST]^^r'   c           
        [        U R                  SS9  Ub  [        S5      eU R                  [        R
                  [        R                  4;  a  [        SU R                   35      e[        UR                  [        R                  :H  S9n[        XU[        U R                  5      5      nU R                  SU R                  S   5      n	UR                  " U	40 UD6u  p[        R                  " U
R                  S   UR                  S   U R                  US	9nUR!                  S
5      (       a  SSUS   4OSnUR#                  U
[%        XR'                  S5      S94U[%        X!R'                  S5      S94UUS9  UR                  U R                  SS UR                  S   4-   5      nUb  UR)                  U5        U$ )u   End-to-end DeepGEMM linear: per-token activation quant + FP8/FP4 matmul.

Static (per-tensor) activation quantization is rejected — DeepGEMM needs
per-row SFs. Callers should route static activations through the Triton fallback.
rX   r^   Nz@DeepGEMM linear does not support static activation quantization.z7DeepGEMM linear requires FP16 or BF16 activations, got rM   ro   r   r   r   r   r   r   )recipe)ra   rT   NotImplementedErrorrr   r7   bfloat16float16rx   rQ   r   r   rU   rc   r}   r   r   getr   r   rt   add_)inputr   r   biasr   output_dtypeactivation_scaledeepgemmcast_kwargsinput_2d	qinput_2dscale_2doutput	sf_recipes                 r(   deepgemm_fp8_fp4_linearr     s    %,,9#!"dee{{5>>5==99RSXS^S^R_`aa#6<<5::3MNH)&JPYZ_ZfZfPghKzz"ekk"o.H"88Q[QI[[+V\\!_U\\YefF 2=AS1T1TA{8,-Z^I	)(q@QRS	&'7[[QR^TU	   [[Sb)V\\!_,>>?FDMr'   c                   UR                   [        R                  :w  a  [        SUR                    35      e[	        5       nU R
                  (       a  UR                  OUR                  nUR                  nUR                  S5      nUR                  S5      nUR                  S5      n	[        XX0R                  UR                  [        U5      5      u  n
nnnnnnn[        U R                  (       a  U R                   OU R"                  5      n[        U R$                  5      nU R&                  (       a2  [        U R                  (       a  U R(                  OU R*                  5      OS nU R&                  (       a  [        U R,                  5      OS nU R
                  (       a  UR.                  S   OUR.                  S   n[1        XU5      n[        R2                  " UUXaR                   S9nU" UUUU[        U5      S9  U R&                  (       a  UR5                  SUUU   5        U R                  (       a  U R7                  U5      OU R9                  U5      n[        R2                  " UXUR                   S9nU" UUUU[        U5      S9  U R&                  (       a  UR5                  SUUU   5        [;        UUUUUUUU	UR                   5	      $ )N;DeepGEMM experts path requires bfloat16 hidden states, got ro   r   r   r   )r   )rr   r7   r   rx   rQ   is_transposedr   r   rT   rt   r   r   r   rU   r   has_gategate_up_projup_proj	down_projhas_biasgate_up_proj_biasup_proj_biasdown_proj_biasr}   r   r   
index_add__apply_gateact_fnr   )selfr   r   r   r   grouped_bf16_matmulrT   r   r   r   sorted_hiddenr   r   r   r   r   r   r   	weight_upweight_downup_bias	down_bias
up_out_dimactproj_outr   s                             r(   deepgemm_bf16_experts_forwardr    s_    enn,VWdWjWjVklmm#%H=A=O=O(99U]UtUt!!F  $I##A&J##B'J 	M3C3CXEYEY[dek[l		
 dmm**NI4>>*KZ^ZgZght--DDUDUVmqG15,,-DI )-(:(:$	PQ@RJ
M=N
OC{{,jObObcHY.R[\bRcd}}A/1FG-1]]t)H@UH ++'-J]J]
^C+sNT]^dTef}}q*Il,CD!
 
r'   c                   [        UR                  SS9  U R                  S:X  a  [        S5      eUR                  [
        R                  :w  a  [        SUR                   35      e[        U R                  R                  [
        R                  :H  S9nU R                  (       a  UR                  OUR                  nUR                  nUR                  S5      nUR                  S5      nUR                  S5      n	[        U R                   (       a  U R"                  OU R$                  5      n
[        U R                   (       a  U R&                  OU R(                  5      n[        U R                  5      n[        U R*                  5      n[-        XU R.                  [1        U5      5      n[3        XX0R4                  UR6                  [1        U5      5      u  nnnnnnnnUR9                  S	5      (       a  S
S
US   4OS nUR:                  " U40 UD6u  nn[=        UUU5      n[=        UUU5      n[
        R>                  " UU
R@                  S
   U[
        R                  S9nU" U[C        UUS94U
[C        XR                  S5      S94UUU[1        U5      S9  U R                   (       a  U RE                  U5      OU RG                  U5      nUR:                  " U40 UD6u  nn[
        R>                  " UX[
        R                  S9nU" U[C        UUS94U[C        XR                  S5      S94UUU[1        U5      S9  [I        UUUUUUUU	UR                  5	      $ )Nexpertsr   staticzJDeepGEMM experts dispatch does not support static activation quantization.r   rM   ro   r   r   r   r   r   r   rk   )r   r   )%ra   rT   activation_schemer   rr   r7   r   rx   rQ   r   r   r   r   r   rt   r   r   r   r   gate_up_proj_scale_invup_proj_scale_invdown_proj_scale_invr   r   rU   r   r   r   r   r   r   r   r}   r   r   r   r   )r   r   r   r   r   grouped_fp8_fp4_matmulrT   r   r   r   r   weight_scale_upr   weight_scale_downr   r   r   _expert_ids_gr   r   r   r   r   r   act_fp8
act_scalesr  proj_fp8proj_scalesr   s                                 r(    deepgemm_fp8_fp4_experts_forwardr  "  s    -..	B)!"nooenn,VWdWjWjVklmm#4>>3G3G5::3UVH.2.@.@**hFhFh  !!F  $I##A&J##B'Jdmm**NIdmmt::QUQgQghO4>>*K !9!9:))dooW`agWhiK 	M3C3CXEYEY[dek[l		 2=AS1T1TA{8,-Z^I #88V+VGZ)9;LMG":/?ARSJ{{,iooa.@W\WeWefH	'
@QRS	)/~~VXGYZ[!&) .2]]t)H@UH %::8S{SHk
++'%..
YC	(BSTU	+,=K[K[\^K_`a!&) "
 
r'   c                "   [        SS9n[        U R                  R                  5      n[        U R                  R                  5      n[        U R
                  R                  5      R                  [        R                  5      R                  5       n[        U R                  R                  5      R                  [        R                  5      R                  5       nU R                  nU R                  nU R                  nUS-  S:w  d	  US-  S:w  a  [        SU SU S35      eUR                  UR!                  5       SU-  US	US
9n	UR                  UR!                  5       UUS	US
9n
UR#                  XI4XZ45      u  u  pu  p[        R$                  R'                  USS9U l        [        R$                  R'                  U	SS9U l        [        R$                  R'                  USS9U l
        [        R$                  R'                  U
SS9U l        g)u  One-shot pack + permute of an FP8Experts module's L1/L2 weights into the
Mega MoE UTCCP layout. Called lazily on the first megamoe forward; idempotent
via the caller's ``_megamoe_transformed`` flag.

Steps:
  1. Cast UE8M0 SF → FP32 and call ``transform_sf_into_required_layout`` →
     packed int32 in MN-major TMA-aligned layout.
  2. Run ``transform_weights_for_mega_moe``: interleaves gate/up on L1 and
     transposes both SFs for UTCCP.
  3. Overwrite the loader-side parameters in place; the interleave preserves
     the ``[E_local, 2*I, *]`` leading dims so downstream ``.size(...)`` reads
     stay valid.

Unwraps any ``DTensor`` wrappers FSDP2/EP may have placed around the loader-
side Parameters — the kernel takes raw pointers.
TrM   r   r   zwDeepGEMM Mega MoE requires `hidden_dim` and `intermediate_hidden` divisible by 32 (FP8 SF granularity); got hidden_dim=z, intermediate_hidden=r,   r   )r   r   )r   
num_groupsF)requires_gradN)rQ   r   r  datar
  r   rc   r7   r   rv   r   intermediate_dimr   r   rx   r   rf   r   r   	Parameter)moduler   gate_up_sf_rawdown_sf_raw	gate_up_wdown_wintermediate_hiddennum_local_expertsr   
gate_up_sfdown_sfgate_updowns                r(   setup_megamoe_weightsr%  u  s   " $48Hf;;@@AN655::;K,,11277

CNNPIf&&++,11%**=HHJF 11**""JB!2R71<44><?UViUjjkm
 	

 ;;	$ < J 88$ 9 G .6-T-T		.*W?D  ((,,WE,JF$)HH$6$6zQV$6$WF!xx))$e)DF!&!3!3G5!3!QFr'   c                F   U R                   R                  [        R                  :w  a#  [	        SU R                   R                   S35      eUc  [        S5      e[        SS9n[        U SS5      (       d  [        U 5        SU l	        UR                  S	5      nUR                  S
5      nUR                  S	5      nU R                   R                  S
5      n	U R                   R                  S5      S-  n
XR                  5       -  n[        U SS5      b  U R                  R                  U:  a  UR                  UUUUUU
S9U l        UR                  USSSS9u  pU R                  R                  SU R!                  U5        U R                  R"                  SU R!                  U5        U R                  R$                  SU R!                  U5        U R                  R&                  SU R!                  U5        [        R(                  " Xx4[        R*                  UR,                  S9nUR/                  UU R                   U R0                  4U R2                  U R4                  4U R                  [        [        U SS5      SS5      S9  UR7                  UR                  5      $ )u  FP8 acts × FP4 weights Mega MoE forward (SM100+).

Fuses EP dispatch + L1 + SwiGLU + L2 + EP combine into one kernel,
overlapping NVLink with tensor-core compute. The kernel handles the full
`(num_tokens, hidden) → (num_tokens, hidden)` MoE forward including the
weighted top-k reduction; the caller must NOT all-reduce the output.

`process_group` is supplied automatically by `MoeTensorParalellExperts._prepare_input_fn`
when the module is wrapped for TP — it's required for the symm-buffer rendezvous
on first forward. `top_k_index` is GLOBAL expert ids (`-1` marks skipped slots).

Caller-managed `self` attributes:
  - `gate_up_proj`, `gate_up_proj_scale_inv`: L1 weight + UE8M0 SF.
  - `down_proj`, `down_proj_scale_inv`: L2 weight + UE8M0 SF.
  Both pairs must be transformed together via
  `transform_weights_for_mega_moe((gate_up, gate_up_sf), (down, down_sf))`.
  - `config.swiglu_limit` (optional): SwiGLU clamp; absent → unclamped.
zJDeepGEMM Mega MoE requires FP4-packed expert weights (dtype=`int8`), got `z/`. Use the 'deepgemm' dispatch for FP8 experts.NzDeepGEMM Mega MoE requires a `process_group` for the EP group. The TP wrapping (MoeTensorParalellMegaMoeExperts) supplies it automatically; pass it explicitly otherwise.TrM   _megamoe_transformedFro   r   r   r   symm_buffer)hiddennum_topkr   num_max_tokens_per_rankr  r   r   rq   configswiglu_limit)activation_clamp)r   rr   r7   r   r]   rx   rQ   r;   r%  r'  rt   r(  r+  r   r   r   r~   x_sftopk_idxtopk_weightsr   r   rT   r   r  r   r
  r   )r   r   r   r   process_groupr   r   r   r   r   r  num_global_expertsx_fp8r/  ys                  r(   (deepgemm_fp8_fp4_megamoe_experts_forwardr6    s   2 %**,!!''((WY
 	

 i
 	

 $48H 4/77d#$(!  $I##A&J##B'J))..q1++003q8*-?-?-AA t]D)1T5E5E5]5]`j5j#@@*$. 3 A 
 00$WYlp0qKE{
#))%0+:&,,T2kz*00=!!+:.44]C 	Z,ENN=K_K_`A				D778	112 x!>PTU   44##$$r'   )F)r=   boolreturnr   )r=   r7  r8  None)rT   torch.devicer8  r7  )rT   r:  r^   strr8  r9  )rg   torch.Tensorr8  r<  r   )rg   r<  r   z
int | Noner8  r<  )
r   r<  r   r<  r   ztuple | Noner   r7  r8  dict)
r   r<  r   r   r   r   r   r7  r8  z&tuple[torch.Tensor, torch.Tensor, int])r   r<  r   r<  r   r   r8  r<  )r   r<  r   r<  r8  r<  )r   r<  r   r<  r   r<  r   r   r   r   r   r7  r8  rz   )r   r<  r   r<  r   r<  r   r<  r   r<  r   r   r   r   r   r   r   torch.dtyper8  r<  )r   r<  r   r<  r   r<  r   torch.Tensor | Noner   ztuple[int, int] | Noner   r>  r   r?  r8  r<  )
r   torch.nn.Moduler   r<  r   r<  r   r<  r8  r<  )r  r@  r8  r9  )r   r@  r   r<  r   r<  r   r<  r2  z%torch.distributed.ProcessGroup | Noner8  r<  )1r$   
__future__r   	functoolscollections.abcr   dataclassesr   r7   utilsr   utils.import_utilsr   r	   r
   r   hub_kernelsr   tensor_parallelr   
get_loggerr    loggerr   cacherJ   _dynamoallow_in_graphrO   rQ   rU   setrV   r%   ra   ri   r   r   r   r   r   r   r   r   r   r  r  r%  r6  r   r'   r(   <module>rO     s  
 #  $ !    * % 
		H	%
 $  , R Rj  
@ = = '*e 8 +!OH
Y-`//,8/FR/^b/	/>?#?25?BE?X\?+?D&///  / 	/
 / / /d__ _  _ 	_
 #_ _ _ _ _ _< !%)- %,0((( #( 	(
 '( ( *( (V>
>> >  	>
 >BP
PP P  	P
 Pf7R~ <@Q%
Q%Q% Q%  	Q%
 9Q% Q%r'   