
    3jG                       S SK Jr  S SKrS SKrS SKJr  S SKJr  S SKrS SK	J
r
  S SK	Jr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJrJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#  \RH                  " \%5      r&\RN                  r(\RR                  " \(5      RT                  r+\RR                  " \(5      RX                  r-\R\                  S/S j5       r/S r0\" SS9 " S S5      5       r1\R\                  S0S j5       r2\Rf                  Rh                  S1S j5       r5S0S jr6S2S jr7    S3                   S4S jjr8    S5               S6S jjr9    S5               S6S jjr: " S S\
Rv                  5      r< " S  S!\<5      r=          S7S" jr>          S7S# jr? " S$ S%\
R                  5      rA " S& S'\ 5      rB\B" 5       rC S8 S9S( jjrD " S) S*\5      rE " S+ S,\5      rF " S- S.\5      rGg):    )annotationsN)Callable)	dataclass)
functional   )ACT2FN)ConversionOps)get_module_from_nameshould_convert_module)logging)is_kernels_availableis_torchdynamo_compiling   ) deepgemm_fp8_fp4_experts_forwarddeepgemm_fp8_fp4_linear(deepgemm_fp8_fp4_megamoe_experts_forward)lazy_load_kernel)ExpertsInterfaceuse_experts_implementation)to_localc                     [        [        S5      (       d  [        S[        R                   S35      e[        R                  $ )zTReturn ``torch.float8_e8m0fnu`` or raise a clear error on torch without FP8 support.float8_e8m0fnuzbscale_fmt='ue8m0' requires torch.float8_e8m0fnu, which is only available in PyTorch >= 2.7 (found z.). Upgrade torch to use UE8M0 FP8 checkpoints.)hasattrtorchRuntimeError__version__r        c/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/integrations/finegrained_fp8.py_get_ue8m0_dtyper    0   sG     5*++%%*%6%6$77eg
 	
 r   c                    U H   n[        X5      (       d  M  [        X5      s  $    [        [        U 5      R                   SU 35      e)Nz has none of: )r   getattrAttributeErrortype__name__)objnamesnames      r   _first_attrr)   ;   sE    33%%  DI../~eWE
FFr   T)frozenc                  8    \ rS rSr% SrS\S'   S\S'   S\S'   Srg)	FineGrainedFP8B   zNEntry points exposed by the `kernels-community/finegrained-fp8` Triton kernel.r   matmulbatched_matmulgrouped_matmulr   N)r%   
__module____qualname____firstlineno____doc____annotations____static_attributes__r   r   r   r,   r,   B   s    Xr   r,   c                    [        5       (       d  [        5       (       d  [        S5      e[        S5      n U c  [        S5      e[	        U SS5      n[	        U SS5      n[	        U SS5      nSU4SU4SU44 VVs/ s H  u  pEUb  M
  UPM     nnnU(       a  [        SS	R                  U5       S
35      e[        UUUS9$ s  snnf )z
Load the finegrained-fp8 Triton kernel once and return its entry points.

Raises `ImportError` if the `kernels` package is missing, or the kernel or required
symbols cannot be found.
z`finegrained-fp8 kernel requires the `kernels` package. Install it with `pip install -U kernels`.zfinegrained-fp8Nu   Failed to load the finegrained-fp8 kernel — check that `kernels-community/finegrained-fp8` has a build matching the current torch/CUDA.r.   matmul_batchedmatmul_groupedz4finegrained-fp8 kernel is missing required symbols: , zA. Please update the `kernels` package (`pip install -U kernels`).)r.   r/   r0   )r   r   ImportErrorr   r"   joinr,   )kernelr.   r/   r0   r(   attrmissings          r   _load_finegrained_fp8_kernelr@   K   s    $%%#%%r  /0F~;
 	

 VXt,FV%5t<NV%5t<N
 v~.~.

JD
  	
   B499WCUBV WN N
 	

 %% s   8	B>B>c                     [        5       n g N)r@   )_s    r    _populate_finegrained_fp8_kernelrD   z   s    $&Ar   c                 H    [        5       (       a
  [        5         [        5       $ rB   )r   rD   r@   r   r   r   load_finegrained_fp8_kernelrF      s    !!(*'))r   c                    X-   S-
  U-  $ )zCeiling division.r   r   )abs     r   _cdivrJ      s    EAI!r   c	                >   [         R                  " XX%-  US9n	[        R                  " XR	                  5       S9n
[        Ub  [        X5      OSU5      nUb  [        X'5      OSn[         R                  " XXS9n[        R                  " XR	                  5       S9nX4$ )u  Allocate `(weight, weight_scale_inv)` parameters for one expert projection.

`weight_k_div` halves the K dim for FP4-packed storage (2 e2m1 values per byte).
`sf_gran_n` / `sf_gran_k` set per-block (None → per-row/per-tensor) SF granularity.
`min_sf_out` floors the SF tensor's output dim — used by the fused gate_up
projection to keep room for both halves (pass `2`) even when `proj_out < sf_gran_n`
would otherwise collapse the SF dim to 1.
dtype)requires_gradr   )r   emptynn	Parameteris_floating_pointmaxrJ   )num_expertsproj_outproj_inweight_dtypesf_dtypeweight_k_div	sf_gran_n	sf_gran_k
min_sf_outweight_tweightsf_outsf_insf_tsfs                  r   _alloc_expert_projrc      s    & {{;'2IQ]^H\\(2L2L2NOFy/Dx+!ZXF)2)>E'%AE;;{EBD	d*@*@*B	CB:r   c           	     j    [        5       nUR                  U UUUUUS9nUb  UR                  U5        U$ )u   Triton FP8/FP4 linear: fused act-quant + matmul, then optional bias add.

``activation_scale=None`` → dynamic per-K-block scales (inline); set it for
static per-tensor quant. ``weight_scale_inv`` accepts fp32 or UE8M0; the
dispatcher routes FP4 (``int8``-packed) weights automatically.
activation_scale)rF   r.   add_)	inputr^   weight_scale_inv
block_sizebiasrf   output_dtypefinegrained_fp8outputs	            r   finegrained_fp8_linearro      sL     23O##) $ F DMr   c           
     x   USL =(       a    UR                   R                  S:H  =(       a    [        R                  R	                  5       R
                  S:  =(       a    UR                  [        R                  :H  =(       d&    USL=(       a    US   US   s=:H  =(       a    S:H  Os  =(       a9    [        R                  R                  SS5      S	:g  =(       a    [        5       (       + nU(       a   [        U UUUUUUS
9$ [!        XX#XEU5      $ ! [         a"  n[        R                  SU 35         SnAN5SnAff = f)u  End-to-end FP8/FP4 linear used by `FP8Linear` and the eager `FP8Experts` loop.

Dispatch order — both backends handle FP8 and FP4 weights with fp32 or UE8M0 scales:
  1. DeepGEMM (`deepgemm_fp8_fp4_linear`) — 3-6× faster on the shapes it supports.
     Preferred for FP4, UE8M0 SFs, and 128×128 block FP8.
  2. Triton finegrained-fp8 fallback — used when DeepGEMM is unavailable, when the
     caller passes ``activation_scale`` (DeepGEMM is dynamic-only), or for any
     shape DeepGEMM declined.

Args:
    input: (..., K) bf16/fp16 activations.
    weight: (N, K) `float8_e4m3fn` or (N, K // 2) `int8` (FP4-packed).
    weight_scale_inv: per-block weight scales — `float32` (V3-style) or `float8_e8m0fnu`
        (V4-style; reinterpreted as int32 at the DeepGEMM kernel boundary).
    block_size: [block_n, block_k] for FP8 block-wise quant, or None/[N, K] for per-tensor.
        Ignored for FP4 weights (the kernel infers SF granularity from the dtype).
    bias: optional bias added to the matmul output.
    activation_scale: pass a per-tensor scalar to use static activation quant; leave `None`
        for dynamic (per-token) quant.
    output_dtype: desired output dtype.
Ncuda	   r   r      $TRANSFORMERS_DISABLE_DEEPGEMM_LINEAR01)rj   rl   rf   rk   zDDeepGEMM unavailable for this call, falling back to Triton. Reason: )devicer$   r   rq   get_device_propertiesmajorrM   int8osenvirongetr   r   r;   loggerwarning_oncero   )	rh   r^   ri   rj   rk   rf   rl   deepgemm_preferredes	            r   
fp8_linearr      s+   T 	D  	+MM&(	+JJ,,.449	+ \\UZZ'mJd,B,lzRS}XbcdXeGlGlilGl	+ JJNNA3G3N		+
 )**  	l* %)!1  "%1Atgstt  	l "fghfi jkk	ls   0D 
D9D44D9c                  X   ^  \ rS rSr    S           SU 4S jjjrSS jrSrU =r$ )	FP8Lineari	  c                  > [         T
U ]  X5        X`l        X0l        X@l        [
        R                  R                  [
        R                  " X![        S95      U l
        U R                  c=  [        R                  " [
        R                  " S[
        R                  S95      U l        OUS:X  a
  [        5       O[
        R                  nX R                  S   -   S-
  U R                  S   -  nXR                  S   -   S-
  U R                  S   -  n	[        R                  " [
        R                  " XUS95      U l        U R                  S:X  a=  [        R                  " [
        R                  " S[
        R                  S95      U l        OU R!                  SS 5        U R                  (       a:  [        R                  " [
        R                  " U R"                  5      5      U l        g U R!                  SS 5        g )	NrL         ?ue8m0r   r   staticrf   rk   )super__init__has_biasrj   activation_schemer   rP   rQ   rO   
_FP8_DTYPEr^   tensorfloat32ri   r    rf   register_parameterout_featuresrk   )selfin_featuresr   rj   r   	scale_fmtr   rX   scale_out_featuresscale_in_features	__class__s             r   r   FP8Linear.__init__
  sc    	3 $!2hh((\V`)ab??"$&LLc1W$XD!-6'-A')u}}H".1C"Ca"GDOO\]L^!^!,q/A!AA!E$//Z[J\ \$&LL=Ojr1s$tD!!!X-$&LLc1W$XD!##$6===U[[1B1B%CDDI##FD1r   c           
     V   U R                   R                  5       S:  a+  [        R                  " XR                   U R                  5      $ [        U R                   5      n[        U R                  5      n[        UUUU R                  U R                  UR                  U R                  S9$ )Nr   )rj   rf   rl   rk   )r^   element_sizeFlinearrk   r   ri   r   rj   rf   rM   )r   rh   r^   	scale_invs       r   forwardFP8Linear.forward-  s    ;;##%)88E;;		::$++&T223	!22
 	
r   )rf   r   rk   rj   r   r^   ri   NdynamicfloatF)r   intr   r   rj   tuple[int, int] | Noner   strr   r   r   bool)rh   torch.Tensorreturnr   )r%   r1   r2   r3   r   r   r6   __classcell__r   s   @r   r   r   	  s`    
 .2!* !2!2 !2 +	!2
 !2 !2 !2 !2F
 
r   r   c                  `   ^  \ rS rSrSr    S             SU 4S jjjrSS jrSrU =r$ )	FP8GroupedLineari?  u  FP8 drop-in for block-diagonal grouped linears.

The underlying nn.Linear stores a single `(n_groups * out_per_group, in_per_group)`
weight; logically that's `n_groups` independent `(out_per_group, in_per_group)`
sub-matrices, each consuming a disjoint slice of the input's last-but-one dim.
Forward expects input of shape `(..., n_groups, in_per_group)` and returns
`(..., n_groups, out_per_group)` — same contract as the vanilla bf16 grouped
linear it replaces.

c           	     6   > [         TU ]  UUUUUUS9  X0l        g )Nr   r   rj   r   r   r   )r   r   n_groups)	r   in_features_per_groupr   r   rj   r   r   r   r   s	           r   r   FP8GroupedLinear.__init__K  s0     	-%!/ 	 	
 !r   c           	     *   UR                   S S nUR                   S   nU R                  R                  5       S:  a  U R                  R                  U R                  SU5      R                  SS5      nUR                  SU R                  U5      R                  SS5      n[        R                  " X5      R                  SS5      nUR                  " / UQU R                  PSP76 nU R                  (       a5  UR                  U R                  R                  U R                  S5      5        U$ [        U R                  5      n[        U R                  5      nUR                  U R                  SU5      nUR                  SS5      R                  SU5      nUR                  U R                  UR                  S5      U R                  -  UR                  S5      5      nUR                  S5      U R                  -  n[        R                   " U R                  4XqR"                  [        R$                  S9n[        R&                  " SU R                  S-   UR"                  [        R$                  S9U-  n	[)        5       n
U
R+                  UUUU	UU R,                  S9nUR                  " U R                  /UQSP76 R                  SS5      nU R                  (       a5  UR                  U R                  R                  U R                  S5      5        U$ )Nr   r   r   )rw   rM   offsetstokens_per_expertrj   )shaper^   r   viewr   	transposereshaper   bmmr   rg   rk   r   ri   movedimsizefullrw   int32arangerF   r0   rj   )r   xinput_shape
hidden_dimwyr   tokens_per_groupr   r   rm   s              r   r   FP8GroupedLinear.forward_  s\   ggcrlWWR[
;;##%)  J?II!QOA		"dmmZ8BB1aHA		!))!Q/A		:;::r:A}}tyy~~dmmR89HT[[!T223	FF4=="j1IIb!$$R4NN4==)..2Ct}}2TV_VdVdefVgh	66!95!JJ'79IRZRZbgbmbmn,,q$--!"3AHHEKKX[kk57**/ + 
 IIdmm6k626>>q"E==FF499>>$--45r   )r   r   )r   r   r   r   r   r   rj   r   r   r   r   r   r   r   )r   r   r   r   )	r%   r1   r2   r3   r4   r   r   r6   r   r   s   @r   r   r   ?  sn    	  .2!* !"! ! 	!
 +! ! ! ! !($ $r   r   c                :   U R                   S:X  a  [        S5      e[        5       nUR                  S5      nUR                  S5      nUR                  S5      nUR	                  USS9nUR                  S5      n	UR                  S5      n
XR                  :  R                  S5      n[        U R                  (       a  U R                  OU R                  5      n[        U R                  (       a  U R                  OU R                  5      n[        U R                  5      n[        U R                  5      nUR!                  UUUU R"                  U
S9nU R                  (       a  U R%                  U5      nOU R'                  U5      nUR!                  UUUU R"                  U
S9nUU	R)                  UR*                  5      R                  S5      -  nUR-                  US5        UR/                  XeU5      R1                  SS9nUR)                  UR*                  5      $ )	Nr   zbatched_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.r   r   dim)rj   
expert_ids        r   )r   NotImplementedErrorrF   r   repeat_interleaver   rT   	unsqueezer   has_gategate_up_projup_projgate_up_proj_scale_invup_proj_scale_inv	down_projdown_proj_scale_invr/   rj   _apply_gateact_fntorM   masked_fill_r   sum)r   hidden_statestop_k_indextop_k_weightsrm   	num_top_k
num_tokensr   selected_hidden_statessample_weightsr   sentinel_mask	weight_upweight_scale_upweight_downweight_scale_downrU   weighted_outfinal_hidden_statess                      r   fp8_batched_mm_experts_forwardr     s    )!W
 	

 23O  $I##A&J##B'J +<<YA<N"**2.N$$R(J
  #3#33>>rBMdmm**NIdmmt::QUQgQghO4>>*K !9!9: --?? . H }}##H- ;;x( --?? . H n//?II"MML mS1 '++J:NRRWXRY!!-"5"566r   c           	        U R                   S:X  a  [        S5      e[        5       nUR                  nUR	                  S5      nUR	                  S5      nUR	                  S5      nUR                  S5      n	UR                  S5      n
[        R                  " U
5      u  pXU-     nX   nUR                  S:X  a  UR                  5       OUR                  5       n[        R                  " XR                  SU R                  S-
  S9n[        R                  " US[        R                  S9nXR                  :  R                  S5      n[!        U R"                  (       a  U R$                  OU R&                  5      n[!        U R"                  (       a  U R(                  OU R*                  5      n[!        U R,                  5      n[!        U R.                  5      nUR1                  UUUUUU R2                  S	9nU R"                  (       a  U R5                  U5      nOU R7                  U5      nUR1                  UUUUUU R2                  S	9nUUR9                  UR:                  5      R                  S5      -  nUR=                  US
5        [        R>                  " U5      n[        R@                  " UR	                  S5      US9UU'   UU   nURC                  XvU5      RE                  SS9nUR9                  UR:                  5      $ )Nr   zgrouped_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.r   r   cpur   )binsminrS   )r   rM   r   r   )rw   r   )#r   r   rF   rw   r   r   r   sortr$   r   r   histcrT   cumsumr   r   r   r   r   r   r   r   r   r   r0   rj   r   r   r   rM   r   
empty_liker   r   r   )r   r   r   r   rm   rw   r   r   r   r   r   expert_ids_gpermselected_hidden_states_gsample_weights_ghistc_inputr   r   r   r   r   r   r   rU   r   inv_permr   s                              r   fp8_grouped_mm_experts_forwardr     s    )!W
 	

 23O!!F  $I##A&J##B'J #**2.N$$R(J J/L,Y->?%+
 +1++*>,$$&LDTDTDVKK6F6FASWScScfgSghll,!5;;GG "%5%55@@DMdmm**NIdmmt::QUQgQghO4>>*K !9!9: -- +?? . H }}##H- ;;x( --+?? . H .11(..AKKBOOL mS1 %H\\$))A,v>HTN)L '++J:NRRWXRY!!-"5"566r   c                     ^  \ rS rSr% SSSS.0rS\S'        S         SU 4S jjjrSS	 jr        SS
 jr S         SS jjr	Sr
U =r$ )
FP8Expertsi.  deepgemm_megamoemegamoe_expertsmegamoe_router)moe_tp_experts	ep_routerzdict[str, dict[str, str]]_impl_tp_layer_overridesc                z  > [         T
U ]  5         USL d   S5       eXl        XPl        X`l        X l        UR                  U l        X0l        [        USS5      U l
        [        USS5      U l        [        USS 5      U l        [        USS 5      U l        [        [        US	S
5         U l        [        USS 5      U l        [        USS5      S:H  nUS:X  a
  [%        5       O[&        R(                  nU(       a  [&        R*                  USSSS.n	O[,        UUb  US   OS Ub  US   OS S.n	U R                  (       aQ  [/        U R                  SU R                  -  U R                  4SS0U	D6u  U l        U l        U R5                  SS 5        OK[/        U R                  U R                  U R                  40 U	D6u  U l        U l        U R5                  SS 5        [/        U R                  U R                  U R                  40 U	D6u  U l        U l        U R5                  SS 5        U R                  S:X  a  [>        R@                  " [&        RB                  " U R                  [&        R(                  S95      U l"        [>        R@                  " [&        RB                  " U R                  [&        R(                  S95      U l#        g g )NFzWFP8Experts does not support bias for now, please open an issue if you want this featurenum_local_expertsrT   moe_intermediate_sizeintermediate_sizeswiglu_alphaswiglu_limithidden_activation
hidden_actexpert_dtypefp8fp4r   r   r       )rW   rX   rY   rZ   r[   r   )rW   rX   rZ   r[   r\   gate_up_proj_biasup_proj_biasdown_proj_biasr   rL   )$r   r   configr   r   rj   hidden_sizer   r   r)   rT   intermediate_dimr"   r  r  r   r   limitr    r   r   rz   r   rc   r   r   r   r   r   r   r   rP   rQ   onesgate_up_proj_activation_scaledown_proj_activation_scale)r   r  rj   r   r   r   r   is_fp4rX   alloc_kwargsr   s             r   r   FP8Experts.__init__=  s~    	5  	
e	
    $ ,,!2&v/BMR +F4KM` a#FNDA#FNDA[1DlSTV^T:
 75@)2g)=#%5== %

$ !L !+$.8.DZ]$.8.DZ]$	L ===O  !d&;&;";T__>YZ>^j>:Dt: ##$7>3E  $"7"74LX40DL$0 ##ND93Edoot/D/D4
HT4
00 	 0$7!!X-13ejjIYIYafanan>o1pD..0ll5::dFVFV^c^k^k;l.mD+ .r   c                   UR                  SSS9u  p#U R                  bk  UR                  U R                  S9nUR                  U R                  * U R                  S9nU[        R
                  " X R                  -  5      -  nUS-   U-  $ U R                  b>  UR                  U R                  S9nUR                  U R                  * U R                  S9nU R                  U5      U-  $ )Nr   r   r   )rS   r   rS   r   )chunkr  clampr  r   sigmoidr  r   )r   gate_upgateupglus        r   r   FP8Experts._apply_gate  s    ===+(::$"3"3:4Dt000d6G6GHBt.?.?'?@@CH##ZZ#::$**:-Dtzzktzz:B{{4 2%%r   c                   [         R                  " U[         R                  S9n[         R                  " 5          [         R                  R
                  R                  X R                  S-   S9nUR                  SSS5      n[         R                  " UR                  SS9S5      R                  SS	9R                  S
5      nS S S 5        W GH  nXpR                  :X  a  M  [         R                  " WU   5      u  pX   n
U R                  S:X  a  U R                  U   OS nU R!                  U
U R"                  (       a  U R$                  U   OU R&                  U   U R"                  (       a  U R(                  U   OU R*                  U   US9nU R"                  (       a  U R-                  U5      OU R/                  U5      nU R                  S:X  a  U R0                  U   OS nU R!                  UU R2                  U   U R4                  U   US9nX9US 4   nXR7                  UR8                  5      -  nUR;                  SXR7                  UR8                  5      5        GM     UR7                  UR8                  5      $ ! , (       d  f       GN= f)NrL   r   )num_classesr   r   )r   r   r   F)as_tupler   r   re   )r   
zeros_liker   no_gradrP   r   one_hotrT   permutegreaterr   nonzeror   wherer   r  r   r   r   r   r   r   r   r   r  r   r   r   rM   
index_add_)r   r   r   r   r   expert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategate_up_act_scalerU   down_act_scalerouting_weightsr   s                   r   r   FP8Experts.forward  s=   
 $..}EMMR]]_((--55kO_O_bcOc5dK%--aA6K{8'DaHPPZ_P`eefhiJ 
 %J---#(;;{:/F#G I)4MBFBXBX\dBd22:>jn  {{15!!*-DLLQ[D\;?==++J7dNdNdeoNp!2	 # H 6:]]t''1T\H]H?C?U?UYa?a//
;gk  {{z*((4!/	 # H ,y$,FGO#&8&8&HHL**1iI\IbIb9cd7 %8 #%%m&9&9::C _s   BI;;
J
c           	         UR                  5       S:  a  [        R                  " XS 5      $ [        UUUU R                  UUR
                  S9$ )Nr   )rf   rl   )r   r   r   r   rj   rM   )r   rh   r^   ri   rf   s        r   r   FP8Experts.linear  sM      1$88E400OO-
 	
r   )r   r   rj   r  r   r  r   r   r  r   r   r   r   r  r  rT   r  r  r   r   )Nr   r   FT)
rj   r   r   r   r   r   r   r   r   r   )r!  r   r   r   )r   r   r   r   r   r   r   r   rB   )
rh   r   r^   r   ri   r   rf   torch.Tensor | Noner   r   )r%   r1   r2   r3   r  r5   r   r   r   r   r6   r   r   s   @r   r   r   .  s     	/)
;7  .2!* Dn +Dn 	Dn
 Dn Dn Dn DnL&(;)(;8D(;Ua(;	(;^ 15

 
 '	

 .
 

 
r   r   c                  &    \ rS rSrSr\\\\S.r	Sr
g)FP8ExpertsInterfacei  z?Interface for registering custom FP8 experts forward functions.)
batched_mm
grouped_mmdeepgemmr   r   N)r%   r1   r2   r3   r4   r   r   r   r   _global_mappingr6   r   r   r   r?  r?    s    I 544D	Or   r?  c                l   UR                   (       a  U $ SnU R                  5        GH  u  pV[        XQ5      (       d  M  Sn[        R                  " S5         UR                  S5      (       a}  [        USS5      n[        USS5      n	[        USU R                  R                  5       5      n
[        [        [        U	US	9nU" U
UR                  UR                  UR                  U	US
9nO[        U5      [         R"                  L aM  [%        UR&                  UR(                  UR                  UR                  UR                  UR*                  SLS9nO[-        U[         R"                  5      (       ap  S[        U5      R.                  ;   aW  [1        UR&                  UR(                  UR2                  UR                  UR                  UR                  UR*                  SLS9nUb  U R5                  XW5        SnSSS5        GM     U(       d  [6        R9                  S5        U $ ! , (       d  f       GM  = f)a  
A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules.

Parameters:
    model (`torch.nn.Module`):
        Input model or `torch.nn.Module` as the function is run recursively.
    modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
        Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
    quantization_config (`FineGrainedFP8Config`):
        The quantization config object that contains the quantization parameters.
    pre_quantized (`book`, defaults to `False`):
        Whether the model is pre-quantized or not
FNmetaz.expertsr   Tr   r  )experts_classexperts_interfacer   r   )r  rj   r   r   r   r   r   GroupedLinear)r   r   r   rj   r   r   r   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.)
dequantizenamed_modulesr   r   rw   endswithr"   r  get_text_configr   r   ALL_FP8_EXPERTS_FUNCTIONSweight_block_sizer   r   r$   rP   Linearr   r   r   rk   
isinstancer%   r   r   set_submoduler~   warning)modelmodules_to_not_convertquantization_configpre_quantizedhas_been_replacedmodule_namemodule
new_moduler   r   r  	new_classs               r   replace_with_fp8_linearr\    s   " %%$224$[II
\\&!##J//"6:t<"6:u= 5<<3O3O3QR6",&?%%		 '!2DD&9&K&K1;;%%
 f*& & 2 2!'!4!42DD&9&K&K1;;#[[4
 FBII..?d6lF[F[3[ .*0*<*<!'!4!4#__2DD&9&K&K1;;#[[4
 %##K<$(!_ "!  5l <	
 Lm "!s   F"H##
H3	c                  P    \ rS rSrSrS rS
S jrSS jrSS jr\	SS j5       r
Srg	)Fp8Quantizei/  zV
A quantization operation that creates two tensors, weight and scale out of a weight.
c                    Xl         g rB   hf_quantizerr   ra  s     r   r   Fp8Quantize.__init__4      (r   c                l   S nU R                   R                  bp  [        U R                   R                  [        5      (       a&  U R                   R                  R	                  S5      nO![        U R                   R                  SS 5      nUc  UR                  S   UR                  S   4n[        U5      $ )NrN  r   r   )ra  rU  rP  dictr}   r"   r   tuple)r   valuerj   s      r   _resolve_block_sizeFp8Quantize._resolve_block_size7  s    
00<$++??FF!..BBFFGZ[
$T%6%6%J%JL_aef
++b/5;;r?;JZ  r   c                \   UR                   S:  a  X0$ U R                  U5      u  p4UR                  S   UR                  S   peXS-  S:w  d  Xd-  S:w  a  X0$ UR                  S S nXS-  nXd-  n	UR                  n
UR                  [        R
                  5      nUR                  " / UQUPUPU	PUP76 nUR                  5       R                  SS9n[        R                  " US:  U[        R                  " U5      5      n[        U-  n[        R                  " US:  U[        R                  " U5      5      nUR                  S5      R                  S5      nUU-  n[        R                  " U[        [        S9R                  [        5      nUR                  U
5      nS	U-  R                  [        R
                  5      nU R                   R"                  R$                  S
:X  a  [        R&                  " S[        R(                  " [        R*                  " UR                  [        R,                  " [        R
                  5      R.                  S95      5      5      nUR                  [1        5       5      nUR3                  S5      (       a  UR5                  SS5      S   S-   OUS-   nUUUU0$ )Nr   r   r   r   )r   r   rl  r  r   r          @)r   .weight.r   .weight_scale_inv
_scale_inv)ndimri  r   r   r   r   r   absamaxr/  	ones_like_FP8_MAXr   r  _FP8_MINr   ra  rU  r   powceillog2finfotinyr    rK  rsplit)r   keyrh  block_mblock_nrowscolsleading_shape
rows_tiles
cols_tilesoriginal_shape
value_fp32reshapedmax_abssafe_max_absscalesscales_broadcastscaled	quantized
inv_scales	scale_keys                        r   _quantize_oneFp8Quantize._quantize_oneB  sE    ::><33E:[[_ekk"od>Q$.A"5< CR(_
_
XXemm,
%%_}_j_'_:_W^_,,.%%(%3{{7Q;9QRL(Wq[&%//&2IJ!++B/99"=,,KKH(CFFzR	%%n5	Fl&&u}}5
 00::gE3

5::j>N>NSXS^S^_d_l_lSmSrSr>N>s3t(uvJ#'7'9:JCF<<PYCZCZCJJsA&q),??`cfr`r	Y	:66r   c                    0 nUR                  5        HA  u  pE[        U[        5      (       a  US   OUnUR                  U R	                  XF5      5        MC     U$ )Nr   )itemsrP  listupdater  )r   
input_dictkwargsresultr~  rh  r   s          r   convertFp8Quantize.convertj  sS     +-$**,JC!+E4!8!8U1XeFMM$,,S9: - r   c                ,    [        U R                  5      $ rB   )Fp8Dequantizera  r   s    r   
reverse_opFp8Quantize.reverse_opt  s    T..//r   r`  N)rh  r   r   ztuple[int, int])r~  r   rh  r   r   dict[str, torch.Tensor])r  r   r   r  r   r	   )r%   r1   r2   r3   r4   r   ri  r  r  propertyr  r6   r   r   r   r^  r^  /  s0    )	!&7P 0 0r   r^  c                      \ rS rSrSrS rSS jrSrSS jr S       SS jjr	SS	 jr
  S       SS
 jjr\SS j5       rSrg)r  iy  ux  Dequantize FP8 weights using their per-block ``weight_scale_inv``.

Designed to run as the *first* op in any :class:`WeightConverter` chain when
loading with ``dequantize=True`` — :meth:`update_weight_conversions` on the
FP8 quantizer attaches it to each existing model-specific converter so that
per-expert (weight, scale) pairs are folded into full-precision tensors before
the chain's merge / concat ops collapse the per-expert structure.

Pattern semantics
    Input ``input_dict`` carries one entry per source pattern; each value is a
    list of tensors (one per ``*`` match). For every weight pattern that has a
    sibling ``*.weight_scale_inv`` pattern in the dict, this op pairs them up by
    index, dequantizes per-pair, and emits the dequantized list under the
    original *weight* key. Scale entries are dropped from the output so the
    remaining ops only see weights.
c                    Xl         g rB   r`  rb  s     r   r   Fp8Dequantize.__init__  rd  r   c                    UR                  S5      nU(       a  US S OUnUR                  S5      (       a  US [        S5      *  S-   nOUS:X  a  SnOUS-   nU(       a  US-   $ U$ )N$r   rn  rp  r^   ri   rq  )rK  len)r   weight_patternanchoredbasescales        r   _scale_pattern_for Fp8Dequantize._scale_pattern_for  sr    !**3/&.~cr"N==##*C	N?+.AAEX&E<'E&us{1E1r   )r   g      ?r   g      ?rm  g      @g      @g      @g       g      g      g      g       g      g      g      c                   [         R                  " U R                  [         R                  UR                  S9nUR                  5       R                  [         R                  5      nUS-  R                  5       nUS-	  S-  R                  5       n[         R                  " X$   X%   /SS9nUR                  " / UR                  SS QSUR                  S   -  P76 $ )uR   Two ``e2m1`` FP4 values per byte → float32 tensor twice as wide on the last dim.)rM   rw         r   r   Nr   )r   r   _FP4_E2M1_LUTr   rw   
contiguousr   uint8longstackr   r   )r   packedlutu8lowhighunpackeds          r   _unpack_fp4Fp8Dequantize._unpack_fp4  s    ll4--U]]6==Y %%ekk2CxooqC%%';;#)4"=Icr!2IAR8H4HIIr   Nc                   [        [        SS 5      nUR                  [        R                  :X  d  Ub"  UR                  U:X  a  U R	                  U5      nOUR                  [        R                  5      nUR                  SS  u  pg UR                  SS  u  pXh-  (       d	  Xy-  (       a  [        SU SU SU SU	 S3	5      eXh-  n
Xy-  nUcK  UR                  R                  (       a   UR                  5       S:  a  UR                  O[        R                  nUR                  [        R                  :X  a1  UR                  [        R                  5      S	-
  R                  5       nOUR                  [        R                  5      nUR                  nUR                  S
XX5      nUR                  S
X5      R!                  S
5      R!                  S5      nX-  R                  U5      R                  U5      $ ! [         a    Su  p GNpf = f)Nfloat4_e2m1fn_x2r   )r   r   zWeight shape (r:   z) not divisible by scale grid (z).r        _@r   )r"   r   rM   rz   r  r   r   r   	Exception
ValueErrorrR   r   bfloat16r  exp2r   r   )r   r  r  rl   	fp4_dtypequantized_fp32r  r  
scale_rows
scale_colsr  r  s_fp32r  qss                   r   _dequantize_oneFp8Dequantize._dequantize_one  s   
 E#5t<	??ejj(Y-ByZcGc!--i8N&\\%--8N#))"#.
	*%+\\"#%6"J  1 b.Mj\Y[\f[ggij  $$
  & > >6CVCVCX\]C]chcqcq  <<5;;&ii.6<<>FYYu}}-F'--""2zJPNN2z6@@DNNqQzz,'//??7  	*%)"J
	*s   G6 6HHc                \    Ub  Uc  g [        X5      u  p4[        X4S 5      n[        USS 5      $ )NrM   )r
   r"   )r   rS  full_layer_namerY  tensor_nameparams         r   _get_target_dtypeFp8Dequantize._get_target_dtype  s7    =O325JT2ugt,,r   c                   U R                  X25      nSU;   ac  Ub  UOSnUS   n[        U[        5      (       a  US   OUnSU;   a2  US   n[        U[        5      (       a  US   OUnX`R                  XxUS90$ Xg0$ 0 n	UR	                  5        H  u  pSU
;   d  SU
;   a  M  U R                  U
5      nX;  a  XU
'   M/  [        U[        5      (       a  UOU/nX   n[        U[        5      (       a  UOU/n[        U5      [        U5      :w  a'  [        SU
 S[        U5       S	[        U5       S
35      e[        X5       VVs/ s H  u  pU R                  XUS9PM     snnX'   M     U	$ s  snnf )Nzweight$r^   r   ri   )rl   rf   z/Fp8Dequantize: weight/scale count mismatch for z (z weights vs z	 scales).)	r  rP  r  r  r  r  r  r  zip)r   r  r  rS  r  rl   
target_keyr  r  r  r~  rh  r  weightsr   r  s                   r   r  Fp8Dequantize.convert  s    --eE
 
" -<,GXJ"9-I(29d(C(C	!I!Z/#$67&0&>&>F"$8$8Ye$8$fgg** @B$**,JC!S(,>#,E//4I*#s)%66eUGG*F)&$77VfXF7|s6{* EcU KG~\#f+iI  ^aah]qr]qUYUV4//</P]qrFK! -"  ss   E)c                ,    [        U R                  5      $ rB   )r^  ra  r  s    r   r  Fp8Dequantize.reverse_op
  s    
 4,,--r   r`  )r  r   r   r   )r  r   r   r   rB   )r  r   r  r   rl   torch.dtype | Noner   r   )rS  torch.nn.Module | Noner  
str | Noner   r  )NN)r  ,dict[str, list[torch.Tensor] | torch.Tensor]r  r  rS  r  r   r  r  )r%   r1   r2   r3   r4   r   r  r  r  r  r  r  r  r  r6   r   r   r   r  r  y  s    ")
2 mMJ ae+@%+@/;+@K]+@	+@Z- '+(,	,@, $, &	, 
6,\ . .r   r  c                  <    \ rS rSrSrS r\SS j5       rS	S jrSr	g)
Fp8DecodeScalei  a7  Decode MXFP8 ``ue8m0`` per-block scales (stored as ``uint8`` exponents) into the
float32 multiplicative scales the FP8 compute path expects.

Native MXFP8 loading (``dequantize=False``) keeps weights in ``float8_e4m3fn`` and only
needs the sibling ``*.weight_scale_inv`` tensors turned from raw E8M0 bytes into real
scales (``2 ** (byte - 127)``). Prepended to each weight converter, this op runs before
any merge/concat collapses the per-expert structure: it rewrites only the ``uint8`` scale
entries and passes weights (and already-float scales) through untouched.
c                    Xl         g rB   r`  rb  s     r   r   Fp8DecodeScale.__init__  rd  r   c                    U R                   [        R                  :X  a0  U R                  [        R                  5      S-
  R                  5       $ U $ )Nr  )rM   r   r  r   r   r  )r   s    r   _decodeFp8DecodeScale._decode   s;     =CLLEKK<W		%--(50668c]ccr   c                    UR                  5        VVVs0 s HM  u  p4U[        U[        5      (       a!  U Vs/ s H  oPR                  U5      PM     snOU R                  U5      _MO     snnn$ s  snf s  snnnf rB   )r  rP  r  r  )r   r  r  r~  rh  ts         r   r  Fp8DecodeScale.convert%  sl     )..0
0
 Zt5L5L515a,,q/51RVR^R^_dRee0
 	
1
s   "A3A.A3.A3r`  N)r   r   r   r   )r  r  )
r%   r1   r2   r3   r4   r   staticmethodr  r  r6   r   r   r   r  r    s'    ) d d
r   r  )r   torch.dtype)r   r,   )r   None)rH   r   rI   r   r   r   )r   NNr   )rT   r   rU   r   rV   r   rW   r  rX   r  rY   r   rZ   
int | Noner[   r  r\   r   r   z!tuple[nn.Parameter, nn.Parameter])NNNN)rh   r   r^   r   ri   r   rj   zlist[int] | Nonerk   r=  rf   r=  rl   r  r   r   )
r   ztorch.nn.Moduler   r   r   r   r   r   r   r   )NNF)rT  zlist[str] | None)H
__future__r   	functoolsr{   collections.abcr   dataclassesr   r   torch.nnrP   r   r   activationsr   core_model_loadingr	   quantizers.quantizers_utilsr
   r   utilsr   utils.import_utilsr   r   rB  r   r   r   hub_kernelsr   moer   r   tensor_parallelr   
get_loggerr%   r~   float8_e4m3fnr   r{  r   rw  rS   rv  cacher    r)   r,   r@   _dynamoallow_in_graphrD   rF   rJ   rc   ro   r   rO  r   r   r   r   Moduler   r?  rM  r\  r^  r  r  r   r   r   <module>r     sM   #  	 $ !   $   . U  O 
 * = % 
		H	%   
;;z"&&;;z"&&    G $   + +\  
*     	
      '@ $( $,0'+ # !	
  * % B $( $,0'+BuBuBu #Bu !	Bu
 Bu *Bu %Bu BuJ3
		 3
lDy DNG7
G7G7 G7  	G7
 G7T[7
[7[7 [7  	[7
 [7|]
 ]
@*  01  ejP#3PfG0- G0TV.M V.r
] 
r   