
    3jV8                         S SK Jr  SSKJrJrJrJr  SSKJr  SSK	J
r
  \" 5       (       a  S SKr\(       a  SSKJr  SS	KJr  \R                   " \5      r " S
 S\5      rg)    )TYPE_CHECKING   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModel)FineGrainedFP8Configc                      ^  \ rS rSr% SrSrS\S'   U 4S jrS rSS	S
\	S\
4S jrSS	S
\	SSS\4U 4S jjrSS jr  SS jrS rS r\S\
4S j5       r\S\
4S j5       rS rS rS\
4S jrS rS rSrU =r$ )FineGrainedFP8HfQuantizer   zz
FP8 quantization implementation supporting both standard and MoE models.
Supports both e4m3fn formats based on platform.
Fr   quantization_configc                 (   > [         TU ]  " U40 UD6  g )N)super__init__)selfr   kwargs	__class__s      k/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   "FineGrainedFP8HfQuantizer.__init__   s    ,77    c                    [        5       (       d  [        S5      eU R                  R                  (       a  g [        R
                  R                  5       (       dR  [        5       (       dC  U R                  (       a'  [        R                  S5        SU R                  l        g [        S5      e[        R
                  R                  5       (       ab  [        R
                  R                  5       nUu  pEUS:  d  US:X  a4  US:  a.  [        R                  SU SU S	35        SU R                  l        g UR                  S
5      nUc  [        R                  S5        g [        U[        5      (       aV  U R                  (       dD  [!        U5      S:  a4  SUR#                  5       ;   d  SUR#                  5       ;   a  [%        S5      eg g g g )NzMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)zUsing FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is availableTzANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.z`. We will default to dequantizing the model to bf16. Feel free to use a different quantization method like bitsandbytes or torchao
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. r	   cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   
dequantizetorchcudais_availabler   pre_quantizedloggerwarning_onceRuntimeErrorget_device_capabilityget
isinstancedictlenvalues
ValueError)r   argsr   compute_capabilitymajorminorr   s          r   validate_environment.FineGrainedFP8HfQuantizer.validate_environment   s   &((mnn##..zz&&((1G1I1I!!## [ 7;((3"#fgg::""$$!&!A!A!C-LE	uzeai####('5' 2Z[
 7;((3ZZ-
6
 
D))&&
Oa'j//11Vz?P?P?R5R k  6S ( ' *r   modelr   
param_namereturnc                     SSK JnJn  [        X5      u  pg[	        XeU45      (       a  U R
                  (       d  US:X  a  ggg)Nr   )
FP8Experts	FP8LinearbiasFT)integrations.finegrained_fp8r<   r=   r   r-   r'   )r   r8   r9   r   r<   r=   moduletensor_names           r   param_needs_quantization2FineGrainedFP8HfQuantizer.param_needs_quantizationN   s;    H25Ef*566!![F%:r   paramztorch.Tensorc                 R   > U R                  X5      (       a  g[        TU ]	  XU5      $ )z4Return the element size (in bytes) for `param_name`.r	   )rB   r   param_element_size)r   r8   r9   rD   r   s       r   rF   ,FineGrainedFP8HfQuantizer.param_element_sizeZ   s)    ((;;w)%UCCr   c                     U R                   R                  nU(       d  gSSKJn  U" U5      n/ nU H2  nUnU H  nUR	                  U5      u  pyM     UR                  U5        M4     XPR                   l        g)u  Rewrite the skip-list to the model's own module tree.
For models that were already released, if they have a list of modules to not quantize
we need to apply the weight renaming / weight conversion opérations to get the actual
layer name of the model in `transformers`.
Nr   )get_model_conversion_mapping)r   modules_to_not_convertconversion_mappingrI   rename_source_keyappend)
r   r8   skiprI   	renamingsremappednamerenamedrename_s
             r   !_normalize_modules_to_not_convert;FineGrainedFP8HfQuantizer._normalize_modules_to_not_converta   su     ''>>E07	DG##55g>
 $OOG$	 
 ;C  7r   c                     SSK Jn  U R                  U5        U R                  XR                  R
                  UR                  5      U l        U" UU R
                  U R                  U R                  S9ng )Nr   )replace_with_fp8_linear)rJ   r   r'   )r?   rX   rU   get_modules_to_not_convertr   rJ   _keep_in_fp32_modulesr'   )r   r8   r   rX   s       r   $_process_model_before_weight_loading>FineGrainedFP8HfQuantizer._process_model_before_weight_loadingv   sj    
 	K..u5&*&E&E++BBED_D_'
# (#'#>#> $ 8 8,,	
r   c                    SUR                   R                  ;   a  SSSSSSSSSSSSSSS.nX!l        SSKJn  [        USS 5      nUR                  R                  U5      nU(       ad  S H^  n[        XS 5      =(       d    0 nUR                  5        VV	s0 s H  u  pXR                  X5      _M     n
nn	X:w  d  MR  [        XU
5        M`     U$ s  sn	nf )	NQwen3colwiserowwise)z layers.*.self_attn.q_proj.weightz*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightz*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_invr   )r<   _experts_implementation)base_model_tp_planbase_model_ep_plan)
r   __name__rb   r?   r<   getattr_impl_tp_layer_overridesr,   itemssetattr)r   config	text_planr<   impllayer_overrides	plan_attr	base_plankvupdated_plans              r   update_tp_plan(FineGrainedFP8HfQuantizer.update_tp_plan   s    f&&///4=>G4=>G4=>G4=>G1:;D/89B1:;DI" )2% 	>v8$?$==AA$GI	#Ft<B	IRIZ[IZ#6#6q#< <IZ[,F|<	 J 	  \s   Cc                     gNT r   s    r   is_serializable)FineGrainedFP8HfQuantizer.is_serializable   s    r   c                     g)NFrv   rw   s    r   is_trainable&FineGrainedFP8HfQuantizer.is_trainable   s    r   c                     gru   rv   rw   s    r   is_compileable(FineGrainedFP8HfQuantizer.is_compileable   s    r   c                     SSK Jn  U" U 5      $ )Nr   )Fp8Quantize)r?   r   )r   r   s     r   get_quantize_ops*FineGrainedFP8HfQuantizer.get_quantize_ops   s    >4  r   c                     SSK Jn  SSKJn  U R                  (       a-  U R
                  R                  (       a  U" / SQSU" U 5      /S9/$ / $ )Nr   WeightConverterFp8Dequantize)zweight$weight_scale_invactivation_scaleweightsource_patternstarget_patterns
operations)core_model_loadingr   r?   r   r'   r   r#   )r   r   r   s      r   get_weight_conversions0FineGrainedFP8HfQuantizer.get_weight_conversions   sK    8@$":":"E"E  $W$, -d 34  	r   c                 :    [        U R                  SS5      nUS:H  $ )zNMXFP8 checkpoints ship E8M0 (uint8) per-block scales; plain FP8 ships float32.quant_methodNmxfp8)re   r   )r   r   s     r   	_is_mxfp8#FineGrainedFP8HfQuantizer._is_mxfp8   s!    t77Nw&&r   c           	      h   SSK Jn  SSKJn  / nU H~  n[	        XR5      (       aZ  [        S UR                   5       5      (       a9  U" UR                  UR                  U" U 5      /[        UR                  5      -   S9nUR                  U5        M     UR                  U" S/SU" U 5      /S95        U$ )a5  
Native MXFP8 path: prepend a `Fp8DecodeScale` op so the uint8 E8M0
scales are decoded to float32 `2 ** (byte - 127)` *before* any merge/concat op
and add a generic fallback converter that decodes the scales of plain `FP8Linear` weights (attention / dense projections)
which have no model-specific converter.
r   r   )Fp8DecodeScalec              3   B   #    U  H  oR                  S 5      v   M     g7f).weightN)endswith).0ps     r   	<genexpr>MFineGrainedFP8HfQuantizer._update_weight_conversions_mxfp8.<locals>.<genexpr>   s     8mXlSTI9N9NXls   r   r   )r   r   r?   r   r-   anyr   _original_target_patternslistr   rM   )r   weight_conversionsr   r   updatedconvs         r    _update_weight_conversions_mxfp8:FineGrainedFP8HfQuantizer._update_weight_conversions_mxfp8   s     	9A&D$00S8mX\XlXl8m5m5m&$($8$8$($B$B .t 45T__8MM
 NN4  ' 	!3 4 2*401	
 r   c           	         SSK JnJn  SSKJn  U" SSS9nU/[        U5      -   nU R                  (       a  U R                  R                  (       dI  U R                  (       a&  U R                  5       (       a  U R                  U5      $ XR                  5       -   $ / nU GH  n[        Xr5      (       d  UR                  U5        M'  UR                   Vs/ s H  oR                  S5      (       d  M  UPM     n	nU	(       a  U	 Vs/ s H  oS-   PM	     n
nU	 Vs/ s H  oS	[!        S5      *  S
-   PM     nnUR                   Vs/ s H  oR                  S5      (       a  M  UPM     nnX-   U-   nU" U 5      /[        UR"                  5      -   nU" UUR$                  US9nUR                  U5        GM     UR'                  U R                  5       5        U$ s  snf s  snf s  snf s  snf )u]  When loading with ``dequantize=True``, attach an :class:`Fp8Dequantize` op to
every existing :class:`WeightConverter` so that per-block scales are folded into
the weight *before* any later merge/concat ops collapse the per-expert structure.

For each model-supplied converter that has a ``.weight`` source, we:
  1. anchor the existing weight patterns with ``$`` so they don't accidentally
     also match the ``.weight_scale_inv`` keys (the regex is searched, so the
     unanchored prefix would match both, sending scales to the wrong bucket);
  2. add anchored ``*.weight_scale_inv`` sources next to each weight pattern so
     the loader collects scale tensors alongside the weight tensors into the
     *same* converter bucket (both keys rewrite to the same target);
  3. prepend a fresh :class:`Fp8Dequantize` op so dequant runs first, before
     any merge/concat collapses the per-expert structure.

The generic ``weight$ + weight_scale_inv → weight`` converter from
:meth:`get_weight_conversions` is still appended at the end as a fallback for
plain ``nn.Linear`` weights with no model-specific converter.
r   )r   WeightRenamingr   z^(.+)\.scale$z\1.weight_scale_inv)r   r   r   $Nz.weight_scale_inv$r   )r   r   r   r?   r   r   r'   r   r#   r   r   r   r-   rM   r   r   r/   r   r   extend)r   r   r   r   r   scale_renamer   r   r   weight_sourcesanchored_weightscale_sourcesothernew_sourcesnew_opss                  r   update_weight_conversions3FineGrainedFP8HfQuantizer.update_weight_conversions   s   & 	I@ &6FXno*^d3E.FF""t'?'?'J'J!!dnn&6&6<<=OPP%(C(C(EEE&D d44t$)-)=)=W)=AIAVa)=NW4B"CNqs7N"CVd eVdQR#4c)n_!58L!LVd e$($8$8V$8q

9@U$8V-=E(./$t2GG&$/$($B$B&
 NN4 % '( 	t2245 X"C eVs$   G0GGG G%G%)rJ   )r8   r   )rd   
__module____qualname____firstlineno____doc__requires_calibration__annotations__r   r6   strboolrB   floatrF   rU   r[   rr   rx   propertyr{   r~   r   r   r   r   r   __static_attributes____classcell__)r   s   @r   r   r      s    
 !//8.`
.? 
S 
_c 
D(9 Ds DSa Dfk DC*
 
&$L d     !
 '4 '
<: :r   r   )typingr   utilsr   r   r   r   baser
   quantizers_utilsr   r$   modeling_utilsr   utils.quantization_configr   
get_loggerrd   r(   r   rv   r   r   <module>r      sI      ` `  2 0@			H	%Z Zr   