
    3jj                       S SK Jr  S SKJr  S SKJr  SSKJr  SSKJ	r	  SSK
JrJrJrJr  SS	KJr  SS
KJr  \" 5       (       a:  S SKr\R(                  R+                  \5      r\R(                  R+                  \5      r\R,                  " \5      r  S"         S#S jjr          S$S jrS%S jrS%S jrS rS r\" 5       (       aS  \R>                  RA                  S\SSS9  \R>                  RC                  S\5        \R>                  RE                  S\\S9  S&S jr#        S%S jr$  S"           S'S jjr%          S$S jr& " S S\	5      r'\'" 5       r(S(S jr) S)\(SSSSS .             S*S! jjjr*g)+    )annotations)Callable)wraps   )logging)GeneralInterface)is_torch_availableis_torch_greater_or_equalis_torch_less_or_equalis_torchdynamo_compiling   )deepgemm_bf16_experts_forward)sonicmoe_experts_forwardNFc                   U(       a6  [         R                  " U R                  S5      U5      R                  S5      nO4[         R                  " XR                  S5      5      R                  S5      nUb  UR	                  U5        U$ )a  Batched linear layer supporting optional bias and transposed weights.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (batch_size, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
        else of shape (batch_size, input_dim, output_dim).
    bias (`torch.Tensor`, *optional*):
        Bias tensor of shape (batch_size, output_dim). Default is `None`.
    is_transposed (`bool`, *optional*, defaults to `False`):
        Whether the weight tensor is transposed.
Returns:
    `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
r   )torchbmm	unsqueezesqueezeadd_)inputweightbiasis_transposedouts        W/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/integrations/moe.py_batched_linearr   T   se    * ii*F3;;A> ii 34<<R@J    c                   UR                  S5      nUR                  S5      nUR                  S5      nUR                  USS9nUR                  S5      nUR                  S5      n	U	R                  SU R                  S-
  5      n	U R
                  (       a2  U R                  U	   n
U R                  (       a  U R                  U	   OS nO1U R                  U	   n
U R                  (       a  U R                  U	   OS n[        XzXR                  S9nU R
                  (       a  U R                  U5      nOU R                  U5      nU R                  U	   n
U R                  (       a  U R                   U	   OS n[        XXR                  S9nXR#                  S5      -  nUR%                  XTU5      R'                  SS9nUR)                  UR*                  5      $ )Nr   r   dimr   r   r   )sizerepeat_interleavereshapeclampnum_expertshas_gategate_up_projhas_biasgate_up_proj_biasup_projup_proj_biasr   r   _apply_gateact_fn	down_projdown_proj_biasr   viewsumtodtype)selfhidden_statestop_k_indextop_k_weights	num_top_k
num_tokens
hidden_dimselected_hidden_statessample_weights
expert_idsselected_weightsselected_biasesproj_outweighted_outfinal_hidden_statess                  r   batched_mm_experts_forwardrE   v   s      $I##A&J##B'J +<<YA<N"**2.N$$R(J !!!T%5%5%9:J }},,Z8@D$00<SW<<
3;?==$++J7d VhVhH
 }}##H- ;;x( ~~j19=d))*5DO HZHZH
 66r::L '++J:NRRWXRY!!-"5"566r   c                *   [         R                  " U R                  S5      UR                  S5      U R                  U R                  S9nSn[        UR                  5       5       H*  u  pVXF:X  a  M  [         R                  " XU X   X4U S9  UnM,     U$ )a  
Fallback grouped matrix multiplication used when `torch.nn.functional.grouped_mm` and `torch._grouped_mm`
are unavailable or incompatible with `torch.compile` (e.g. non-bfloat16 weights).

Args:
    input (`torch.Tensor`): Input of shape (S, input_dim), sorted by expert id.
    weight (`torch.Tensor`): Expert weights of shape (num_experts, input_dim, output_dim).
    offs (`torch.Tensor`): Cumulative token counts per expert of shape (num_experts,).
Returns:
    `torch.Tensor`: Output of shape (S, output_dim).
r   r   devicer5   r   )r   zerosr#   rH   r5   	enumeratetolistmm)r   r   offsoutputstartiends          r   _grouped_mm_fallbackrS      s     [[AAu||SXS^S^_FE DKKM*<S!69&s2CD	 + Mr   c                z   U R                  5       S:X  d   S[        U R                  5       35       eUR                  5       S:X  d   S[        UR                  5       35       eUR                  5       S:X  d   S[        UR                  5       35       eUR                  S5      UR                  S5      :X  d+   SUR                  S5       S	UR                  S5       35       eU R                  S5      UR                  S5      :X  d+   S
U R                  S5       SUR                  S5       35       eUR                  [
        R                  [
        R                  4;   d   SUR                   35       e[
        R                  " U R                  S5      UR                  S5      U R                  U R                  S9$ )zRShape/dtype inference stub for `_grouped_mm_fallback` required by `torch.compile`.r   z+input must be 2D (S, input_dim), got shape    zBweight must be 3D (num_experts, input_dim, output_dim), got shape r   z*offs must be 1D (num_experts,), got shape r   zoffs length z must match number of experts zinput_dim mismatch: input has z, weight has z$offs must be an integer tensor, got rG   )
r!   tupleshaper#   r5   r   int32int64emptyrH   r   r   rN   s      r   _grouped_mm_fallback_faker\      s   99;!_J5QVQ\Q\K]J^__::<1 
LUSYS_S_M`Lab 88:?\HtzzIZH[\\?99Q<6;;q>)v\$))A,Geflfqfqrsfteu+vv)::a=FKKN* 
(A}V[[QR^DTU* ::%++u{{33h7[\`\f\f[g5hh3;;uzz!}fkk!nU\\QVQ\Q\]]r   c                H    U R                  US   US   5        US   U l        g)zjSaves input and weight for backward; offs is stored directly as it is a non-differentiable integer tensor.r   r   r   N)save_for_backwardrN   )ctxinputsrO   s      r   "_grouped_mm_fallback_setup_contextra      s%    &)VAY/ayCHr   c                   U R                   u  p#[        R                  " U5      n[        R                  " U5      nSn[        U R                  R                  5       5       HZ  u  pxXh:X  a  M  [        R                  " XU X7   R                  XFU S9  [        R                  " X&U R                  XU XW   S9  UnM\     XES4$ )zuBackward pass for `_grouped_mm_fallback`. Computes grad_input and grad_weight per expert group; offs has no gradient.r   rI   N)saved_tensorsr   
zeros_likerK   rN   rL   rM   T)	r_   grad_outputr   r   
grad_inputgrad_weightrP   rQ   rR   s	            r   _grouped_mm_fallback_backwardri      s    %%ME!!%(J""6*KE CHHOO-.<3'*3:OPS!##[s%;P / D((r   z!transformers::grouped_mm_fallback z4(Tensor input, Tensor weight, Tensor offs) -> Tensor)mutates_argsschema)setup_contextc                Z   [        5       (       a  UR                  [        R                  :w  dW  UR                  R
                  S:X  a>  [        SSS9(       a/  UR                  5       S-  S:w  d  U R                  5       S-  S:w  a  gUR                  R
                  S:X  a  [        [        R                  R                  S	5      (       a,  [        R                  R                  UR                  5      S
:  $ [        [        S5      (       ag  [        SSS9(       a,  [        R                  R                  UR                  5      S
:  $ [        R                  R                  UR                  5      S:  $ g[        [        R                  R                  S	5      =(       d    [        [        S5      $ )a  
Check if torch.nn.functional.grouped_mm or torch._grouped_mm can be used based on availability and compatibility with torch.compile.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (S, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (num_experts, input_dim, output_dim).
    offs (`torch.Tensor`):
        Offsets tensor indicating the boundaries of each group in the input tensor.
Returns:
    `bool`: True if grouped_mm can be used, False otherwise.
cpuz2.10.0T)
accept_dev   r   Fcuda
grouped_mm)   r   _grouped_mmz2.9)	   r   )r   r5   r   bfloat16rH   typer   data_ptrhasattrnn
functionalrr   get_device_capabilityr
   r[   s      r   _can_use_grouped_mmr~   
  s&    	!""v||u~~'Ee#"8=__#q(ENN,<r,AQ,F 
 }}V#588&&55::33FMMBfLL5-(((4@zz77F&PPzz77F&PP588&&5V9VVr   c                   [        XU5      (       a  [        [        R                  R                  S5      (       aA  [        R                  R                  R                  U R                  UR                  5      XS9$ [        [        S5      (       a.  [        R                  " U R                  UR                  5      XS9$ [        R                  R                  R                  XUS9$ )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (S, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (num_experts, input_dim, output_dim).
    offs (`torch.Tensor`):
        Offsets tensor indicating the boundaries of each group in the input tensor.
Returns:
    `torch.Tensor`: Output tensor of shape (S, output_dim).
rs   rN   ru   )r~   rz   r   r{   r|   rs   r4   r5   ru   opstransformersgrouped_mm_fallbackr[   s      r   ru   ru   7  s    $ 5$//
 588&&5588&&11%((6<<2H&1\\UM**$$UXXfll%;VOO99!!55e$5OOr   c                    U(       a  [        XUS9nO[        XR                  SS5      US9nUb  UR                  U5        U$ )a  Grouped linear layer supporting optional bias and transposed weights.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (S, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (num_experts, input_dim, output_dim) if `is_transposed`,
        else of shape (num_experts, output_dim, input_dim).
    offs (`torch.Tensor`):
        Offsets tensor indicating the boundaries of each group in the input tensor.
    bias (`torch.Tensor`, *optional*):
        Bias tensor of shape (num_experts, output_dim). Default is `None`.
    is_transposed (`bool`, *optional*, defaults to `False`):
        Whether the weight tensor is transposed.
Returns:
    `torch.Tensor`: Output tensor of shape (S, output_dim).
r   r   )ru   	transposer   )r   r   rN   r   r   r   s         r   _grouped_linearr   V  sD    0 %d3 %!1!1"b!9EJr   c                   UR                   nUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      n	[        R                  " U	5      u  pXU-     nX   nUR
                  S;   a  U
R                  5       OU
R                  5       n[        R                  " XR                  SU R                  S-
  S9n[        R                  " US[        R                  S9nXR                  :  R                  S5      nU
R                  U R                  S-
  S9  U R                  (       a/  U R                  nU R                   (       a  U R"                  U
   OS nO.U R$                  nU R                   (       a  U R&                  U
   OS nUR)                  US5        [+        UUUUU R,                  S	9nU R                  (       a  U R/                  U5      nOU R1                  U5      nU R2                  nU R                   (       a  U R4                  U
   OS n[+        UUUUU R,                  S	9nUUR                  S5      -  nUR)                  US5        [        R6                  " U5      n[        R8                  " UR                  S5      US
9UU'   UU   nUR;                  XeU5      R=                  SS9nUR?                  UR@                  5      $ )Nr   r   )ro   mpsr   )binsminmax)r!   r5   )r   g        r"   )rH   r    )!rH   r#   r%   r   sortrx   floatinthistcr'   cumsumrX   r   clamp_r(   r)   r*   r+   r,   r-   masked_fill_r   r   r.   r/   r0   r1   
empty_likearanger2   r3   r4   r5   )r6   r7   r8   r9   rH   r:   r;   r<   r>   r?   expert_ids_gpermselected_hidden_states_gsample_weights_ghistc_inputtokens_per_expertoffsetssentinel_maskr@   rA   rB   rC   inv_permrD   s                           r   grouped_mm_experts_forwardr   |  s    !!F  $I##A&J##B'J #**2.N$$R(J J/L,Y->?%+ +1++*G,$$&\M]M]M_KK6F6FASWScScfgSghll,!5;;GG" "%5%55@@DMD,,q01 }},,BF--$00>UY<<=A]]$++L9PT ))-=  "2G/aeasasH
 }}##H- ;;x( ~~;?==d)),7dO "G/QUQcQcH
 .88<<L mS1 %H\\$))A,v>HTN)L '++J:NRRWXRY!!-"5"566r   c                  >   ^  \ rS rSrSr\\\\S.r	SU 4S jjr
SrU =r$ )ExpertsInterfacei  z;Interface for registering custom experts forward functions.)deepgemm
batched_mmrs   sonicmoec                   > Uc  [         R                  S5        OUS:w  a  X;  a  [        SU S35      e[        TU ]  X5      $ )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.a
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r6   experts_implementationdefault	__class__s      r   get_interfaceExpertsInterface.get_interface  s[    !)N
 $w.3I3U*++wx  w{1;;r   rj   )r   strr   r   returnr   )__name__
__module____qualname____firstlineno____doc__r   rE   r   r   _global_mappingr   __static_attributes____classcell__)r   s   @r   r   r     s%    E 200,	O< <r   r   c                N    UR                  SSS9u  p#U R                  U5      U-  $ )a{  
Default gating mechanism: splits the gate_up_out into gate and up parts,
applies the activation function to the gate part, and multiplies it with the up part.
Args:
    gate_up_out (`torch.Tensor`):
        The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
Returns:
    `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
r   r   r    )chunkr/   )r6   gate_up_outgateups       r   _default_apply_gater      s/        +HD;;tr!!r   T)experts_interfaceis_concatenatedr   r*   r(   c               >   ^^^^^ SUUUUU4S jjnU b  U" U 5      $ U$ )a\  Decorator to modify experts class to support different experts implementations.

Args:
    experts_class (`type[torch.nn.Module]`, *optional*):
        The experts class to modify. If not provided, returns a decorator that can be applied to the class.
    experts_interface (`ExpertsInterface`, *optional*, defaults to `ALL_EXPERTS_FUNCTIONS`):
        The experts interface to use for dispatching the forward method.
    is_concatenated (`bool`, *optional*, defaults to `True`):
        Whether the expert weights are stored in concatenated layout [gate;up]
        or interleaved layout [gate0, up0, gate1, up1, ...].
    is_transposed (`bool`, *optional*, defaults to `False`):
        Whether the expert weights are stored in transposed format.
    has_bias (`bool`, *optional*, defaults to `False`):
        Whether the expert layers include bias terms or not.
    has_gate (`bool`, *optional*, defaults to `True`):
        Whether the experts use a gating mechanism or not.
        Whether it has gate_up_proj weights or just up_proj weights.

Returns:
    `type[torch.nn.Module]`: The modified experts class.
c                   >^^ U R                   mU R                  m[        T5      UUUU	U4S j5       n[        T5      UU4S j5       n[        U S5      (       d  [        U l        Xl         X l        U $ )Nc                b   > T" X/UQ70 UD6  Xl         TU l        TU l        TU l        TU l        g N)configr(   r*   r   r   )	r6   r   argskwargsr*   r(   r   r   original_inits	       r   __init__=use_experts_implementation.<locals>.wrapper.<locals>.__init__1  s8    $888 K$DM$DM!.D#2D r   c                h   > TR                  U R                  R                  T5      nU" U /UQ70 UD6$ r   )r   r   _experts_implementation)r6   r   r   experts_forwardr   original_forwards       r   forward<use_experts_implementation.<locals>.wrapper.<locals>.forward:  s5    /==dkk>a>acstO"49$9&99r   r.   )r   r   r   rz   r   r.   )
experts_classr   r   r   r   r   r*   r(   r   r   s
      @@r   wrapper+use_experts_implementation.<locals>.wrapper-  sy    %..(00	}		3 	3 
	3 
	 	: 
!	: }m44(;M%!) 'r   )r   type[torch.nn.Module]r   r   rj   )r   r   r   r   r*   r(   r   s    ````` r   use_experts_implementationr     s%    > 2  }%%Nr   )NF)
r   torch.Tensorr   r   r   torch.Tensor | Noner   boolr   r   )
r6   ztorch.nn.Moduler7   r   r8   r   r9   r   r   r   )r   r   r   r   rN   r   r   r   )r   r   r   r   rN   r   r   r   )r   r   r   r   rN   r   r   r   r   r   r   r   )r   r   r   r   r   )r   ztype[torch.nn.Module] | Noner   r   r   r   r   r   r*   r   r(   r   r   r   )+
__future__r   collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr	   r
   r   r   r   r   r   r   r   _dynamoassume_constant_result
get_loggerr   r   r   rE   rS   r\   ra   ri   library	custom_opregister_fakeregister_autogradr~   ru   r   r   r   ALL_EXPERTS_FUNCTIONSr   r   rj   r   r   <module>r      s   # $   ,  4 . 
 !& D DE^ _"]]AABXY 
		H	%\ !%	  	
 D=7
=7=7 =7  	=7
 =7F4^)& 	MM+E	   
MM CE^_	MM##+%8 $ *WZPPP P 	PF !%### # 	#
 # #Le7
e7e7 e7  	e7
 e7P<' <2 )* " 37; +@ ;/; (; 	;
 ; ; ; ;r   