
    3j                     l    S SK Jr  SSKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       rS	/r	g
)    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz JetBrains/Mellum2-12B-A2.5B-Base)
checkpointc                   n  ^  \ rS rSr% SrSrS/rSS0rSSSSSS	S
S	SSSS	S.rSSSSS.r	S/S/4SS/S/4S/S/4S.r
Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   S r\\S!'   S"r\\S#'   S$r\\S%'   S&r\\S''   S(r\\S)'   S*r\\S+'   S,r\\-  S,-  \S-'   S*r\\S.'   S/r \S,-  \S0'   S1r!\\-  \S2'   S3r"\\S4'   S5r#\\S6'   S7r$\\S'   S(r%\\S8'   S*r&\\S9'   S:r'\\S;'   S,r(\S,-  \S<'   S,r)\S,-  \S='   S,r*\\+\   -  S,-  \S>'   S?r,\\S@'   S,r-\+\   S,-  \SA'   S,r.\+\   S,-  \SB'   U 4SC jr/SD r0SEr1U =r2$ )FMellumConfig   uU  
mlp_layer_types (`list[str]`, *optional*):
    Per-layer MLP type — `"dense"` or `"sparse"`. Length must equal
    `num_hidden_layers`. Defaults to all sparse.

```python
>>> from transformers import MellumModel, MellumConfig

>>> configuration = MellumConfig()
>>> model = MellumModel(configuration)
>>> configuration = model.config
```
mellumpast_key_valuesnum_expertsnum_local_expertscolwisereplicated_with_grad_allreducerowwisepacked_colwisemoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_proj!layers.*.mlp.experts.gate_up_projlayers.*.mlp.experts.down_projlayers.*.mlp.expertszlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	ep_routergrouped_gemm)zlayers.*.mlp.gater   r   r   	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei 	  hidden_sizei   intermediate_sizenum_hidden_layers    num_attention_heads   num_key_value_headssilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_epsT	use_cacheFtie_word_embeddingsNrope_parametersattention_biasi   sliding_windowg        attention_dropouti  moe_intermediate_size   num_experts_per_tok@   norm_topk_proboutput_router_logitsgMbP?router_aux_loss_coefpad_token_idbos_token_ideos_token_id   head_dimlayer_typesmlp_layer_typesc                    > U R                   c  S/U R                  -  U l         U R                  c  S/U R                  -  U l        U R                  c  SSS.SSS.S.U l        [        TU ]  " S
0 UDSS	S10D6  g )Nfull_attentionsparsedefaultg    A)	rope_type
rope_thetag     @)rC   sliding_attentionignore_keys_at_rope_validationrH    )r@   r$   rA   r0   super__post_init__)selfkwargs	__class__s     i/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/mellum/configuration_mellum.pyrL   MellumConfig.__post_init__o   s    # 01D4J4JJD'$,:0F0F#FD '09"R3<G%T$D 
 	 	
	
,?AQ+R	
    c                     U$ )NrJ   )rM   rN   s     rP   convert_rope_params_to_dict(MellumConfig.convert_rope_params_to_dict   s    rR   )r@   rA   r0   )3__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapbase_model_tp_planbase_model_ep_planbase_model_pp_planr!   int__annotations__r"   r#   r$   r&   r(   r*   strr+   r,   floatr-   r.   boolr/   r0   dictr   r1   r2   r3   r4   r6   r   r8   r9   r:   r;   r<   r=   listr?   r@   rA   rL   rT   __static_attributes____classcell__)rO   s   @rP   r
   r
      s    J#4"5 	*M &/%.%.%E%E%.-=*3 0"+ )"+$ )-;*8 0	 &(9:#%568IJ!"_$56 JK!s!s!!  J#)S)#u#L%It %%48OTN*T18 ND !%NC$J%%(us{(!$3$  KND!&$&"'%'#L#*##L#*#+/L#S	/D(/Hc$(KcT!((,OT#Y%,
" rR   r
   N)
huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__rJ   rR   rP   <module>ro      sK   * / 3 1 # =>d# d  ?dN 
rR   