
    3j                        S SK r S SKJr  SSKJr  SSKJr  SSKJ	r	J
r
  SSKJrJr  SS	KJrJrJr  SS
KJr  SSKJrJrJrJr  \R2                  " \5      r\" SS9\ " S S\5      5       5       r " S S\5      r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r#/ SQr$g)    N)strict   )initialization)PreTrainedConfig)ROPE_INIT_FUNCTIONSRopeParameters)auto_docstringlogging   )LagunaDecoderLayerLagunaModelLagunaRotaryEmbedding)Qwen3MoeConfig)Qwen3MoeAttentionQwen3MoeForCausalLMQwen3MoePreTrainedModelQwen3MoeSparseMoeBlockz JetBrains/Mellum2-12B-A2.5B-Base)
checkpointc                   4   \ rS rSr% SrSrSr\\S'   Sr	\\S'   Sr
\\S	'   S
r\\S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\   S-  \S'   Sr\\   S-  \S'   Sr\\-  S-  \S'   \" 5       r\" 5       r\" 5       rS rS rSr g)MellumConfig&   uU  
mlp_layer_types (`list[str]`, *optional*):
    Per-layer MLP type — `"dense"` or `"sparse"`. Length must equal
    `num_hidden_layers`. Defaults to all sparse.

```python
>>> from transformers import MellumModel, MellumConfig

>>> configuration = MellumConfig()
>>> model = MellumModel(configuration)
>>> configuration = model.config
```
mellumi  
vocab_sizei 	  hidden_sizei   intermediate_size   num_hidden_layers   head_dimi   max_position_embeddingsi   Nsliding_window@   num_expertsi  moe_intermediate_sizeTnorm_topk_problayer_typesmlp_layer_typesrope_parametersc                     U R                   c  S/U R                  -  U l         U R                  c  S/U R                  -  U l        U R                  c  SSS.SSS.S.U l        [        R
                  " U 40 UDSS	S10D6  g )
Nfull_attentionsparsedefaultg    A)	rope_type
rope_thetag     @)r*   sliding_attentionignore_keys_at_rope_validationr/   )r&   r   r'   r(   r   __post_init__selfkwargss     c/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/mellum/modular_mellum.pyr1   MellumConfig.__post_init__K   s    # 01D4J4JJD'$,:0F0F#FD '09"R3<G%T$D 
 	&&	
	
 -@AQ+R	
    c                     U$ N r2   s     r5   convert_rope_params_to_dict(MellumConfig.convert_rope_params_to_dict]   s    r7   )r&   r'   r(   )!__name__
__module____qualname____firstlineno____doc__
model_typer   int__annotations__r   r   r   r   r    r!   r#   r$   r%   boolr&   liststrr'   r(   dictr   AttributeErroruse_sliding_windowdecoder_sparse_stepmlp_only_layersr1   r;   __static_attributes__r:   r7   r5   r   r   &   s     JJK!s!sHc#)S)!%NC$J%K!$3$ND$(KcT!((,OT#Y%,48OTN*T18')(*$&O
$r7   r   c                       \ rS rSrSrg)MellumRotaryEmbeddingb   r:   Nr=   r>   r?   r@   rM   r:   r7   r5   rO   rO   b       r7   rO   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )MellumAttentionf   config	layer_idxc                 |   > [         TU ]  X5        UR                  U   S:X  a  UR                  U l        g S U l        g )Nr/   )super__init__r&   r!   r3   rV   rW   	__class__s      r5   rZ   MellumAttention.__init__g   s9    +7=7I7I)7TXk7kf33qur7   )r!   	r=   r>   r?   r@   r   rC   rZ   rM   __classcell__r\   s   @r5   rT   rT   f   s    v| v v vr7   rT   c                       \ rS rSrSrg)MellumSparseMoeBlockl   r:   NrQ   r:   r7   r5   rb   rb   l   rR   r7   rb   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )MellumDecoderLayerp   rV   rW   c                 B   > [         TU ]  5         [        X5      U l        g r9   )rY   rZ   rT   	self_attnr[   s      r5   rZ   MellumDecoderLayer.__init__q   s    (;r7   )rh   r^   r`   s   @r5   re   re   p   s    <| < < <r7   re   c                   P   ^  \ rS rSr\R
                  " 5       U 4S j5       rSrU =r$ )MellumPreTrainedModelv   c                   > [         TU ]  U5        [        U[        5      (       a  UR                   H  nUR
                  nUR                  U   S:w  a  [        UR                  U      nU" UR                  US9u  pE[        R                  " [        X S35      U5        [        R                  " [        X S35      U5        M     g g )Nr,   )
layer_type	_inv_freq_original_inv_freq)rY   _init_weights
isinstancerO   r&   compute_default_rope_parametersr-   r   rV   initcopy_getattr)r3   modulern   rope_init_fncurr_inv_freq_r\   s         r5   rq   #MellumPreTrainedModel._init_weightsw   s    f%f344$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 5r7   r:   )	r=   r>   r?   r@   torchno_gradrq   rM   r_   r`   s   @r5   rk   rk   v   s    
]]_	^ 	^r7   rk   c                       \ rS rSrSrg)MellumModel   r:   NrQ   r:   r7   r5   r   r      rR   r7   r   c                       \ rS rSrSrg)MellumForCausalLM   r:   NrQ   r:   r7   r5   r   r      rR   r7   r   )r   r   r   rk   )%r|   huggingface_hub.dataclassesr    r   rt   configuration_utilsr   modeling_rope_utilsr   r   utilsr	   r
   laguna.modeling_lagunar   r   r   !qwen3_moe.configuration_qwen3_moer   qwen3_moe.modeling_qwen3_moer   r   r   r   
get_loggerr=   loggerr   rO   rT   rb   re   rk   r   r   __all__r:   r7   r5   <module>r      s     . & 3 - [ [ >  
		H	% =>7> 7  ?7t	1 	v' v	1 	<+ <^3 ^	+ 		+ 	r7   