
    3jN                        S r SSKJr  SSKJrJrJr  SSKrSSKJ	s  J
r  SSKJr  SSKJ	r	  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,  SSK-J.r.  SSK/J0r0J1r1  SSK2J3r3  SSK4J5r5J6r6J7r7J8r8  SSK9J:r:J;r;  SSK<J=r=J>r>  \%R~                  " \@5      rA\$" SS9\ " S S\35      5       5       rB " S  S!\85      rC " S" S#\,5      rD " S$ S%\65      rE " S& S'\:5      rF " S( S)\=5      rG " S* S+\>5      rH\( " S, S-\*5      5       rI " S. S/\.5      rJ " S0 S1\75      rK " S2 S3\05      rL " S4 S5\55      rM/ S6QrNg)7zPyTorch Laguna model.    )Callable)AnyLiteralOptionalN)strict)nn   )initialization)CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)MoeModelOutputWithPast)ROPE_INIT_FUNCTIONS)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging)TransformersKwargsno_inherit_decorator   )AfmoeAttention)Gemma3RotaryEmbedding)Glm4MoeLiteDecoderLayer)
LlamaModeleager_attention_forward)Qwen2MoeConfig)Qwen2MoeForCausalLMQwen2MoeMLPQwen2MoePreTrainedModelQwen2MoeRMSNorm)Qwen3_5MoeTopKRouterapply_rotary_pos_emb)Qwen3MoeExpertsQwen3MoeSparseMoeBlockzpoolside/laguna-XS.2)
checkpointc                      \ rS rSr% SrSr0 SS_SS_SS_SS_S	S
_SS_SS_SS_SS_SS
_SS_SS
_SS_SS_SS_SS
_rSr\\	S'   Sr
\\	S'   Sr\\	S'   Sr\\	S'   S r\\	S!'   S"r\\	S#'   S$r\\	S%'   S r\\	S&'   S'r\\	S('   S'r\\	S)'   S'r\\	S*'   S+r\\	S,'   S-r\\	S.'   S/r\\   S/-  \	S0'   S/r\\   S/-  \	S1'   S2r\\	S3'   S-r\\	S4'   S5r\\	S6'   \" 5       r \" 5       r!\" 5       r"\" 5       r#\" 5       r$\" 5       r%S7 r&S8 r'S9 r(S:r)g/);LagunaConfig0   uB  
num_attention_heads_per_layer (`list[int]`, *optional*):
    Per-layer override for ``num_attention_heads``. Length must equal ``num_hidden_layers``.
mlp_layer_types (`list[str]`, *optional*):
    Per-layer MLP type — ``"dense"`` or ``"sparse"``. Length must equal
    ``num_hidden_layers``. Defaults to first layer dense, rest sparse.
moe_routed_scaling_factor (`float`, *optional*, defaults to 1.0):
    Scalar applied to routed-expert output before combining with the shared-expert output.
moe_apply_router_weight_on_input (`bool`, *optional*, defaults to `False`):
    Whether to apply router weights to the MoE input rather than the output. Not supported
    in transformers yet; ``True`` will raise a ``NotImplementedError`` for now.
moe_router_logit_softcapping (`float`, *optional*, defaults to 0.0):
    Scaling factor when applying tanh softcapping on the logits of the MoE router logits.

Example:

```python
>>> from transformers import LagunaModel, LagunaConfig

>>> configuration = LagunaConfig()
>>> model = LagunaModel(configuration)
>>> configuration = model.config
```
lagunazlayers.*.self_attn.q_projcolwisezlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.g_projzlayers.*.self_attn.o_projrowwisezlayers.*.self_attn.q_normreplicated_with_grad_allreducezlayers.*.self_attn.k_normzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projz!layers.*.mlp.experts.gate_up_projpacked_colwisezlayers.*.mlp.experts.down_projzlayers.*.mlp.expertsmoe_tp_expertsz%layers.*.mlp.shared_experts.gate_projz#layers.*.mlp.shared_experts.up_projz%layers.*.mlp.shared_experts.down_proji  
vocab_sizei    intermediate_size(   num_hidden_layersnum_attention_heads   num_key_value_headsi   max_position_embeddings   num_expertsnum_experts_per_toki   moe_intermediate_sizeshared_expert_intermediate_sizesliding_window   head_dimFattention_biasNnum_attention_heads_per_layermlp_layer_types      ?moe_routed_scaling_factor moe_apply_router_weight_on_input        moe_router_logit_softcappingc                 l   U R                   c  S/U R                  -  U l         U R                  c  S/S/U R                  S-
  -  -   U l        U R                  c  U R                  /U R                  -  U l        SSSS.SS	S
S.S.nU R
                  c  X l        [        R                  " U 40 UDSSS10D6  g )Nfull_attentiondensesparse   defaultg    Ag      ?)	rope_type
rope_thetapartial_rotary_factorg     @rE   rK   sliding_attentionignore_keys_at_rope_validationrT   )layer_typesr5   rD   rC   r6   rope_parametersr   __post_init__)selfkwargsdefault_rope_paramss      c/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/laguna/modular_laguna.pyrX   LagunaConfig.__post_init__   s    # 01D4J4JJD'$+9zT=S=SVW=W/X#XD --5262J2J1KdNdNd1dD. -6Xhkl/8jm!ne
 '#6  	&&	
	
<OQa;b	
    c                     U$ N )rY   rZ   s     r\   convert_rope_params_to_dict(LagunaConfig.convert_rope_params_to_dict   s    r^   c                 B   U R                   (       a  [        S5      eU R                  bR  [        U R                  5      U R                  :w  a/  [        S[        U R                  5       SU R                   S35      e[        U R                  5      U R                  :w  a/  [        S[        U R                  5       SU R                   S35      e[        U R                  5      U R                  :w  a/  [        S[        U R                  5       SU R                   S35      eg)z'Part of ``@strict``-powered validation.zhmoe_apply_router_weight_on_input=True is not yet supported in the transformers implementation of Laguna.Nz&num_attention_heads_per_layer length (z ) must equal num_hidden_layers (z).zlayer_types length (zmlp_layer_types length ()rG   NotImplementedErrorrC   lenr5   
ValueErrorrV   rD   )rY   s    r\   validate_architecture"LagunaConfig.validate_architecture   s,   00%9 
 ..:D6674;Q;QQ8T=_=_9`8a b1151G1G0HL  t D$:$::&s4+;+;'<&= >1151G1G0HL  t##$(>(>>*3t/C/C+D*E F1151G1G0HL  ?r^   )rV   rD   rC   rW   )*__name__
__module____qualname____firstlineno____doc__
model_typebase_model_tp_planr2   int__annotations__r3   r5   r6   r8   r9   r;   r<   r=   r>   r?   rA   rB   boolrC   listrD   strrF   floatrG   rI   AttributeErrordecoder_sparse_stepmlp_only_layersqkv_biasnorm_topk_probuse_sliding_windowmax_window_layersrX   rb   rh   __static_attributes__ra   r^   r\   r*   r*   0   s   2 J#Y#Y 	$Y 	$Y	
 	$Y 	$%E 	$%E 	!) 		 	!) 	,-= 	)) 	 0 	0 	.y  	0!& J!s!s!!  #)S)K  !$3$+.#S.NC Hc ND 6:!49t#3:(,OT#Y%,'*u*-2$d2*- %- )*$&OH#%N')&(
(r^   r*   c                       \ rS rSrSrg)LagunaRMSNorm   ra   Nrj   rk   rl   rm   r~   ra   r^   r\   r   r          r^   r   c                      ^  \ rS rSrS\4U 4S jjr\    SS\S-  S\S   S\S-  S\	S-  S	\
S
\4   4
S jj5       rSrU =r$ )LagunaRotaryEmbedding   configc                 $   > [         TU ]  U5        g r`   )super__init__rY   r   	__class__s     r\   r   LagunaRotaryEmbedding.__init__   s     r^   Ndeviceztorch.deviceseq_len
layer_typereturnztorch.Tensorc           	      v   U R                   U   S   nU R                   U   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        Xe-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  n	X4$ )
a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
rQ   rR   rE   rA   Nr   r   )dtype)r   r   )rW   getgetattrhidden_sizer6   rq   torcharangeint64torv   )
r   r   r   r   baserR   rA   dimattention_factorinv_freqs
             r\   compute_default_rope_parameters5LagunaRotaryEmbedding.compute_default_rope_parameters   s    . %%j1,? & 6 6z B F FG^`c d6:t4h8J8JfNhNh8h(23 U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r^   ra   )NNNN)rj   rk   rl   rm   r*   r   staticmethodr   rq   ru   tuplerv   r   r~   __classcell__r   s   @r\   r   r      sy    !| ! &*+/"!%	"*t#"*("* t"* $J	"*
 
~u$	%"* "*r^   r   c                       \ rS rSrSrg)	LagunaMLP   ra   Nr   ra   r^   r\   r   r      r   r^   r   c                      ^  \ rS rSrU 4S jrS\R                  S\\R                  \R                  \R                  4   4S jrSr	U =r
$ )LagunaTopKRouter   c                    > [         TU ]  5         [        R                  " [        R
                  " UR                  5      SS9U l        UR                  U l	        g )NF)requires_grad)
r   r   r   	Parameterr   zerosr;   e_score_correction_biasrI   router_logit_softcappingr   s     r\   r   LagunaTopKRouter.__init__   s?    ')||EKK@R@R4Sch'i$(.(K(K%r^   hidden_statesr   c                 V   UR                  SU R                  5      n[        R                  " XR                  5      R                  5       nU R                  S:  a/  [        R                  " X R                  -  5      U R                  -  n[        R                  " U5      nX0R                  R                  UR                  5      -   n[        R                  " X@R                  SS9u  pVUR                  SU5      nXwR!                  SSS9-  nUR                  UR                  5      nX'U4$ )NrH   )r   T)r   keepdim)reshape
hidden_dimFlinearweightrv   r   r   tanhsigmoidr   r   r   topktop_kgathersum)rY   r   router_logitsrouting_scoresscores_for_selection_selected_expertsrouting_weightss           r\   forwardLagunaTopKRouter.forward   s     &--b$//B<BBD((3.!JJ}7T7T'TUX\XuXuuM}5-0L0L0O0OP^PdPd0ee#jj)=zzrR(//4DE),?,?BPT,?,UU),,]-@-@A/???r^   )r   r   )rj   rk   rl   rm   r   r   Tensorr   r   r~   r   r   s   @r\   r   r      sE    L
@||@ 
u||U\\5<<7	8@ @r^   r   c                       \ rS rSrSrg)LagunaExperts   ra   Nr   ra   r^   r\   r   r      r   r^   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )LagunaSparseMoeBlocki  r   c                 v   > [         TU ]  U5        [        XR                  S9U l        UR
                  U l        g )Nr3   )r   r   r   r>   shared_expertsrF   routed_scaling_factorr   s     r\   r   LagunaSparseMoeBlock.__init__  s1     'BhBhi%+%E%E"r^   r   r   c                     UR                   u  p#nUR                  SU5      nU R                  U5      nU R                  U5      u  pgnU R	                  XU5      nXR
                  -  nX-   nUR                  X#U5      nU$ )Nr   )shapeviewr   gateexpertsr   r   )	rY   r   
batch_sizesequence_lengthr   shared_outputr   r   r   s	            r\   r   LagunaSparseMoeBlock.forward	  s    2?2E2E/
Z%**2z:++M:/3yy/G,,]oV%(B(BB%5%--j:Vr^   )r   r   )rj   rk   rl   rm   r*   r   r   r   r   r~   r   r   s   @r\   r   r     s1    F| F
U\\ ell  r^   r   c                     ^  \ rS rSrSrS\S\S\4U 4S jjr SS\R                  S	\
\R                  \R                  4   S
\R                  S-  S\S-  S\\   S\
\R                  \R                  S-  4   4S jjrSrU =r$ )LagunaAttentioni  zSAfmoe-style SWA/GQA attention with Laguna-specific gating and per-layer head count.r   	layer_idx	num_headsc                   > X0l         [        TU ]	  X5        U R                   UR                  -  U l        [
        R                  " UR                  U R                   U R                  -  UR                  S9U l
        [
        R                  " U R                   U R                  -  UR                  UR                  S9U l        U ?[
        R                  " UR                  U R                   SS9U l        g )N)biasF)r   r   r   r8   num_key_value_groupsr   Linearr   rA   rB   q_projo_proj	gate_projg_proj)rY   r   r   r   r   s       r\   r   LagunaAttention.__init__  s    "+$(NNf6P6P$P!ii 2 2DNNT]]4RY_YnYnoii >@R@RY_YnYno Nii 2 2DNNOr^   Nr   position_embeddingsattention_maskpast_key_valuesrZ   r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      nU R	                  U5      R                  U5      n	U R                  U5      R                  U5      n
U R                  U5      R                  SS5      nU R                  U	5      R                  SS5      n	U
R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       n[.        R0                  " U R3                  U5      R5                  5       5      R7                  UR8                  5      nUR                  " / UQSPU R                  P76 UR;                  S5      -  R                  " / UQSP76 nU R=                  U5      nX4$ )Nr   rN   r   rH   )dropoutscalingr?   )r   rA   r   r   k_projv_projq_norm	transposek_normr%   updater   r   get_interfacer   _attn_implementationr   trainingattention_dropoutr   r?   r   
contiguousr   softplusr   rv   r   r   	unsqueezer   )rY   r   r   r   r   rZ   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightsr   s                    r\   r   LagunaAttention.forward*  s    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D{{<0::1a@[[,66q!<
#--a3&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
! "));;;;FFHzz$++m4::<=@@ARARS"''HHbH$--H4>>Z\K]]ccuepurtukk+.((r^   )r   r   r   r   r   r`   )rj   rk   rl   rm   rn   r*   rq   r   r   r   r   r   r   r   r   r~   r   r   s   @r\   r   r     s    ]P| P P P& )-.)||.) #5<<#=>.) t+	.)
 .) -..) 
u||U\\D00	1.) .)r^   r   c                   &    \ rS rSrS\S\4S jrSrg)LagunaDecoderLayeri[  r   r   c                    [         R                  R                  U 5        UR                  U l        [	        XUR
                  U   5      U l        UR                  U   S:X  a  [        U5      U l	        O[        XR                  S9U l	        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )NrM   r   )eps)r   Moduler   r   r   rC   	self_attnrD   r   mlpr   r3   r   rms_norm_epsinput_layernormpost_attention_layernorm)rY   r   r   s      r\   r   LagunaDecoderLayer.__init__\  s    
		4 !--(F<`<`aj<kl!!),8+F3DH ;S;STDH,V-?-?VEXEXY(5f6H6HfNaNa(b%r^   )r   r  r  r  r  N)rj   rk   rl   rm   r*   rq   r   r~   ra   r^   r\   r	  r	  [  s    	c| 	c 	cr^   r	  c                   P   ^  \ rS rSr\R
                  " 5       U 4S j5       rSrU =r$ )LagunaPreTrainedModelih  c                 $  > [         TU ]  U5        [        U[        5      (       a4  [        R
                  R                  R                  UR                  5        g [        U[        5      (       a  UR                   H  nUR                  nUR                  U   S:w  a  [        UR                  U      nU" UR                  US9u  pE[        R                  " [!        X S35      U5        [        R                  " [!        X S35      U5        M     g g )NrO   )r   	_inv_freq_original_inv_freq)r   _init_weights
isinstancer   r   r   initzeros_r   r   rV   r   rP   r   r   copy_r   )rY   moduler   rope_init_fncurr_inv_freqr   r   s         r\   r  #LagunaPreTrainedModel._init_weightsi  s    f%f.//HHMM  !?!?@ 566$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 7r^   ra   )	rj   rk   rl   rm   r   no_gradr  r~   r   r   s   @r\   r  r  h  s    
]]_^ ^r^   r  c                       \ rS rSr      SS\R
                  S-  S\R                  S-  S\R
                  S-  S\S-  S\R                  S-  S\	S-  S	\
\   S
\4S jjrSrg)LagunaModelix  N	input_idsr   position_idsr   inputs_embeds	use_cacherZ   r   c           	        ^ US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       dR  U R                  UUUUS.mU4S jU4S jS	.n
0 n	[        U R                  R                  5       H  nX   " 5       X'   M     Un0 n[        U R                  R                  5       H  nU R                  XU5      X'   M     [        U R                   S U R                  R"                   5       HE  u  pU" U4XR                  R                  U      XR                  R                  U      UUS
.UD6nMG     U R%                  U5      n['        UU(       a  US9$ S S9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   rN   )r   )r   r&  r   r   r%  c                     > [        S0 T D6$ Nra   )r   mask_kwargss   r\   <lambda>%LagunaModel.forward.<locals>.<lambda>  s    *<*K{*Kr^   c                     > [        S0 T D6$ r*  )r   r+  s   r\   r-  r.    s    -N-]Q\-]r^   rS   )r   r   r%  r   )last_hidden_stater   )rg   embed_tokensr   r   get_seq_lengthr   r   r   r   r   r  dictsetrV   
rotary_emb	enumeratelayersr5   normr   )rY   r$  r   r%  r   r&  r'  rZ   past_seen_tokenscausal_mask_mappingmask_creation_functionsr   r   r   idecoder_layerr,  s                   @r\   r   LagunaModel.forwardy  s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-FF++!."0#2 ,K #L%]'# #%!$++"9"9:
2I2U2W#/ ; & dkk556J.2oom[e.f+ 7 !*$++6U8U8U*V WA)2;;3J3J13MN$78O8OPQ8R$S) / M !X 		-0%+/8O
 	
>B
 	
r^   ra   )NNNNNN)rj   rk   rl   rm   r   
LongTensorr   r   FloatTensorrs   r   r   r   r   r~   ra   r^   r\   r#  r#  x  s     .2.204(,26!%<
##d*<
 t+<
 &&-	<

 <
 ((4/<
 $;<
 +,<
 
 <
 <
r^   r#  c                   (   ^  \ rS rSrU 4S jrSrU =r$ )LagunaForCausalLMi  c                 $   > [         TU ]  " S0 UD6$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
ra   )r   r   )rY   super_kwargsr   s     r\   r   LagunaForCausalLM.forward  s     w...r^   ra   )rj   rk   rl   rm   r   r~   r   r   s   @r\   rB  rB    s    / /r^   rB  )r*   rB  r#  r  )Orn   collections.abcr   typingr   r   r   r   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr    r
   r  cache_utilsr   r   configuration_utilsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   r   afmoe.modeling_afmoer   gemma3.modeling_gemma3r   $glm4_moe_lite.modeling_glm4_moe_liter   llama.modeling_llamar   r   !qwen2_moe.configuration_qwen2_moer   qwen2_moe.modeling_qwen2_moer    r!   r"   r#    qwen3_5_moe.modeling_qwen3_5_moer$   r%   qwen3_moe.modeling_qwen3_moer&   r'   
get_loggerrj   loggerr*   r   r   r   r   r   r   r   r	  r  r#  rB  __all__ra   r^   r\   <module>ra     sF    $ ) )    .  & . 3 R B 6 6 5 & , E 1 : J F > u u Y R 
		H	% 12}> }  3}@	O 	'*1 '*T	 	@+ @6	O 	1 * ?)n ?) ?)D
c0 
c^3 ^ =
* =
@/+ /r^   