
    3j                        S SK r S SKrS SKJr  SSKJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJrJrJrJr  S	S
KJr  \R,                  " \5      r " S S\R2                  5      r " S S\5      r " S S\5      rS rS"S jr\ " S S\5      5       r " S S\5      r  " S S\5      r! " S S\!\5      r" " S S\	5      r# " S S\
5      r$ " S S \5      r%/ S!Qr&g)#    N   )logging)no_inherit_decorator   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttention)LlamaDecoderLayerLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding   )HeliumConfigc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )HeliumRMSNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      c/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/helium/modular_helium.pyr   HeliumRMSNorm.__init__    s-    ll5::k#:; #    c                 V   UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  R                  [        R                  5      U-  R                  U5      $ )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr   r   )r   hidden_statesinput_dtypevariances       r#   forwardHeliumRMSNorm.forward%   s    #))%((7 $$Q',,R,>%H?T?T4T(UUu}}-=AA+NNr%   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   shaper   )r   s    r#   
extra_reprHeliumRMSNorm.extra_repr,   s*    ))*+6$2G2G1HIIr%   )r   r   )gư>)	__name__
__module____qualname____firstlineno__r   r2   r7   __static_attributes____classcell__r"   s   @r#   r   r      s    $
OJ Jr%   r   c                       \ rS rSrSrg)HeliumRotaryEmbedding0    Nr9   r:   r;   r<   r=   rC   r%   r#   rA   rA   0       r%   rA   c                       \ rS rSrSrg)	HeliumMLP4   rC   NrD   rC   r%   r#   rG   rG   4   rE   r%   rG   c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	z*Rotates half the hidden dims of the input..r   Nr   r   r'   dim)r   stackflatten)xx1x2s      r#   rotate_halfrR   8   sJ    	
319B	
319B;;Ryb)11"55r%   c                 4   UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.Nr'   r   rJ   )	unsqueezer6   repeat_interleaverR   )qkcossinunsqueeze_dimq_embedk_embeds          r#   apply_rotary_pos_embr]   ?   s    $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
ECw;q>C/0Gw;q>C/0Gr%   c                   >   ^  \ rS rSrSS\S\S-  4U 4S jjjrSrU =r$ )HeliumAttention^   Nconfig	layer_idxc                    > [         TU ]  X5        [        R                  " UR                  UR                  SS9U l        S[        R                  " U R                  5      -  U l	        g )NF)biasr   )
r   r   r   Linearr    o_projmathsqrthead_dimscalingr   ra   rb   r"   s      r#   r   HeliumAttention.__init__`   sI    +ii 2 2F4F4FUS499T]]33r%   )rf   rj   r   	r9   r:   r;   r<   r   intr   r=   r>   r?   s   @r#   r_   r_   ^   s    4| 4d
 4 4r%   r_   c                   >   ^  \ rS rSrSS\S\S-  4U 4S jjjrSrU =r$ )HeliumDecoderLayerf   Nra   rb   c                    > [         TU ]  X5        [        U5      U l        [	        UR
                  UR                  S9U l        [	        UR
                  UR                  S9U l        g )Nr!   )	r   r   rG   mlpr   r    rms_norm_epsinput_layernormpost_attention_layernormrk   s      r#   r   HeliumDecoderLayer.__init__g   sR    +V$,V-?-?VEXEXY(5f6H6HfNaNa(b%r%   )rv   rt   rw   r   rm   r?   s   @r#   rp   rp   f   s#    c| cd
 c cr%   rp   c                       \ rS rSrSrg)HeliumPreTrainedModelo   rC   NrD   rC   r%   r#   rz   rz   o   rE   r%   rz   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )HeliumModels   ra   c           	      2  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        SU l        U R                  5         g s  snf )Nrs   F)r   r   r   
ModuleListrangenum_hidden_layersrp   layersr   r    ru   normgradient_checkpointing	post_initrk   s      r#   r   HeliumModel.__init__t   s{     mmDI&JbJbDcdDcy2Dcd
 "&"4"4&:M:MN	&+# 	 es   B)r   r   r   )r9   r:   r;   r<   r   r   r=   r>   r?   s   @r#   r}   r}   s   s    	| 	 	r%   r}   c                       \ rS rSrSrg)HeliumForCausalLM   rC   NrD   rC   r%   r#   r   r      rE   r%   r   c                       \ rS rSrSrg)HeliumForSequenceClassification   rC   NrD   rC   r%   r#   r   r      rE   r%   r   c                       \ rS rSrSrg)HeliumForTokenClassification   rC   NrD   rC   r%   r#   r   r      rE   r%   r   )rz   r}   r   r   r   )r   )'rg   r   torch.nnr   utilsr   utils.genericr   gemma.modeling_gemmar   r   r	   granite.modeling_graniter
   llama.modeling_llamar   r   r   r   r   configuration_heliumr   
get_loggerr9   loggerModuler   rA   rG   rR   r]   r_   rp   rz   r}   r   r   r   __all__rC   r%   r#   <module>r      s        1 p p 7 v v . 
		H	%JBII J"	0 		 	6> 4& 4 4c* c	0 	
' 
	( 		&D 		#> 	r%   