
    3j                        S SK Jr  S SKrS SKJr  S SKJs  Jr  SSKJ	r	  SSK
Jr  SSKJr  SSKJr  SSKJr  SS	KJr  S
SKJrJrJrJrJrJrJrJrJr  SSKJr  \R@                  " \!5      r" " S S\RF                  5      r$ " S S\5      r% " S S\5      r&\" S5      S!S j5       r' " S S\5      r( " S S\5      r) " S S\5      r* " S S\5      r+ " S S\5      r,/ S Qr-g)"    )CallableN   )Cache)use_kernel_func_from_hub)dynamic_rope_update)ALL_ATTENTION_FUNCTIONS)logging)maybe_autocast   )	LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForSequenceClassificationLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
OlmoLayerNorm1   z/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2   > [         TU ]  5         U4U l        g N)super__init__normalized_shape)selfr   	__class__s     _/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/olmo/modular_olmo.pyr   OlmoLayerNorm.__init__4   s    !,    hidden_statesc                     UR                   n[        R                  " UR                  [        R
                  S9U R                  S S SS9R                  U5      $ )N)dtypegh㈵>)eps)r(   F
layer_normtotorchfloat32r    )r!   r&   
orig_dtypes      r#   forwardOlmoLayerNorm.forward8   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r%   )r    )__name__
__module____qualname____firstlineno____doc__intr   r-   Tensorr0   __static_attributes____classcell__r"   s   @r#   r   r   1   s9    9/C /D /
U\\ 
ell 
 
r%   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )OlmoMLP?   c                 >  > [         TU ]  U5        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R
                  U R                  SS9U l        g )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r!   configr"   s     r#   r   OlmoMLP.__init__@   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr%   )rF   rD   rE   )r2   r3   r4   r5   r   r9   r:   r;   s   @r#   r=   r=   ?   s    Y Yr%   r=   c                   L    \ rS rSr\R
                  " 5       \S 5       5       rSrg)OlmoRotaryEmbeddingI   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        X4$ ! , (       d  f       WW	4$ = f)
Nr   r   mpscpuF)device_typeenabledr   )dim)inv_freqfloatexpandshaper,   device
isinstancetypestrr
   	transposer-   catcosattention_scalingsin)
r!   xposition_idsinv_freq_expandedposition_ids_expandedrP   freqsembr]   r_   s
             r#   r0   OlmoRotaryEmbedding.forwardJ   s7    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D
 x DC
 Cxs   BE&&
E7 N)	r2   r3   r4   r5   r-   no_gradr   r0   r9   rg   r%   r#   rJ   rJ   I   s"    
]]_
  
r%   rJ   rotary_pos_embc                    U R                   UR                   peUR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nUR                  U5      UR                  U5      4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r(   	unsqueezer   r,   )	qkr]   r_   unsqueeze_dimq_typek_typeq_embedk_embeds	            r#   apply_rotary_pos_embrs   Y   sv    & WWaggF
--
&C
--
&Cw;q>C/0Gw;q>C/0G::fwzz&111r%   c                       \ rS rSr S
S\R
                  S\\R
                  \R
                  4   S\R
                  S-  S\S-  S\\R
                  \R
                  S-  4   4
S jjrS	r	g)OlmoAttentiont   Nr&   position_embeddingsattention_maskpast_key_valuesr   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      nU R                  U5      n	U R	                  U5      n
U R
                  R                  b  UR                  U R
                  R                  * U R
                  R                  S9  U	R                  U R
                  R                  * U R
                  R                  S9  U
R                  U R
                  R                  * U R
                  R                  S9  UR                  U5      R                  SS5      nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R
                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )NrM   )minmaxr   r   g        )dropoutscaling)rV   head_dimq_projk_projv_projrG   clip_qkvclamp_viewr[   rs   update	layer_idxr   get_interface_attn_implementationr   trainingattention_dropoutr~   reshape
contiguouso_proj)r!   r&   rw   rx   ry   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesr]   r_   attention_interfaceattn_outputattn_weightss                   r#   r0   OlmoAttention.forwardu   s    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r%   rg   r   )
r2   r3   r4   r5   r-   r8   tupler   r0   r9   rg   r%   r#   ru   ru   t   s{     )-/)||/) #5<<#=>/) t+	/)
 /) 
u||U\\D00	1/) /)r%   ru   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )OlmoDecoderLayer   rG   r   c                    > [         TU ]  X5        [        UR                  5      U l        [        UR                  5      U l        [        XS9U l        g )N)rG   r   )r   r   r   r   input_layernormpost_attention_layernormru   	self_attnr!   rG   r   r"   s      r#   r   OlmoDecoderLayer.__init__   sB    +,V-?-?@(5f6H6H(I%&fJr%   )r   r   r   )	r2   r3   r4   r5   r   r7   r   r9   r:   r;   s   @r#   r   r      s    Kz Kc K Kr%   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	OlmoModel   rG   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  5      U l
        g s  snf r   )r   r   rA   
ModuleListrangenum_hidden_layersr   layersr   r   normr   s      r#   r   OlmoModel.__init__   s_     mmBGH`H`BabBaYf0Bab
 "&"4"45	 cs   A4)r   r   )r2   r3   r4   r5   r   r   r9   r:   r;   s   @r#   r   r      s    6z 6 6r%   r   c                       \ rS rSrSrg)OlmoForCausalLM   rg   Nr2   r3   r4   r5   r9   rg   r%   r#   r   r          r%   r   c                       \ rS rSrSrg)OlmoForSequenceClassification   rg   Nr   rg   r%   r#   r   r      r   r%   r   )r   r   r   OlmoPreTrainedModel)r   ).collections.abcr   r-   torch.nnrA   torch.nn.functional
functionalr*   cache_utilsr   integrationsr   modeling_rope_utilsr   modeling_utilsr   utilsr	   utils.genericr
   llama.modeling_llamar   r   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr2   loggerModuler   r=   rJ   rs   ru   r   r   r   r   __all__rg   r%   r#   <module>r      s   ( %       4 6 5  +
 
 
 + 
		H	%
BII 
Yh Y.   *+2 ,240)N 0)fK( K6
 6	& 		$B 	r%   