
    <ht                     ~   S SK JrJr  S SKrS SKJr  S SKJs  Jr  S SK	rSSK
Jr  SSKJr  SSKJr  SSKJrJrJrJrJrJrJrJr  S	S
KJr  \R6                  " \5      r " S S\R<                  5      r " S S\5      r SS jr! " S S\5      r" " S S\5      r# " S S\5      r$ " S S\5      r% " S S\5      r&/ SQr'g)    )CallableOptionalN   )Cache)ALL_ATTENTION_FUNCTIONS)logging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
OlmoLayerNorm   z/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2   > [         TU ]  5         U4U l        g N)super__init__normalized_shape)selfr   	__class__s     ]/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/olmo/modular_olmo.pyr   OlmoLayerNorm.__init__   s    !,    hidden_statesc                     UR                   n[        R                  " UR                  [        R
                  S9U R                  S S SS9R                  U5      $ )N)dtypegh㈵>)eps)r%   F
layer_normtotorchfloat32r   )r   r#   
orig_dtypes      r    forwardOlmoLayerNorm.forward"   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r"   )r   )__name__
__module____qualname____firstlineno____doc__intr   r*   Tensorr-   __static_attributes____classcell__r   s   @r    r   r      s9    9/C /D /
U\\ 
ell 
 
r"   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )OlmoMLP)   c                 >  > [         TU ]  U5        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R
                  U R                  SS9U l        g )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r   configr   s     r    r   OlmoMLP.__init__*   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr"   )rC   rA   rB   )r/   r0   r1   r2   r   r6   r7   r8   s   @r    r:   r:   )   s    Y Yr"   r:   c                    U R                   UR                   pvUR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   n	UR                  U5      U	R                  U5      4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r%   	unsqueezer   r)   )
qkcossinposition_idsunsqueeze_dimq_typek_typeq_embedk_embeds
             r    apply_rotary_pos_embrR   1   sv    ( WWaggF
--
&C
--
&Cw;q>C/0Gw;q>C/0G::fwzz&111r"   c                      \ rS rSr  SS\R
                  S\\R
                  \R
                  4   S\\R
                     S\\   S\\R                     S\\R
                  \\R
                     \\\R
                        4   4S	 jjr
S
rg)OlmoAttentionM   Nr#   position_embeddingsattention_maskpast_key_valuecache_positionr   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  b  U	R                  U R
                  R                  * U R
                  R                  S9  U
R                  U R
                  R                  * U R
                  R                  S9  UR                  U R
                  R                  * U R
                  R                  S9  U	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R
                  R                  S:w  a  [        U R
                  R                     nU" U U	U
UU4U R                   (       d  SOU R"                  U R$                  S.UD6u  nnUR&                  " / UQSP76 R)                  5       nU R+                  U5      nUU4$ )	N)minmaxr   r	   )rK   rJ   rY   eagerg        )dropoutscaling)shapehead_dimq_projk_projv_projrD   clip_qkvclamp_view	transposerR   update	layer_idxr   _attn_implementationr   trainingattention_dropoutr`   reshape
contiguouso_proj)r   r#   rV   rW   rX   rY   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrJ   rK   cache_kwargsattention_interfaceattn_outputattn_weightss                     r    r-   OlmoAttention.forwardN   s1    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r"    )NN)r/   r0   r1   r2   r*   r5   tupler   r   
LongTensorr-   r6   r}   r"   r    rT   rT   M   s     +/592)||2) #5<<#=>2) !.	2)
 !2) !!1!122) 
u||Xell3XeELL>Q5RR	S2) 2)r"   rT   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )OlmoDecoderLayer   rD   rk   c                    > [         TU ]  X5        [        UR                  5      U l        [        UR                  5      U l        [        XS9U l        g )N)rD   rk   )r   r   r   r   input_layernormpost_attention_layernormrT   	self_attnr   rD   rk   r   s      r    r   OlmoDecoderLayer.__init__   sB    +,V-?-?@(5f6H6H(I%&fJr"   )r   r   r   )	r/   r0   r1   r2   r   r4   r   r6   r7   r8   s   @r    r   r      s    Kz Kc K Kr"   r   c                       \ rS rSrS rSrg)OlmoRotaryEmbedding   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	X4sS S S 5        $ ! , (       d  f       g = f)
Nr   r[   r   mpscpuF)device_typeenabledr	   )dim)inv_freqfloatexpandra   r)   device
isinstancetypestrr*   autocastri   catrJ   attention_scalingrK   )
r   xrL   inv_freq_expandedposition_ids_expandedr   freqsembrJ   rK   s
             r    r-   OlmoRotaryEmbedding.forward   s'    MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C8 DCCs   $BE22
F r}   N)r/   r0   r1   r2   r-   r6   r}   r"   r    r   r      s    
r"   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	OlmoModel   rD   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        UR                  5      U l
        g s  snf r   )r   r   r>   
ModuleListrangenum_hidden_layersr   layersr   r   normr   s      r    r   OlmoModel.__init__   s_     mmBGH`H`BabBaYf0Bab
 "&"4"45	 cs   A3)r   r   )r/   r0   r1   r2   r   r   r6   r7   r8   s   @r    r   r      s    6z 6 6r"   r   c                       \ rS rSrSrg)OlmoForCausalLM   r}   N)r/   r0   r1   r2   r6   r}   r"   r    r   r      s    r"   r   )r   r   OlmoPreTrainedModel)Nr   )(typingr   r   r*   torch.nnr>   torch.nn.functional
functionalr'   torch.utils.checkpointcache_utilsr   modeling_utilsr   utilsr   llama.modeling_llamar
   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr/   loggerModuler   r:   rR   rT   r   r   r   r   __all__r}   r"   r    <module>r      s    %        5 	 	 	 + 
		H	%
BII 
Yh Y283)N 3)lK( K. 6
 6	& 	r"   