
"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )OptionalUnionN)version   )is_torch_flex_attn_availablelogging)_torch_versionis_torch_less_or_equalis_torchdynamo_compiling)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskflex_attentionc                   |   ^  \ rS rSrSrSrSrSrU 4S jr\	R                  R                  SS9S 5       rS rS	rU =r$ )
WrappedFlexAttention.   z`
We are doing a singleton class so that flex attention is compiled once when it's first called.
NFc                 ^   > U R                   c  [        TU ]	  U 5      U l         U R                   $ N)	_instancesuper__new__)clsargskwargs	__class__s      `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/integrations/flex_attention.pyr   WrappedFlexAttention.__new__7   s'    == !GOC0CM}}    )	recursivec                    U R                   (       a  XR                  :w  a  Xl        [        S5      (       a  [        R                  " [
        SS9U l        On[        R                  " [        5      R                  S:X  a'  U(       a   [        R                  " [
        SSS9U l        O[        R                  " [
        5      U l        SU l         gg)	z.
Initialize or update the singleton instance.
2.5.1F)dynamicz2.6.0zmax-autotune-no-cudagraphs)r"   modeTN)_is_flex_compiledtrainingr
   torchcompiler   _compiled_flex_attentionr   parser	   base_version)selfr%   s     r   __init__WrappedFlexAttention.__init__=   s    
 %%]])B$M%g..05nV[0\- ~.;;wF805"E8T1-
 16n0M-%)D" *Cr   c                     U R                   $ r   )r(   )r+   s    r   __call__WrappedFlexAttention.__call__S   s    ,,,r   )r(   r$   r%   )__name__
__module____qualname____firstlineno____doc__r   r$   r(   r   r&   compilerdisabler,   r/   __static_attributes____classcell__)r   s   @r   r   r   .   sP     I# ^^e,* -**- -r   r   querykeyvaluereturnc                 b    [        5       (       d  [        U5      " 5       O[        nU" U UU40 UD6$ r   )r   r   r   )r:   r;   r<   r%   r   flex_attention_compileds         r   compile_friendly_flex_attentionr@   W   s@     G_F`F`28<>ft" 	 r   attention_mask_2dattention_chunk_sizeoffsets	is_causalr   c                 h  ^ ^^^^^^ T R                   u  pgU(       d  UnU(       d  UnU[        -  S-   [        -  n[        R                  R                  R                  T SSX-
  4S9m T R                  n	T R                  5       mUb4  TR                  5       R                  S5      R                  S5      S-
  U-  mU U4S jmUU4S jn
U U4S jnU(       d  UmOUc  TOU
mUb1  US   R                  U	5      mUS   R                  U	5      mUUU4S	 jnOTn[        UUSUUU	[        S
5      (       + S9$ )a  
IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
and will be removed in a future version without warnings. New code should not use it. It is only kept here
for BC for now, while models using it are being patched accordingly.

Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
Create the block (causal) mask logic and pass it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
The resultant BlockMask is a compressed representation of the full (causal) block
mask. BlockMask is essential for performant computation of flex attention.
See: https://pytorch.org/blog/flexattention/

Args:
    attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
    of shape (batch_size, total_seq_len). e.g.

    For unpacked sequence:
    [[1, 1, 1, 1, 0, 0, 0],
     [1, 1, 1, 1, 1, 0, 0]]

    For packed sequence:
    [[1, 1, 1, 2, 2, 2, 0],
     [1, 1, 2, 2, 2, 3, 3]]

Returns:
    BlockMask
    """
    batch_size, total_seq_len = attention_mask_2d.shape
    if not key_length:
        key_length = total_seq_len
    if not query_length:
        query_length = key_length
    # Pad the 2d mask up to the next multiple of the default sparse block size, so that the
    # mask_mod functions below can index it safely while the block mask is being built
    pad_len = ((key_length // flex_default_block_size) + 1) * flex_default_block_size
    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=(0, pad_len - key_length))
    device = attention_mask_2d.device
    document_ids = attention_mask_2d.clone()

    chunk_idxs = None
    if attention_chunk_size is not None:
        # fill with ones, cumsum and integer-divide by the chunk size to get chunk indices
        # such as [0, 0, 0, 1, 1, 1, 2, 2, 2, ...]
        chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // attention_chunk_size

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
        final_mask = causal_mask & padding_mask & document_mask
        return final_mask

    def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Combines the chunk mask with the causal mask for chunked attention.
        """
        chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
        causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
        return chunk_mask & causal_doc_mask

    def default_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        """
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        # padding is checked at the kv index here, as needed for encoder-style attention
        padding_mask = attention_mask_2d[batch_idx, kv_idx] > 0
        final_mask = padding_mask & document_mask
        return final_mask

    if not is_causal:
        mask_mod_maybe_combined = default_mask_mod
    else:
        mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod

    if offsets is not None:
        q_offset = offsets[0].to(device)
        kv_offset = offsets[1].to(device)

        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            offset_q = q_idx + q_offset
            offset_kv = kv_idx + kv_offset
            return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
    else:
        mask_mod = mask_mod_maybe_combined

    return create_block_mask(
        mask_mod=mask_mod,
        B=batch_size,
        H=None,  # broadcast the mask over all attention heads
        Q_LEN=query_length,
        KV_LEN=key_length,
        device=device,
        _compile=not is_torch_less_or_equal("2.5.1"),
    )


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    head_mask: Optional[torch.Tensor] = None,
    s_aux: Optional[torch.Tensor] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if head_mask is not None:
        logger.warning_once(
            "`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."
        )

    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError(
            "`flex_attention` does not support `dropout`. Please use it with inference"
            " only (`model.eval()`) or turn off the attention dropout in the respective config."
        )

    block_mask = None
    score_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        score_mask = attention_mask

    if score_mask is not None:
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        if head_mask is not None:
            score = score + head_mask[batch_idx][head_idx][0][0]
        if s_aux is not None:
            # fold the auxiliary "attention sink" logits into the normalization of the scores
            logits_max = torch.max(score, dim=-1, keepdim=True).values
            sinks = torch.exp(s_aux - logits_max)
            unnormalized_scores = torch.exp(score - logits_max)
            normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks
            score = unnormalized_scores / normalizer
        return score

    enable_gqa = True
    num_local_query_heads = query.shape[1]

    # When the number of local query heads is not a power of two (e.g. with tensor parallel shards),
    # flex attention's native GQA path is not usable, so repeat the kv heads explicitly instead
    if not ((num_local_query_heads & (num_local_query_heads - 1)) == 0):
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    kernel_options = kwargs.get("kernel_options", None)
    attn_output, attention_weights = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kernel_options,
        # the log-sumexp is always requested and surfaced in place of attention weights
        return_lse=True,
        training=module.training,
    )
    # lse comes back in float32
    attention_weights = attention_weights.to(value.dtype)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attention_weights