
"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )OptionalUnionN)version   )is_torch_flex_attn_availablelogging)_torch_versionis_torch_less_or_equalis_torchdynamo_compiling)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskflex_attentionc                   |   ^  \ rS rSrSrSrSrSrU 4S jr\	R                  R                  SS9S 5       rS rS	rU =r$ )
WrappedFlexAttention.   z`
We are doing a singleton class so that flex attention is compiled once when it's first called.
NFc                 ^   > U R                   c  [        TU ]	  U 5      U l         U R                   $ N)	_instancesuper__new__)clsargskwargs	__class__s      `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/integrations/flex_attention.pyr   WrappedFlexAttention.__new__7   s'    == !GOC0CM}}    )	recursivec                    U R                   (       a  XR                  :w  a  Xl        [        S5      (       a  [        R                  " [
        SS9U l        On[        R                  " [        5      R                  S:X  a'  U(       a   [        R                  " [
        SSS9U l        O[        R                  " [
        5      U l        SU l         gg)	z.
Initialize or update the singleton instance.
2.5.1F)dynamicz2.6.0zmax-autotune-no-cudagraphs)r"   modeTN)_is_flex_compiledtrainingr
   torchcompiler   _compiled_flex_attentionr   parser	   base_version)selfr%   s     r   __init__WrappedFlexAttention.__init__=   s    
 %%]])B$M%g..05nV[0\- ~.;;wF805"E8T1-
 16n0M-%)D" *Cr   c                     U R                   $ r   )r(   )r+   s    r   __call__WrappedFlexAttention.__call__S   s    ,,,r   )r(   r$   r%   )__name__
__module____qualname____firstlineno____doc__r   r$   r(   r   r&   compilerdisabler,   r/   __static_attributes____classcell__)r   s   @r   r   r   .   sP     I# ^^e,* -**- -r   r   querykeyvaluereturnc                 b    [        5       (       d  [        U5      " 5       O[        nU" U UU40 UD6$ r   )r   r   r   )r:   r;   r<   r%   r   flex_attention_compileds         r   compile_friendly_flex_attentionr@   W   s@     G_F`F`28<>ft" 	 r   attention_mask_2dattention_chunk_sizeoffsets	is_causalr   c                 h  ^ ^^^^^^ T R                   u  pgU(       d  UnU(       d  UnU[        -  S-   [        -  n[        R                  R                  R                  T SSX-
  4S9m T R                  n	T R                  5       mUb4  TR                  5       R                  S5      R                  S5      S-
  U-  mU U4S jmUU4S jn
U U4S jnU(       d  UmOUc  TOU
mUb1  US   R                  U	5      mUS   R                  U	5      mUUU4S	 jnOTn[        UUSUUU	[        S
5      (       + S9$ )a  
IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
and will be removed in a future version without warnings. New code should not use it. It is only kept here
for BC for now, while models using it are being patched accordingly.

Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
Create the block (causal) mask logic and pass it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
The resultant BlockMask is a compressed representation of the full (causal) block
mask. BlockMask is essential for performant computation of flex attention.
See: https://pytorch.org/blog/flexattention/

Args:
    attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
    of shape (batch_size, total_seq_len). e.g.

    For unpacked sequence:
    [[1, 1, 1, 1, 0, 0, 0],
     [1, 1, 1, 1, 1, 0, 0]]

    For packed sequence:
    [[1, 1, 1, 2, 2, 2, 0],
     [1, 1, 2, 2, 2, 3, 3]]

Returns:
    BlockMask
    """
    batch_size, total_seq_len = attention_mask_2d.shape
    if not key_length:
        key_length = total_seq_len
    if not query_length:
        query_length = key_length
    # Pad the 2d mask up to the next multiple of the default sparse block size, so that the
    # mask_mod functions below can index it safely while the block mask is being built
    pad_len = ((key_length // flex_default_block_size) + 1) * flex_default_block_size
    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=(0, pad_len - key_length))
    device = attention_mask_2d.device
    document_ids = attention_mask_2d.clone()

    chunk_idxs = None
    if attention_chunk_size is not None:
        # fill with ones, cumsum and integer-divide by the chunk size to get chunk indices
        # such as [0, 0, 0, 1, 1, 1, 2, 2, 2, ...]
        chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // attention_chunk_size

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
        final_mask = causal_mask & padding_mask & document_mask
        return final_mask

    def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Combines the chunk mask with the causal mask for chunked attention.
        """
        chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
        causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
        return chunk_mask & causal_doc_mask

    def default_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        """
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        # padding is checked at the kv index here, as needed for encoder-style attention
        padding_mask = attention_mask_2d[batch_idx, kv_idx] > 0
        final_mask = padding_mask & document_mask
        return final_mask

    if not is_causal:
        mask_mod_maybe_combined = default_mask_mod
    else:
        mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod

    if offsets is not None:
        q_offset = offsets[0].to(device)
        kv_offset = offsets[1].to(device)

        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            offset_q = q_idx + q_offset
            offset_kv = kv_idx + kv_offset
            return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
    else:
        mask_mod = mask_mod_maybe_combined

    return create_block_mask(
        mask_mod=mask_mod,
        B=batch_size,
        H=None,  # broadcast the mask over all attention heads
        Q_LEN=query_length,
        KV_LEN=key_length,
        device=device,
        _compile=not is_torch_less_or_equal("2.5.1"),
    )


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    head_mask: Optional[torch.Tensor] = None,
    s_aux: Optional[torch.Tensor] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if head_mask is not None:
        logger.warning_once(
            "`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."
        )

    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError(
            "`flex_attention` does not support `dropout`. Please use it with inference"
            " only (`model.eval()`) or turn off the attention dropout in the respective config."
        )

    block_mask = None
    score_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        score_mask = attention_mask

    if score_mask is not None:
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        if head_mask is not None:
            score = score + head_mask[batch_idx][head_idx][0][0]
        if s_aux is not None:
            # fold the auxiliary "attention sink" logits into the normalization of the scores
            logits_max = torch.max(score, dim=-1, keepdim=True).values
            sinks = torch.exp(s_aux - logits_max)
            unnormalized_scores = torch.exp(score - logits_max)
            normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks
            score = unnormalized_scores / normalizer
        return score

    enable_gqa = True
    num_local_query_heads = query.shape[1]

    # When the number of local query heads is not a power of two (e.g. with tensor parallel shards),
    # flex attention's native GQA path is not usable, so repeat the kv heads explicitly instead
    if not ((num_local_query_heads & (num_local_query_heads - 1)) == 0):
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    kernel_options = kwargs.get("kernel_options", None)
    attn_output, attention_weights = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kernel_options,
        # the log-sumexp is always requested and surfaced in place of attention weights
        return_lse=True,
        training=module.training,
    )
    # lse comes back in float32
    attention_weights = attention_weights.to(value.dtype)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attention_weights