
import inspect
import os
import warnings
from functools import partial
from typing import Optional, TypedDict

import torch
import torch.nn.functional as F

from .utils import (
    is_flash_attn_2_available,
    is_flash_attn_3_available,
    is_flash_attn_greater_or_equal_2_10,
    is_torch_npu_available,
    logging,
)


logger = logging.get_logger(__name__)


def flash_attn_supports_top_left_mask():
    # FA3 and recent FA2 releases use a bottom-right aligned causal mask; older FA2 and the NPU path may not.
    if is_flash_attn_3_available():
        return False
    if is_flash_attn_2_available():
        return not is_flash_attn_greater_or_equal_2_10()

    from .integrations.npu_flash_attention import is_npu_fa2_top_left_aligned_causal_mask

    return is_npu_fa2_top_left_aligned_causal_mask()


def is_flash_attn_available():
    return is_flash_attn_3_available() or is_flash_attn_2_available() or is_torch_npu_available()


# Cached flash attention callables, filled in lazily on first use (see `lazy_import_flash_attention`).
_flash_fn = None
_flash_varlen_fn = None
_pad_fn = None
_unpad_fn = None
_process_flash_kwargs_fn = None

# Maps the generic HF kwarg names onto the parameter names used by the flash attention kernels.
_hf_api_to_flash_mapping = {
    "dropout": "dropout_p",
    "sliding_window": "window_size",
}


def _lazy_imports(implementation: Optional[str]):
    """
Lazy loads the respective flash attention implementations.

Return:
    flash_attn_func: The base flash attention function.
    flash_attn_varlen_func: The flash attention function supporting variable sequence lengths,
                            e.g. for padding-free training.
    pad_input: The function to pad inputs into one sequence and returning the respective kwargs.
    unpad_input: The function to unpad outputs based on the kwargs (from pad_input).
    """
    is_fa2 = is_flash_attn_2_available()
    is_fa3 = is_flash_attn_3_available()

    if implementation == "flash_attention_2" or (implementation is None and is_fa2 and not is_fa3):
        from flash_attn import flash_attn_func, flash_attn_varlen_func
        from flash_attn.bert_padding import pad_input, unpad_input
    else:
        pad_input, unpad_input = _pad_input, _unpad_input
        if implementation == "flash_attention_3" or (implementation is None and is_fa3):
            from flash_attn_interface import flash_attn_func, flash_attn_varlen_func
        elif is_torch_npu_available():
            from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_func
            from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_varlen_func
        else:
            # Kernels loaded from the hub expose the same API; in that case `implementation` is the loaded module.
            flash_attn_func = getattr(implementation, "flash_attn_func", None)
            flash_attn_varlen_func = getattr(implementation, "flash_attn_varlen_func", None)
            if flash_attn_func is None or flash_attn_varlen_func is None:
                raise ValueError(
                    f"Could not find the currently requested flash attention implementation at `{implementation}`."
                    "Make sure that you request a valid kernel from the hub, e.g. `kernels-community/flash-attn`."
                )

    return flash_attn_func, flash_attn_varlen_func, pad_input, unpad_input


def _lazy_define_process_function(flash_function):
    """
    Depending on the version and kernel some features are not supported. Due to limitations in
    `torch.compile`, we opt to statically type which (optional) kwarg parameters are supported
    within `_process_flash_attention_kwargs`.

    NOTE: While all supported kwargs are marked as `True`, everything else is marked as `False`.
          This might be confusing for kwargs that we use in any case, e.g. `is_causal`.
    """
    flash_parameters = inspect.signature(flash_function).parameters
    process_parameters = inspect.signature(_process_flash_attention_kwargs).parameters

    supports_mapping = {}
    for param in process_parameters:
        fa_param = _hf_api_to_flash_mapping.get(param, param)
        supports_mapping[param] = fa_param in flash_parameters

    return partial(_process_flash_attention_kwargs, supports_mapping=supports_mapping)
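

# Illustrative sketch of the supports-mapping mechanism (hypothetical kernel, for illustration only):
# if the resolved varlen kernel only exposes `dropout_p` and `softcap`, the partial built above carries
# a mapping roughly like
#
#   {"dropout": True, "softcap": True, "sliding_window": False, "deterministic": False, ...}
#
# so that `_process_flash_attention_kwargs` (defined below) only forwards kwargs the kernel can accept:
#
#   process_fn = _lazy_define_process_function(some_varlen_func)  # `some_varlen_func` is hypothetical
#   flash_kwargs = process_fn(query_length=128, key_length=128, is_causal=True, sliding_window=64)
#   # -> contains "causal"/"softmax_scale"/"dropout_p", but no "window_size" entry for this kernel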


def lazy_import_flash_attention(implementation: Optional[str]):
    """
    Lazy loading flash attention and returning the respective functions + flags back

    NOTE: For fullgraph, this needs to be called before compile, while non-fullgraph mode can
          work without preloading. See `_check_and_adjust_attn_implementation` in `modeling_utils`.
    """
    global _flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn, _process_flash_kwargs_fn

    if any(k is None for k in [_flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn]):
        _flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn = _lazy_imports(implementation)
    if _process_flash_kwargs_fn is None:
        _process_flash_kwargs_fn = _lazy_define_process_function(_flash_varlen_fn)

    return (_flash_fn, _flash_varlen_fn, _pad_fn, _unpad_fn), _process_flash_kwargs_fn


def _index_first_axis(tensor, indices):
    """
    A local implementation of the PyTorch indexing operation `tensor[indices]` on the first axis,
    after flattening the first two dimensions of the tensor. This is functionally equivalent to
    FA2's `index_first_axis` and replaces the need to import it.
    """
    # Flatten (batch, seqlen, ...) -> (batch * seqlen, ...) and gather the requested rows.
    reshaped_tensor = tensor.reshape(-1, *tensor.shape[2:])
    return reshaped_tensor[indices]


def _unpad_input(hidden_states, attention_mask, unused_mask=None):
    """
unpad_input function for flash attention variants that do not have them within their pkg themselves, e.g. fa3.

Arguments:
    hidden_states: (batch, seqlen, ...)
    attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.

Return:
    hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
    indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
    cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
    max_seqlen_in_batch: int
    seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    """
    all_masks = (attention_mask + unused_mask) if unused_mask is not None else attention_mask
    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

    return (
        _index_first_axis(hidden_states, indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
        used_seqlens_in_batch,
    )


def _pad_input(hidden_states, indices, batch, seqlen):
    """
    pad_input function for flash attention variants that do not have them within their pkg themselves, e.g. fa3.

    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.

    Return:
        hidden_states: (batch, seqlen, ...)
    """
    dim = hidden_states.shape[1:]
    output = torch.zeros((batch * seqlen), *dim, device=hidden_states.device, dtype=hidden_states.dtype)
    output[indices] = hidden_states
    return output.view(batch, seqlen, *dim)


def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, int]:
    """
Retrieves indexing data required to repad unpadded (ragged) tensors.

Arguments:
    attention_mask (`torch.Tensor`):
        Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

Return:
    indices (`torch.Tensor`):
        The indices of non-masked tokens from the flattened input sequence.
    cu_seqlens (`torch.Tensor`):
        The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
    max_seqlen_in_batch (`int`):
        Maximum sequence length in batch.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def _upad_input(query_layer, key_layer, value_layer, attention_mask, query_length, unpad_input_func):
    """
Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches.
This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary
tensors for query, key, value tensors.

Arguments:
    query_layer (`torch.Tensor`):
        Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
    key_layer (`torch.Tensor`):
        Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    value_layer (`torch.Tensor`):
        Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    attention_mask (`torch.Tensor`):
        Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
    query_length (`int`):
        Target length.
    unpad_input_func:
        The function to use for unpadding the input tensors.

Return:
    query_layer (`torch.Tensor`):
        Query state without padding. Shape: (total_target_length, num_heads, head_dim).
    key_layer (`torch.Tensor`):
        Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    value_layer (`torch.Tensor`):
        Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    indices_q (`torch.Tensor`):
        The indices of non-masked tokens from the flattened input target sequence.
    (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
        The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
    (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
        Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

    # With static caches the key/value states can be longer than the mask: slice them to the mask length,
    # otherwise the extra (garbage) positions would silently corrupt the attention scores.
    if key_layer.shape[1] > (seq_len := attention_mask.shape[-1]):
        key_layer, value_layer = key_layer[:, :seq_len, :, :], value_layer[:, :seq_len, :, :]

    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    key_layer = _index_first_axis(key_layer, indices_k)
    value_layer = _index_first_axis(value_layer, indices_k)
    if query_length == kv_seq_len:
        query_layer = _index_first_axis(query_layer, indices_k)
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        max_seqlen_in_batch_q = 1
        # There is a memcpy here, which is very bad.
        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=query_layer.device)
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -query_length: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input_func(query_layer, attention_mask)

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )


def prepare_fa_kwargs_from_position_ids(position_ids, is_packed_sequence: bool = True):
    """
This function returns all the necessary kwargs to call `flash_attn_varlen_func`
extracted from position_ids. The `position_ids` can be either packed sequence or
the usual padded position ids, for example in inference time.

Arguments:
    position_ids (`torch.Tensor`):
        Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
    is_packed_sequence (`bool`, *optional*, defaults to `True`):
        Whether the input position ids are a packed sequence or not.

Return:
    (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
        The cumulative sequence lengths for the target (query) and source (key, value), used to index into
        ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
    (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
        Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query,
        `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    if not is_packed_sequence:
        # Padded (non-packed) position ids, e.g. the usual case at inference time.
        tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}

        last_position_ids = position_ids[:, -1]
        q_len = (
            torch.ones(position_ids.size(0), **tensor_kwargs)
            if position_ids.shape[-1] == 1
            else last_position_ids.add(1)
        )
        cu_seq_lens_q = torch.cat([torch.zeros(1, **tensor_kwargs), q_len.cumsum(0).to(torch.int32)], 0)
        cu_seq_lens_k = torch.cat(
            [torch.zeros(1, **tensor_kwargs), last_position_ids.add(1).cumsum(0).to(torch.int32)], 0
        )

        max_length_q = int(q_len.max())
        max_length_k = int(last_position_ids.max()) + 1
    else:
        position_ids = position_ids.flatten()
        indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

        # Every position id of 0 marks the start of a new packed sequence.
        cu_seq_lens_q = torch.cat(
            (
                indices_q[position_ids == 0],
                torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
            )
        )
        cu_seq_lens_k = cu_seq_lens_q

        # Use cu_seq_lens rather than position_ids for the max length: position ids are not always
        # increasing (e.g. qwen2-vl), and `.item()` keeps the value an int as flash attention requires.
        max_length_q = cu_seq_lens_q.diff().max().item()
        max_length_k = max_length_q

    return (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k)


def _prepare_from_posids(query, key, value, position_ids, query_length):
    """
This function returns necessary arguments to call `flash_attn_varlen_func`.
All three query, key, value states will be flattened.
Cumulative lengths of each example in the batch will be extracted from position_ids.
NOTE: ideally cumulative lengths should be prepared at the data collator stage

Arguments:
    query (`torch.Tensor`):
        Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
    key (`torch.Tensor`):
        Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    value (`torch.Tensor`):
        Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    position_ids (`torch.Tensor`):
        Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
    query_length (`int`):
        Sequence length of the input queries.

Return:
    query (`torch.Tensor`):
        Query state without padding. Shape: (total_target_length, num_heads, head_dim).
    key (`torch.Tensor`):
        Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    value (`torch.Tensor`):
        Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    indices_q (`torch.Tensor`):
        The indices of non-masked tokens from the flattened input target sequence.
    (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
        The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
    (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
        Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    kv_length = key.shape[1]
    is_packed_sequence = query_length == kv_length

    query = query.contiguous().view(-1, query.size(-2), query.size(-1))
    key = key.contiguous().view(-1, key.size(-2), key.size(-1))
    value = value.contiguous().view(-1, value.size(-2), value.size(-1))

    (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k) = prepare_fa_kwargs_from_position_ids(
        position_ids, is_packed_sequence=is_packed_sequence
    )
    # In the padding-free path every flattened token is kept, so the indices are simply a range.
    indices_q = torch.arange(query.size(0), device=query.device, dtype=torch.int32)

    return (query, key, value, indices_q, (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k))


def _prepare_flash_attention_from_position_ids(query, key, value, position_ids):
    warnings.warn(
        "The function `_prepare_flash_attention_from_position_ids` in `transformers.modeling_flash_attention_utils` is "
        "deprecated and will be removed in a future version. Please use `_prepare_from_posids` instead.",
        FutureWarning,
    )
    return _prepare_from_posids(query, key, value, position_ids, query_length=query.shape[1])


def _is_packed_sequence(position_ids, batch_size):
    """
Check whether the position ids indicate packed sequences or not:
    1. Position ids exist
    2. Flattened sequences only are supported
    3. Compile-friendly `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e. we have multiple increasing sequences
    """
    if position_ids is None:
        return False

    # Packed sequences are only possible for batch size 1: compare against a single increasing ramp and
    # flag any deviation (compile-friendly replacement for the `torch.diff` check above).
    increasing_position_sequences = (
        torch.arange(position_ids.shape[-1], device=position_ids.device) + position_ids.min()
    )
    return batch_size == 1 and (increasing_position_sequences - position_ids).abs().sum().bool()


def fa_peft_integration_check(q, k, v, target_dtype: Optional[torch.dtype] = None):
    """
    PEFT usually casts the layer norms in float32 for training stability reasons,
    therefore the input hidden states get silently cast to float32. Hence, we need to
    cast them back to float16 / bfloat16 just to be sure everything works as expected.
    This might slow down training & inference, so it is recommended to not cast the LayerNorms!
    """
    if target_dtype and q.dtype == torch.float32:
        logger.warning_once(f"Casting fp32 inputs back to {target_dtype} for flash-attn compatibility.")
        q, k, v = q.to(target_dtype), k.to(target_dtype), v.to(target_dtype)
    return q, k, v


class FlashAttentionKwargs(TypedDict, total=False):
    """
Keyword arguments for Flash Attention with Compile.

Attributes:
    cumulative_seqlens_q (`torch.LongTensor`, *optional*)
        Gets cumulative sequence length for query state.
    cumulative_seqlens_k (`torch.LongTensor`, *optional*)
        Gets cumulative sequence length for key state.
    max_length_q (`int`, *optional*):
        Maximum sequence length for query state.
    max_length_k (`int`, *optional*):
        Maximum sequence length for key state.
    """

    cumulative_seqlens_q: Optional[torch.LongTensor]
    cumulative_seqlens_k: Optional[torch.LongTensor]
    max_length_q: Optional[int]
    max_length_k: Optional[int]


def _process_flash_attention_kwargs(
    query_length: int,
    key_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: Optional[bool] = None,
    s_aux: Optional[torch.Tensor] = None,
    supports_mapping: Optional[dict[str, bool]] = None,
    **kwargs,
):
    """
Returns a set of kwargs that are passed down to the corresponding flash attention function, based on
the requested features and whether they are supported - this depends on the version and kernel implementation
that is dynamically configured at `lazy_import_flash_attention`. The (un)supported features can be
inspected in `supports_mapping`, see `_lazy_define_process_function` for more details.

Args:
    query_length (`int`):
        Length of the query states
    key_length (`int`):
        Length of the key states
    is_causal (`bool`):
        Whether we perform causal (decoder) attention or full attention.
    dropout (`float`):
        Attention dropout.
    softmax_scale (`float`, *optional*):
        The scaling of QK^T before applying softmax. Default to `1 / sqrt(head_dim)`.
    sliding_window (`int`, *optional*):
        The size of the sliding window, i.e. we look at a max of `sliding_window` tokens back.
    use_top_left_mask (`bool`):
        Deprecated behavior of older versions of flash attention requiring different masking.
    softcap (`float`, *optional*):
        Softcap for the attention logits, used e.g. in gemma2.
    deterministic (`bool`, *optional*):
        Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
    s_aux (`torch.Tensor`, *optional*):
        Attention sink auxiliary that adds a `bias` to the attention calculation via an additional head.
Return:
    flash_kwargs (`dict`):
        A dict of kwargs that are requested and supported.
    """
    flash_kwargs = {
        # With a top-left aligned causal mask (older flash attention), decoding with query_length == 1
        # would mask out everything but the first key, so causal is disabled in that case.
        "causal": is_causal and not (use_top_left_mask and query_length == 1),
        "softmax_scale": softmax_scale,
    }
    if supports_mapping["dropout"]:
        flash_kwargs["dropout_p"] = dropout
    if supports_mapping["sliding_window"] and sliding_window is not None and key_length > sliding_window:
        flash_kwargs["window_size"] = (sliding_window, sliding_window)
    if supports_mapping["deterministic"]:
        flash_kwargs["deterministic"] = (
            deterministic if deterministic is not None else os.getenv("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
        )
    if supports_mapping["softcap"] and softcap is not None:
        flash_kwargs["softcap"] = softcap
    if supports_mapping["s_aux"] and s_aux is not None:
        flash_kwargs["s_aux"] = s_aux

    return flash_kwargs


def _flash_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    query_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    position_ids: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: Optional[bool] = None,
    cu_seq_lens_q: Optional[torch.LongTensor] = None,
    cu_seq_lens_k: Optional[torch.LongTensor] = None,
    max_length_q: Optional[int] = None,
    max_length_k: Optional[int] = None,
    target_dtype: Optional[torch.dtype] = None,
    implementation: Optional[str] = None,
    **kwargs,
):
    """
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
first unpad the input, then compute the attention scores and pad the final attention scores.

(Optional) kwargs are described further in `_process_flash_attention_kwargs` and `FlashAttentionKwargs`.

Args:
    query_states (`torch.Tensor`):
        Input query states to be passed to Flash Attention API
    key_states (`torch.Tensor`):
        Input key states to be passed to Flash Attention API
    value_states (`torch.Tensor`):
        Input value states to be passed to Flash Attention API
    attention_mask (`torch.Tensor`, *optional*):
        The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
        position of padding tokens and 1 for the position of non-padding tokens.
    implementation (`str`, *optional*):
        The attention implementation to use. If None, will default to the one based on the environment.
    """
    (flash_fn, flash_varlen_fn, pad_fn, unpad_fn), process_flash_kwargs_fn = lazy_import_flash_attention(
        implementation
    )

    # PEFT possibly silently casts tensors to fp32, this potentially reconverts to the correct dtype or is a no-op.
    query_states, key_states, value_states = fa_peft_integration_check(
        query_states, key_states, value_states, target_dtype
    )

    flash_kwargs = process_flash_kwargs_fn(
        query_length=query_length,
        key_length=key_states.size(1),
        is_causal=is_causal,
        dropout=dropout,
        softmax_scale=softmax_scale,
        sliding_window=sliding_window,
        use_top_left_mask=use_top_left_mask,
        softcap=softcap,
        deterministic=deterministic,
        **kwargs,
    )

    # The varlen path is used either when the caller passed pre-computed cumulative lengths or when the
    # position ids indicate packed sequences (padding-free training).
    is_fa_with_varlen_kwargs = all(
        kwarg is not None for kwarg in (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k)
    )
    is_fa_with_position_ids = _is_packed_sequence(position_ids, batch_size=query_states.size(0))

    # Contains at least one padding token in the sequence
    if attention_mask is not None:
        q, k, v, indices_q, (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k) = _upad_input(
            query_states, key_states, value_states, attention_mask, query_length, unpad_fn
        )

        # The mps metal flash-sdpa kernel currently requires a cloned copy of the cumulative lengths.
        if "mps" in str(q.device):
            cu_seq_lens_k = cu_seq_lens_k.clone()

        out_unpad = flash_varlen_fn(
            q,
            k,
            v,
            cu_seqlens_q=cu_seq_lens_q,
            cu_seqlens_k=cu_seq_lens_k,
            max_seqlen_q=max_length_q,
            max_seqlen_k=max_length_k,
            **flash_kwargs,
        )
        if isinstance(out_unpad, tuple):
            out_unpad = out_unpad[0]

        out = pad_fn(out_unpad, indices_q, query_states.size(0), query_length)
    elif is_fa_with_varlen_kwargs or is_fa_with_position_ids:
        if cu_seq_lens_q is None or cu_seq_lens_k is None:
            q, k, v, _, (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k) = _prepare_from_posids(
                query_states, key_states, value_states, position_ids, query_length=query_length
            )
        else:
            q = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
            k = key_states.reshape(-1, key_states.size(-2), key_states.size(-1))
            v = value_states.reshape(-1, value_states.size(-2), value_states.size(-1))

        if "mps" in str(q.device):
            cu_seq_lens_k = cu_seq_lens_k.clone()

        out = flash_varlen_fn(
            q,
            k,
            v,
            cu_seqlens_q=cu_seq_lens_q,
            cu_seqlens_k=cu_seq_lens_k,
            max_seqlen_q=max_length_q,
            max_seqlen_k=max_length_k,
            **flash_kwargs,
        )
        if isinstance(out, tuple):
            out = out[0]

        out = out.view(query_states.size(0), -1, out.size(-2), out.size(-1))
    else:
        out = flash_fn(query_states, key_states, value_states, **flash_kwargs)
        if isinstance(out, tuple):
            out = out[0]

    return out