
import torch

from ..generation.continuous_batching import PagedAttentionCache
from ..utils import is_flash_attn_2_available


try:
    if is_flash_attn_2_available():
        from flash_attn import flash_attn_varlen_func
except Exception:
    pass


def paged_attention_forward(
    module: torch.nn.Module,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    attention_mask: torch.Tensor = None,
    cache: PagedAttentionCache = None,
    cumulative_seqlens_q=None,
    cumulative_seqlens_k=None,
    max_seqlen_q=None,
    max_seqlen_k=None,
    block_tables=None,
    implementation=None,
    **kwargs,
) -> torch.Tensor:
    r"""Perform the forward pass of attention with paged key-value cache.

This function handles the cache updates and performs the attention computation
using the flash_attn_varlen_func for efficient processing.

Args:
    q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
    k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. However, if a block table is provided, this can be the full paged key cache.
    v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. However, if a block table is provided, this can be the full paged value cache.
    cumulative_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
       of the sequences in the batch, used to index into q.
    cumulative_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
       of the sequences in the batch, used to index into kv.
    max_seqlen_q: int. Maximum query sequence length in the batch.
    max_seqlen_k: int. Maximum key sequence length in the batch.
    dropout_p: float. Dropout probability.
    softmax_scale: float. The scaling of QK^T before applying softmax.
        Default to 1 / sqrt(headdim).
    causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
    window_size: (left, right). If not (-1, -1), implements sliding window local attention.
    softcap: float. Anything > 0 activates softcapping attention.
    """
    # Write the new key/value states into the paged cache and get back the
    # gathered key/value states for this layer.
    k, v = cache.update(k, v, module.layer_idx, cumulative_seqlens_k=cumulative_seqlens_k, **kwargs)

    sliding_window = (-1, -1) if not getattr(module, "sliding_window", False) else (module.sliding_window, 0)
    if implementation is not None:
        flash_attn_varlen_func = implementation.flash_attn_varlen_func
    custom_kwargs = {"s_aux": kwargs.get("s_aux")}
    attn_output = flash_attn_varlen_func(
        q.transpose(1, 2).squeeze(0).contiguous(),
        k.transpose(1, 2).squeeze(0).contiguous(),
        v.transpose(1, 2).squeeze(0).contiguous(),
        cumulative_seqlens_q.to(torch.int32),
        cumulative_seqlens_k.to(torch.int32).clone(),
        max_seqlen_q,
        max_seqlen_k,
        softmax_scale=module.scaling,
        causal=True,  # the varlen kernel aligns the causal mask to the bottom right when q is shorter than k
        window_size=sliding_window,  # (-1, -1) means an unbounded attention window
        **custom_kwargs,
    )
    if isinstance(attn_output, tuple):
        attn_output = attn_output[0]
    return attn_output, None
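

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library module): shows how
# `paged_attention_forward` could be driven end to end. `_PassThroughCache`,
# `_EagerVarlenImpl`, and `_DummyAttention` below are hypothetical stand-ins,
# not real transformers classes; a real run would use a `PagedAttentionCache`
# and the flash-attn kernels. Here a naive eager attention is injected through
# the `implementation` hook so the sketch runs on CPU with PyTorch >= 2.1
# (for SDPA's `scale` argument) and without flash-attn installed.
#
#     import torch
#
#     class _PassThroughCache:
#         def update(self, key_states, value_states, layer_idx, cumulative_seqlens_k=None, **kwargs):
#             # A real PagedAttentionCache scatters key/value states into paged
#             # blocks and returns the gathered cache content; just pass through.
#             return key_states, value_states
#
#     class _EagerVarlenImpl:
#         @staticmethod
#         def flash_attn_varlen_func(q, k, v, cu_q, cu_k, max_q, max_k,
#                                    softmax_scale=None, causal=True,
#                                    window_size=(-1, -1), **kwargs):
#             # q/k/v arrive as (total_tokens, num_heads, head_dim); this toy
#             # version assumes a single sequence and ignores window_size/s_aux.
#             q_, k_, v_ = (t.transpose(0, 1) for t in (q, k, v))  # (heads, tokens, dim)
#             out = torch.nn.functional.scaled_dot_product_attention(
#                 q_, k_, v_, is_causal=causal, scale=softmax_scale
#             )
#             return out.transpose(0, 1)  # back to (tokens, heads, dim)
#
#     class _DummyAttention(torch.nn.Module):
#         layer_idx = 0
#         scaling = 64 ** -0.5  # 1 / sqrt(head_dim)
#
#     total_tokens, num_heads, head_dim = 8, 4, 64
#     # Inputs follow the (1, num_heads, total_tokens, head_dim) layout that the
#     # transpose/squeeze in the forward converts to flash-attn's varlen layout.
#     q = torch.randn(1, num_heads, total_tokens, head_dim)
#     k = torch.randn(1, num_heads, total_tokens, head_dim)
#     v = torch.randn(1, num_heads, total_tokens, head_dim)
#     cu_seqlens = torch.tensor([0, total_tokens], dtype=torch.int32)
#
#     out, _ = paged_attention_forward(
#         _DummyAttention(), q, k, v,
#         cache=_PassThroughCache(),
#         cumulative_seqlens_q=cu_seqlens,
#         cumulative_seqlens_k=cu_seqlens,
#         max_seqlen_q=total_tokens,
#         max_seqlen_k=total_tokens,
#         implementation=_EagerVarlenImpl,
#     )
#     print(out.shape)  # torch.Size([8, 4, 64])
# ---------------------------------------------------------------------------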