
    ho                     d   d dl mZ d dlZddlmZ ddlmZ 	  e       r	d dlmZ eZ	n e
d      	 	 	 	 	 	 	 	 dd	ej                  j                   d
ej"                  dej"                  dej"                  deej"                     dedej"                  fdZy# e$ rZ ee      Zd Z	Y dZ[ydZ[ww xY w)    )OptionalN   )PagedAttentionCache)is_flash_attn_2_available)flash_attn_varlen_funczFlash Attention 2 is not installed. Please refer to https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install itc                  &    t        dt               )Nz)flash_attn_varlen_func is not available: )	Exceptionmsg)argskwargss     d/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/integrations/flash_paged.pyFLASH_ATTN_VARLEN_FUNCr      s    CC5IJJ    moduleqkvattention_maskcachereturnc           	         t        | dd      sdn| j                  dz
  df}|dk(  rdnd}|" |j                  ||| j                  fi |\  }}t	        |t
              r
||   }|	|   }	|
t        |
d	      r|
j                  }nt        }d
|v rd
|j                  d
      ini } ||j                  dd      j                  d      j                         |j                         |j                         |j                  t        j                        |j                  t        j                        j!                         ||	f| j"                  d|d|}t	        |t$              r|d   }|dfS )a  Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    sliding_windowF)r      r   full_attentionsliding_attentionNr   s_auxr   T)softmax_scalecausalwindow_size)getattrr   update	layer_idx
isinstancedicthasattrr   r   get	transposesqueeze
contiguoustotorchint32clonescalingtuple)r   r   r   r   r   r   cu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kimplementationr   r   
layer_typer   custom_kwargsattn_outputs                    r   paged_attention_forwardr9      ss   H &-V5Eu%MXTZTiTilmTmopSqN%3x%?!EXJ u||Aq&"2"2=f=1 -&%j1#J/!gn>V&W!/!F!F!76=6GWfjj12RM(	Aq!!!$//1		%%++- nn" K +u%!!nr   )NNNNNNN)typingr   r,   generation.continuous_batchingr   utilsr   
flash_attnr   r   RuntimeErrorr	   ereprr
   nnModuleTensorr9    r   r   <module>rE      s      @ -K "5!7 ]
 	
 	" .2!%FHHOOF||F ||F ||	F
 U\\*F F \\F  K
q'CKKs   B B/B**B/