import torch

from ..generation.continuous_batching import PagedAttentionCache
from ..utils import is_flash_attn_2_available


try:
    if is_flash_attn_2_available():
        from flash_attn import flash_attn_varlen_func
except Exception:
    pass


def paged_attention_forward(
    module: torch.nn.Module,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    attention_mask: torch.Tensor = None,
    cache: PagedAttentionCache = None,
    cu_seq_lens_q=None,
    cu_seq_lens_k=None,
    max_seqlen_q=None,
    max_seqlen_k=None,
    block_tables=None,
    implementation=None,
    **kwargs,
) -> torch.Tensor:
    r"""Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch, but if a block table is provided it can be the full k.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch, but if a block table is provided it can be the full v.
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Defaults to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    """
    k, v = cache.update(k, v, module.layer_idx, **kwargs)

    # (-1, -1) means no sliding window; otherwise restrict attention to the module's window.
    sliding_window = (-1, -1) if not getattr(module, "sliding_window", False) else (module.sliding_window, 0)
    if implementation is not None:
        flash_attn_varlen_func = implementation.flash_attn_varlen_func
    # Forward the attention-sink auxiliary tensor only when the caller provides one.
    custom_kwargs = {"s_aux": kwargs.get("s_aux")} if "s_aux" in kwargs else {}
    attn_output = flash_attn_varlen_func(
        q.transpose(1, 2).squeeze(0).contiguous(),
        k.transpose(1, 2).squeeze(0).contiguous(),
        v.transpose(1, 2).squeeze(0).contiguous(),
        cu_seq_lens_q.to(torch.int32),
        cu_seq_lens_k.to(torch.int32).clone(),
        max_seqlen_q,
        max_seqlen_k,
        softmax_scale=module.scaling,
        causal=True,  # auto-regressive generation; the kernel aligns the mask when q is shorter than k
        window_size=sliding_window,
        **custom_kwargs,
    )
    if isinstance(attn_output, tuple):
        attn_output = attn_output[0]
    return attn_output, None
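

# Illustrative sketch, not part of the upstream module: how the variable-length
# metadata consumed by flash_attn_varlen_func above is typically assembled for a
# packed batch. `q_lens` and `k_lens` are hypothetical per-sequence token counts;
# in practice the continuous-batching scheduler produces these values.
if __name__ == "__main__":
    q_lens = torch.tensor([3, 5, 2], dtype=torch.int32)  # new query tokens per sequence
    k_lens = torch.tensor([7, 9, 4], dtype=torch.int32)  # cached + new key tokens per sequence

    # Cumulative sequence lengths, prefixed with 0: shape (batch_size + 1,), dtype int32.
    cu_seq_lens_q = torch.nn.functional.pad(torch.cumsum(q_lens, dim=0), (1, 0)).to(torch.int32)
    cu_seq_lens_k = torch.nn.functional.pad(torch.cumsum(k_lens, dim=0), (1, 0)).to(torch.int32)

    max_seqlen_q = int(q_lens.max())  # longest query segment in the batch
    max_seqlen_k = int(k_lens.max())  # longest key segment in the batch

    print(cu_seq_lens_q)  # tensor([ 0,  3,  8, 10], dtype=torch.int32)
    print(cu_seq_lens_k)  # tensor([ 0,  7, 16, 20], dtype=torch.int32)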