
from collections import deque
from math import floor, gcd, sqrt
from typing import Optional, Union

import torch

from ...configuration_utils import PretrainedConfig
from ...generation.configuration_utils import GenerationConfig
from ...utils.metrics import attach_tracer, traced
from .cache_manager import CacheAllocator, FullAttentionCacheAllocator, SlidingAttentionCacheAllocator
from .requests import get_device_and_memory_breakdown, logger


def group_layers_by_attn_type(config: PretrainedConfig) -> tuple[list[list[int]], list[str]]:
    """
    Group layers depending on the attention mix, according to VLLM's hybrid allocator rules:
        - Layers in each group need to have the same type of attention
        - All groups have the same number of layers

    For a model with the following layer types: ["sliding", "full", "full", "sliding", "full", "full", "full", "full"]
    We would get four groups of two layers each: [0, 3] for the sliding-attention layers, and [1, 2], [4, 5],
    [6, 7] for the full-attention layers.
    """
    layer_types = getattr(config, "layer_types", None)
    if layer_types is None:
        # If the config has no explicit layer_types, assume a homogeneous model
        attn_type = "sliding_attention" if getattr(config, "sliding_window", None) is not None else "full_attention"
        layer_types = [attn_type for _ in range(config.num_hidden_layers)]

    # Gather the layer indices per attention type
    layer_counts = {}
    for i, layer_type in enumerate(layer_types):
        layer_counts[layer_type] = layer_counts.get(layer_type, []) + [i]

    # The group size is the greatest common divisor of the per-type layer counts
    group_size = gcd(*[len(indices) for indices in layer_counts.values()])

    # Split the indices of each attention type into chunks of group_size layers
    layer_groups = []
    for layer_type, indices in layer_counts.items():
        for i in range(0, len(indices), group_size):
            layer_groups.append(indices[i : i + group_size])
    group_types = [layer_types[group[0]] for group in layer_groups]
    return layer_groups, group_types


@attach_tracer()
class PagedAttentionCache:
    """
    Manages the cache for a paged attention mechanism, inspired by VLLM's hybrid allocator. The cache relies on making
    groups of layers to reduce the complexity of cache management and fragmentation.

    The cache uses a three-level hierarchy:
    - Pages: The smallest unit of cache, a page has a size of [num_heads, head_size], which is the space needed to
        store the key or value states for one token and one layer. For a model with only full-attention layers, to store
        the KV cache of one token, we need `2 * num_layers` pages: key and values each take `num_layers` pages.
        Pages are grouped into blocks:
    - Blocks: A block is a collection of `block_size` pages, serving as the allocation unit to reduce management
        complexity and fragmentation. Cache is allocated and freed block by block, not page by page. One block is
        allocated to one layer group, which only has one attention type, like full-attention or sliding-attention.
        If all layers in the model have the same attention type, then all layers will be in the same group. There is
        more than one group if and only if the model has mixed attention types, like layers with full-attention and
        layers with sliding-attention.
    - Cache tensors: The physical supports for the cache. There are as many cache tensors as there are layers in a
        layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`.
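
    To give a rough sense of scale (illustrative numbers, not defaults): with num_heads = 8, head_size = 128 and
    block_size = 32, one page holds 8 * 128 values, one block covers 32 tokens of keys (or values) for one layer of
    its group, and a group of 16 layers needs 16 key tensors plus 16 value tensors of shape [num_blocks * 32, 8, 128].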

    Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the
        same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to
        efficiently allocate and free blocks, and to efficiently read and write key and value states.

    For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3
    layers and a sliding-attention group with 3 layers. At creation time, the physical cache tensors look like this:

    cache_tensor_0: □ □ □ □ □ □ □ □
    cache_tensor_1: □ □ □ □ □ □ □ □
    cache_tensor_2: □ □ □ □ □ □ □ □

    where □ means the block is not allocated to any layer group yet. We have 3 cache tensors because there are
    3 layers per group.
    We allocate 1 block to each group, after allocation, the cache tensors look like this:

    cache_tensor_0: ✖ ◉ □ □ □ □ □ □
    cache_tensor_1: ✖ ◉ □ □ □ □ □ □
    cache_tensor_2: ✖ ◉ □ □ □ □ □ □

    where ✖ means the block is allocated to the full-attention group, and ◉ means the block is allocated to the
    sliding-attention group.
    Now, if we continue to generate, and the sliding window has been reached, we only need to allocate a new block
    for the full-attention group, and the cache tensors look like this:

    cache_tensor_0: ✖ ◉ ✖ □ □ □ □ □
    cache_tensor_1: ✖ ◉ ✖ □ □ □ □ □
    cache_tensor_2: ✖ ◉ ✖ □ □ □ □ □

    And after further generation, when we need a new block allocated:

    cache_tensor_0: ✖ ◉ ✖ ✖ □ □ □ □
    cache_tensor_1: ✖ ◉ ✖ ✖ □ □ □ □
    cache_tensor_2: ✖ ◉ ✖ ✖ □ □ □ □

    This would not have been possible if all layers were in the same group: we would have had to allocate a new block
    for the sliding-attention group, although it is not needed.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        generation_config: GenerationConfig,
        device: torch.device,
        dtype: torch.dtype = torch.float16,
        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
        tp_size: Optional[int] = None,
    ) -> None:
        """Initialize a paged attention cache for efficient memory usage.

        Args:
            config: Model configuration
            generation_config: Generation configuration containing cache parameters
            device: Device for the cache tensors
            dtype: Data type of the cache
            layer_device_map: Optional mapping of layer indices to devices
            tp_size: Tensor parallelism size
        """
        self.config = config
        self.dtype = dtype
        self.device = device

        # Infer the KV head count and head dimension from the config
        kv_heads = getattr(config, "num_key_value_heads", None)
        self.num_key_value_heads = kv_heads if kv_heads is not None else config.num_attention_heads
        head_dim = getattr(config, "head_dim", None)
        self.head_dim = head_dim if head_dim is not None else config.hidden_size // config.num_attention_heads
        self.block_size = getattr(generation_config, "block_size", 32)

        # Group the layers by attention type and remember each layer's (group index, index in group) coordinates
        layer_groups, group_types = group_layers_by_attn_type(config)
        group_size = len(layer_groups[0])
        self.num_groups = len(layer_groups)
        self.sliding_windows = {}
        self.layer_index_to_group_indices = {}
        for i, group in enumerate(layer_groups):
            sliding_window = config.sliding_window if group_types[i] == "sliding_attention" else 1
            for j, layer in enumerate(group):
                self.layer_index_to_group_indices[layer] = (i, j)
                self.sliding_windows[layer] = sliding_window

        # With tensor parallelism, each rank only stores its shard of the KV heads
        if tp_size is not None and tp_size > 1:
            if self.num_key_value_heads % tp_size != 0:
                raise ValueError(
                    f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
                )
            self.num_key_value_heads //= tp_size

        page_size = self.head_dim * self.num_key_value_heads
        if getattr(config, "attn_implementation", None) == "paged_attention":
            num_attention_masks = 0  # no attention mask tensor is materialized for the paged attention kernel
        else:
            num_attention_masks = 2 if "sliding_attention" in group_types else 1

        # Infer the number of blocks and the maximum number of tokens per batch from the available memory
        memory_handler = PagedAttentionMemoryHandler(
            block_size=self.block_size,
            page_size=page_size,
            num_groups=self.num_groups,
            group_size=group_size,
            peak_activation_per_token=config.hidden_size + config.vocab_size,
            num_attention_masks=num_attention_masks,
        )
        num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
            num_blocks=getattr(generation_config, "num_blocks", None),
            max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
            max_memory_percent=getattr(generation_config, "max_memory", 0.9),
            cache_dtype=self.dtype,
        )
        self.num_blocks = num_blocks
        self.max_batch_tokens = max_batch_tokens
        logger.info(
            f"PagedAttentionCache initialized with {self.num_blocks = }, {self.block_size = }, {page_size = }, "
            f"{self.max_batch_tokens = } {num_attention_masks = }"
        )

        # Physical cache tensors: one key tensor and one value tensor per layer of a group, shared by all groups
        self.key_cache: list[torch.Tensor] = []
        self.value_cache: list[torch.Tensor] = []
        self.cache_shape = (num_blocks * self.block_size, self.num_key_value_heads, self.head_dim)
        for _ in range(group_size):
            new_layer_key_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
            new_layer_value_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
            torch._dynamo.mark_static_address(new_layer_key_cache)
            torch._dynamo.mark_static_address(new_layer_value_cache)
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)
        logger.info(f"{self.cache_shape = } {self.key_cache[0].shape = } {self.key_cache[0].numel() = }")

        # Block bookkeeping: all blocks start free, and each layer group gets its own cache allocator
        self._free_blocks = deque(range(num_blocks))
        self.group_cache_managers: list[CacheAllocator] = []
        for i, group_type in enumerate(group_types):
            if group_type == "full_attention":
                cm = FullAttentionCacheAllocator(i, self.block_size)
            elif group_type == "sliding_attention":
                cm = SlidingAttentionCacheAllocator(i, self.block_size, config.sliding_window)
            else:
                raise ValueError(f"Invalid group type: {group_type}")
            self.group_cache_managers.append(cm)

    @traced
    def allocate_blocks(self, n_blocks: int, request_id: str) -> int:
        """Allocate cache blocks across all layer groups for a given request. Actual allocation is done by the cache
        managers, and this method only returns the maximum number of blocks actually allocated across all managers."""
        max_allocated = 0
        for cm in self.group_cache_managers:
            allocated = cm.allocate_blocks(n_blocks, request_id, self._free_blocks)
            if allocated is None:
                return None
            max_allocated = max(max_allocated, allocated)
        return max_allocated

    @traced
    def free_blocks(self, request_id: str) -> None:
        """Free all allocated cache blocks for a given request across all layer groups. Actual deallocation is done
        by the cache managers."""
        for cm in self.group_cache_managers:
            cm.free_blocks(request_id, self._free_blocks)

    def get_num_free_blocks(self) -> int:
        """Get the current number of unallocated blocks available for new requests."""
        return len(self._free_blocks)

    @traced
    def extend_read_indices(
        self, request_id: str, past_length: int, query_length: int, read_index: list[list[int]]
    ) -> None:
        """Retrieve physical cache indices for reading KV states in the cache across all layer groups. This method
        coordinates with all cache managers to build the complete set of read indices needed for attention computation.
        """
        for cm, group_read_indices in zip(self.group_cache_managers, read_index):
            read_indices = cm.get_read_indices(request_id, past_length, query_length)
            group_read_indices.extend(read_indices)

    @traced
    def extend_write_indices(
        self, request_id: str, past_length: int, query_length: int, write_index: list[list[int]]
    ) -> None:
        """Retrieve physical cache indices for writing new KV states to the cache across all layer groups. This method
        coordinates with all cache managers to build the complete set of write indices needed to store computed KV
        states."""
        for cm, group_write_indices in zip(self.group_cache_managers, write_index):
            write_indices = cm.get_write_indices(request_id, past_length, query_length)
            group_write_indices.extend(write_indices)

    @traced
    def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> dict[str, int]:
        """Retrieve the key sequence length for the given request_id across all layer types. Returns a dictionary of
        layer types to their corresponding key sequence lengths."""
        seqlens_k = {}
        for cm in self.group_cache_managers:
            layer_type, seqlen_k = cm.get_seqlens_k(request_id, past_length, query_length)
            seqlens_k[layer_type] = seqlen_k
        return seqlens_k

    @traced
    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        read_index: list[torch.Tensor],
        write_index: list[torch.Tensor],
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Update the cache with new key-value states for a specific layer. This method writes new KV states to the
        appropriate cache locations. The behavior differs based on the layer's attention type:

        - Full attention: New KV states are written to cache, then complete sequence is read from cache
        - Sliding window: Old KV is read from cache along with extra spaces for the new KV, then new KV is written to
            cache. This is because new KV might overwrite the old KV, so we need to read the old KV first.
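
        A minimal sketch of the two paths (the names and the sentinel value are illustrative of the logic, not an
        exact transcription of the implementation):

            # full attention: write the new KV first, then read the whole sequence back
            cache[write_idx] = new_kv; out = cache[read_idx]
            # sliding window: read the old window first, splice in the new KV, then write it
            out = cache[read_idx]; out[read_idx == -1] = new_kv; cache[write_idx] = new_kv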

        Returns the complete KV states (cached + new) for attention computation.
        """
        # Locate the layer inside its group and pick the physical indices of that group
        group_idx, layer_idx_in_group = self.layer_index_to_group_indices[layer_idx]
        layer_read_index = read_index[group_idx]
        layer_write_index = write_index[group_idx]
        k_cache = self.key_cache[layer_idx_in_group]
        v_cache = self.value_cache[layer_idx_in_group]

        # Incoming states are [1, num_heads, seq_len, head_dim]; the cache stores [num_pages, num_heads, head_dim]
        key_states = key_states.transpose(1, 2).squeeze(0)
        value_states = value_states.transpose(1, 2).squeeze(0)

        sliding_window = self.sliding_windows[layer_idx]
        if sliding_window == 1:
            # Full attention: write the new KV states first, then read the complete sequence back from the cache
            k_cache[layer_write_index, :, :] = key_states
            v_cache[layer_write_index, :, :] = value_states
            key_states_with_cache = k_cache[layer_read_index, :, :]
            value_states_with_cache = v_cache[layer_read_index, :, :]
        else:
            # Sliding window: read the old KV states first, because the new ones may overwrite part of them
            mask = layer_read_index == -1
            key_states_with_cache = k_cache[layer_read_index, :, :]
            key_states_with_cache[mask] = key_states
            value_states_with_cache = v_cache[layer_read_index, :, :]
            value_states_with_cache[mask] = value_states
            k_cache[layer_write_index, :, :] = key_states
            v_cache[layer_write_index, :, :] = value_states
        return key_states_with_cache, value_states_with_cache


class PagedAttentionMemoryHandler:
    """A helper class to determine the best number of pages and maximum number of tokens per batch for the paged
    attention cache, providing automatic sizing based on available GPU memory.
    The helper works using the number of pages, which is tied to the number of blocks by:
        num_blocks = num_pages // block_size

    The memory footprint consists of three main components:
    - Cache memory: the space needed to store the cache tensors:
        2 * layer_group_size * [num_pages, page_size] * cache_dtype
    - Activation memory: the space temporarily taken by the largest activation during the model forward pass:
        peak_activation_per_token * max_tokens_per_batch * activation_dtype_size
    - Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of:
        - inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size
        - attention_mask: num_attention_masks * num_pages * max_tokens_per_batch * activation_dtype_size
        - cumulative_seqlens_q + cumulative_seqlens_k: (1 + 2) * max_tokens_per_batch * int32_size
        - write_index_tensor: num_groups * max_tokens_per_batch * int32_size
        - read_index_tensor: num_groups * (num_pages + max_tokens_per_batch) * int32_size
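
    To give a sense of scale for the dominant cache term (purely illustrative numbers): with group_size = 16 layers,
    page_size = 1024 (8 KV heads * 128 head dim), a 2-byte cache dtype and num_pages = 262144 (8192 blocks of 32
    pages), the cache tensors alone take 2 * 16 * 262144 * 1024 * 2 bytes = 16 GiB.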

    The handler can operate in three modes:
    1. Auto-sizing: Determines both number of pages and maximum number of tokens per batch using quadratic optimization
    2. Fixed cache: Calculates max batch tokens given a fixed number of pages
    3. Fixed batch: Calculates number of pages given a fixed maximum batch size
    """

    _activation_dtype = torch.bfloat16
    _input_dtype = torch.int32
    # Safety caps applied to the automatically inferred sizes
    _upper_bound_num_blocks = 100_000
    _upper_bound_max_batch_tokens = 2048

    def __init__(
        self,
        block_size: int,
        page_size: int,
        num_groups: int,
        group_size: int,
        peak_activation_per_token: int,
        num_attention_masks: int,
    ) -> None:
        """Initialize the memory handler with the parameters that cannot be automatically inferred.

        Args:
            block_size: Size of the cache blocks
            page_size: Size of the cache pages
            num_groups: Number of layer groups
            group_size: Number of layers per layer group
            peak_activation_per_token: Maximum size of activation tensor per token, = hidden_size + vocab_size
            num_attention_masks: Number of attention masks, 0 if no attention mask is used, 2 if hybrid model, else 1
        """
        self.block_size = block_size
        self.page_size = page_size
        self.num_groups = num_groups
        self.group_size = group_size
        self.peak_activation_per_token = peak_activation_per_token
        self.num_attention_masks = num_attention_masks

    @staticmethod
    def get_available_memory(max_memory_percent: float = 1.0) -> int:
        """Calculate available GPU memory for cache allocation, accounting for already allocated tensors.
        This method queries the current memory state and applies the specified percentage limit to determine
        how much memory can be safely used for the paged attention cache.

        Args:
            max_memory_percent: Fraction of available memory to use (0.0-1.0). 1.0 means use all available memory.

        Returns:
            int: Available memory in bytes for cache allocation
        """
        _, total, reserved, allocated = get_device_and_memory_breakdown()
        available_memory = total - max(reserved, allocated)
        available_memory = int(available_memory * max_memory_percent)
        return available_memory

    def infer_num_blocks_and_max_batch_tokens(
        self,
        num_blocks: Optional[int] = None,
        max_batch_tokens: Optional[int] = None,
        max_memory_percent: float = 0.9,
        cache_dtype: torch.dtype = torch.float16,
    ) -> tuple[int, int]:
        """Determine optimal number of blocks and maximum number of tokens per batch based on available memory and
        constraints. Check the class docstring for more details. Naming the number of pages as N and the maximum number
        of tokens per batch as M, the equation solved is:

        available_memory = sum([
            MN * num_attention_masks * activation_dtype_size,
            2N * (layer_group_size * page_size * cache_dtype + 2 * num_group),
            M * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group),
        ])

        where we already simplified int32_size = 4.
        """
        if num_blocks is None and max_batch_tokens is None:
            num_blocks, max_batch_tokens = self.compute_num_blocks_and_max_batch_tokens(
                max_memory_percent=max_memory_percent, cache_dtype=cache_dtype
            )
        elif num_blocks is not None and max_batch_tokens is None:
            max_batch_tokens = self.compute_max_batch_tokens(num_blocks, max_memory_percent, cache_dtype)
        elif num_blocks is None and max_batch_tokens is not None:
            num_blocks = self.compute_num_blocks(max_batch_tokens, max_memory_percent, cache_dtype)

        # Sanity check: the chosen (num_blocks, max_batch_tokens) pair must fit in the available memory
        available_memory = self.get_available_memory(max_memory_percent)
        memory_footprint = self.compute_memory_footprint(num_blocks, max_batch_tokens, cache_dtype)
        if memory_footprint > available_memory:
            raise MemoryError(f"Memory footprint {memory_footprint} is more than available memory {available_memory}")
        return num_blocks, max_batch_tokens

    def compute_num_blocks_and_max_batch_tokens(
        self,
        max_memory_percent: float,
        cache_dtype: torch.dtype = torch.float16,
        m: float = 0.01,
    ) -> tuple[int, int]:
        """Calculate optimal number of blocks and maximum number of tokens per batch using quadratic optimization when
        neither is fixed. This method assumes a relationship M = m * N where m is a small ratio below 1 and solves the
        resulting quadratic equation to find the optimal N that maximizes utilization within memory constraints. m is
        the amount of cache we can fill with one batch: m=0.01 means a batch fills at most 1% of the cache. The equation
        to solve is:

        available_memory = sum([
            m * N^2 * num_attention_masks * activation_dtype_size,
            2N * (layer_group_size * page_size * cache_dtype + 2 * num_group),
            m * N * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group),
        ])
        """
        available_memory = self.get_available_memory(max_memory_percent)
        logger.info(f"Available memory: {available_memory}")

        # Coefficients of the quadratic equation a * N^2 + b * N + c = 0
        a = m * self.num_attention_masks * self._activation_dtype.itemsize
        b = 2 * (self.group_size * self.page_size * cache_dtype.itemsize + 2 * self.num_groups)
        b += m * (self.peak_activation_per_token * self._activation_dtype.itemsize + 28 + 4 * self.num_groups)
        c = -available_memory
        logger.debug(f"Coefficients of 2nd degree polynomial: {a = }, {b = }, {c = }")

        if a == 0:
            # No attention-mask term: the constraint is linear in N
            greatest_solution = -c / b
        else:
            discriminant = b**2 - 4 * a * c
            if discriminant < 0:
                raise ValueError(f"Discriminant is negative: {discriminant = }")
            greatest_solution = (-b + sqrt(discriminant)) / (2 * a)
        if greatest_solution < 0:
            raise ValueError(f"Greatest solution is negative: {greatest_solution = }")

        # Turn the number of pages into a number of blocks and a number of batch tokens, and clamp both
        num_pages = floor(greatest_solution)
        num_blocks = num_pages // self.block_size
        if num_blocks > self._upper_bound_num_blocks:
            logger.warning(f"{num_blocks = } is too large, setting to {self._upper_bound_num_blocks = }")
            num_blocks = self._upper_bound_num_blocks
        max_batch_tokens = floor(greatest_solution * m)
        if max_batch_tokens > self._upper_bound_max_batch_tokens:
            logger.warning(f"{max_batch_tokens = } is too large, setting to {self._upper_bound_max_batch_tokens = }")
            max_batch_tokens = self._upper_bound_max_batch_tokens
        return num_blocks, max_batch_tokens

    def compute_max_batch_tokens(
        self,
        num_blocks: int,
        max_memory_percent: float,
        cache_dtype: torch.dtype = torch.float16,
    ) -> int:
        """Calculate maximum batch tokens M given a fixed number of cache blocks. The formula for M is given by:

        M = (available_memory - 2N * (layer_group_size * page_size * cache_dtype + 2 * num_group))
            / (activation_dtype_size * (N * num_attention_masks + peak_activation_per_token) + 28 + 4 * num_group)
        """
        available_memory = self.get_available_memory(max_memory_percent)
        num_pages = num_blocks * self.block_size
        num = available_memory
        num -= 2 * num_pages * (self.group_size * self.page_size * cache_dtype.itemsize + 2 * self.num_groups)
        denum = self._activation_dtype.itemsize * (
            num_pages * self.num_attention_masks + self.peak_activation_per_token
        )
        denum += 28 + 4 * self.num_groups

        max_batch_tokens = floor(num / denum)
        if max_batch_tokens > self._upper_bound_max_batch_tokens:
            logger.warning(f"{max_batch_tokens = } is too large, setting to {self._upper_bound_max_batch_tokens = }")
            max_batch_tokens = self._upper_bound_max_batch_tokens
        return max_batch_tokens

    def compute_num_blocks(
        self,
        max_batch_tokens: int,
        max_memory_percent: float,
        cache_dtype: torch.dtype = torch.float16,
    ) -> int:
        """Calculate the number of cache blocks N given a fixed maximum number of batch tokens M. The formula for N is
        given by:

        N = (available_memory - M * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group))
          / (2 * (layer_group_size * page_size * cache_dtype + 2 * num_group) + M * (num_attention_masks * activation_dtype_size))
        """
        available_memory = self.get_available_memory(max_memory_percent)
        num = available_memory
        num -= max_batch_tokens * self.peak_activation_per_token * self._activation_dtype.itemsize
        num -= max_batch_tokens * (28 + 4 * self.num_groups)
        denum = 2 * (self.group_size * self.page_size * cache_dtype.itemsize + 2 * self.num_groups)
        denum += max_batch_tokens * self.num_attention_masks * self._activation_dtype.itemsize

        num_pages = floor(num / denum)
        num_blocks = num_pages // self.block_size
        if num_blocks > self._upper_bound_num_blocks:
            logger.warning(f"{num_blocks = } is too large, setting to {self._upper_bound_num_blocks = }")
            num_blocks = self._upper_bound_num_blocks
        return num_blocks

    def compute_memory_footprint(
        self,
        num_blocks: Optional[int] = None,
        max_batch_tokens: Optional[int] = None,
        cache_dtype: torch.dtype = torch.float16,
    ) -> int:
        """Calculate the memory footprint breakdown for a given number of blocks and maximum batch tokens. The memory
        footprint is given by:

        available_memory = sum([
            MN * num_attention_masks * activation_dtype_size,
            2N * (layer_group_size * page_size * cache_dtype + 2 * num_group),
            M * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group),
        ])
        but is broken down below.
        """
        num_pages = num_blocks * self.block_size
        # Cache tensors: one key and one value tensor per layer of a group, each of shape [num_pages, page_size]
        cache_memory_footprint = 2 * self.group_size * num_pages * self.page_size * cache_dtype.itemsize
        # Largest activation produced during the forward pass
        activation_memory_footprint = (
            max_batch_tokens * self.peak_activation_per_token * self._activation_dtype.itemsize
        )
        # Input ids, output ids, position ids and logits indices, all int32
        inputs_outputs_positions_and_logits_memory_footprint = 4 * max_batch_tokens * 4
        # Attention mask(s) over [max_batch_tokens, num_pages] in the activation dtype
        attention_memory_footprint = (
            self.num_attention_masks * max_batch_tokens * num_pages * self._activation_dtype.itemsize
        )
        # Cumulative sequence lengths for queries and keys, int32
        cumulative_seqlens_memory_footprint = 3 * max_batch_tokens * 4
        # Write and read index tensors, one per layer group, int32
        write_index_memory_footprint = self.num_groups * max_batch_tokens * 4
        read_index_memory_footprint = self.num_groups * (num_pages + max_batch_tokens) * 4
        total_memory_footprint = sum(
            [
                cache_memory_footprint,
                activation_memory_footprint,
                inputs_outputs_positions_and_logits_memory_footprint,
                attention_memory_footprint,
                cumulative_seqlens_memory_footprint,
                write_index_memory_footprint,
                read_index_memory_footprint,
            ]
        )
        return total_memory_footprint