
    <h                        S SK r S SKrS SKrS SKJrJr  S SKJr  S SKJ	r	J
r
  S SKJr  S SKJr  S SKJrJr  S SKrS SKJr  S SKJr  S S	KJr  S
SKJr  S
SKJr  S
SKJr  S
SKJr  S
SK J!r!J"r"J#r#   " S S\5      r$\RJ                  " \&5      r'\	 " S S5      5       r(\	 " S S5      5       r)\"" 5        " S S5      5       r* " S S\5      r+\"" 5        " S S\+5      5       r,\"" 5        " S S\+5      5       r-S r.\#" SS 9S!S\R^                  4S" j5       r0\	 " S# S$5      5       r1\#S% 5       r2\"" 5        " S& S'5      5       r3\,\-S(.r4\"" 5        " S) S*5      5       r5 " S+ S,5      r6g)-    N)ABCabstractmethod)deque)	dataclassfield)Enum)partial)OptionalUnion)DecodeStream)tqdm   )PretrainedConfig)GenerationConfig)PreTrainedTokenizerFast)logging)ContinuousBatchProcessorMetricsattach_tracertracedc                   4    \ rS rSrSrSrSrSrSrSr	Sr
S	rS
rg)RequestStatus&   z5Status of a generation request through its lifecycle.pending
prefillingprefilling_splitsplit_pending_remainderdecodingfinishedfailed N)__name__
__module____qualname____firstlineno____doc__PENDING
PREFILLINGPREFILLING_SPLITSPLIT_PENDING_REMAINDERDECODINGFINISHEDFAILED__static_attributes__r        c/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/generation/continuous_batching.pyr   r   &   s*    ?GJ)7HHFr.   r   c                       \ rS rSr% Sr\\S'   \" \S9r	\\
   \S'   \" \S9r\\
   \S'   \" \S9r\\   \S'   Sr\\   \S	'   \R"                  r\\S
'   \" \R&                  S9r\\S'   \" \
S9r\\
   \S'   Srg)GenerationOutput5   a  Tracks the output of a generation request.

Attributes:
    request_id (str): The ID of the generation request.
    prompt_ids (list[int]): The IDs of the prompt tokens.
    generated_tokens (list[int]): The generated tokens.
    logprobs (list[float]): The log probabilities of the generated tokens.
    error (Optional[str]): Any error message associated with the request. When None, the request was successful.

request_iddefault_factory
prompt_idsgenerated_tokenslogprobsNerrorstatuscreated_time
next_tokenr    )r!   r"   r#   r$   r%   str__annotations__r   listr6   intr7   r8   floatr9   r
   r   r&   r:   timer;   r<   r-   r    r.   r/   r1   r1   5   s     O!$7JS	7"'"=d3i=!$7Hd5k7E8C=)11FM1		:L%: %c :J:r.   r1   c                      \ rS rSr% Sr\\S'   Sr\\	\
      \S'   Sr\\	\
      \S'   \" \	S9r\	\
   \S'   \" \	S9r\	\
   \S	'   \" \	S9r\	\
   \S
'   Sr\
\S'   \R$                  r\\S'   Sr\
\S'   Sr\
\S'   \" \R,                  S9r\\S'   Sr\\   \S'   Sr\\   \S'   S\
4S jrS\
4S jr\S\
S\4S j5       rS r S r!Sr"g)RequestStateK   zTracks the state of a generation request through its lifecycle.

Attributes:
    status (RequestStatus): can be one of PENDING, PREFILLING, PREFILLING_SPLIT,
                            SPLIT_PENDING_REMAINDER, DECODING, FINISHED, FAILED
r3   Nr6   full_prompt_idsr4   remaining_prompt_idsstatic_outputsallocated_blocksr   position_offsetr:      max_new_tokenseos_token_idr;   r9   r<   returnc                     U R                   $ )zCGet the current length of the sequence (prompt + generated tokens).)rJ   selfs    r/   current_lenRequestState.current_lenc   s    ###r.   c                 ,    [        U R                  5      $ )z*Get the number of tokens generated so far.)lenrH   rQ   s    r/   generated_lenRequestState.generated_leng   s    4&&''r.   token_idc                 `   U R                   [        R                  :w  a  gXR                  :H  =(       a    U R                  S:g  nU R	                  5       U R
                  :  nU(       a  U(       a  U R                  R                  U/5        U(       d  U(       a  [        R                  U l         gg)zUpdate the request with a newly generated token and check for completion.

Args:
    token_id: The token ID to add to the output sequence

Returns:
    bool: True if the request is now complete, False otherwise
FrM   T)	r:   r   r*   rN   rW   rL   rH   extendr+   )rR   rY   is_eos
is_max_lens       r/   update_with_tokenRequestState.update_with_tokenk   s     ;;-000...J43D3D3J'')T-@-@@
 6&&z2Z'00DKr.   c                 0   SU R                    SU R                   SU R                  5        S[        U R                  5       S[        U R
                  5       SU R                   S[        U R                  5       SU R                   S	U R                   S
3$ )NzRequestState(
	request_id=z
,
	status=z,
	out_tokens=z,
	query_length=z, 
	remaining_tokens=z, 
	kv_length=z
	full_prompt_lenght=z,
	allocated_blocks=z,
	generated_tokens=z
))
r3   r:   rW   rV   r6   rG   rJ   rF   rI   rH   rQ   s    r/   __repr__RequestState.__repr__   s9   -doo->l4;;-Wghlhzhzh|g}  ~P  QT  UY  Ud  Ud  Qe  Pf  f}  ~A  BF  B[  B[  ~\  }]  ]m  nr  nB  nB  mC  CZ  [^  _c  _s  _s  [t  Zu  uK  LP  La  La  Kb  bx  y}  yL  yL  xM  MP  Q  	Qr.   c           
          [        U R                  U R                  U R                  U R                  / U R
                  U R                  S9$ )z7Convert the request state to a GenerationOutput object.)r3   r6   r:   r7   r8   r9   r<   )r1   r3   rF   r:   rH   r9   r<   rQ   s    r/   to_generation_output!RequestState.to_generation_output   s@    ++;;!00**
 	
r.   )r:   )#r!   r"   r#   r$   r%   r=   r>   r6   r
   r?   r@   rF   r   rG   rH   rI   rJ   r   r&   r:   rL   rN   rB   r;   rA   r9   r<   rS   rW   r   boolr^   ra   rd   r-   r    r.   r/   rD   rD   K   s    O&*Jc#*+/OXd3i(/&+D&A$s)A %d ;NDI;"'"=d3i=OS)11FM1NCL#		:L%:E8C= $J$$S $(s ( # $  4Q

r.   rD   c                      \ rS rSr\R
                  SSS4S\S\S\R                  S\R                  S\
S	\\\
\\\R                  \
4   4      S
\\
   SS4S jjr\S\
S\S\\
   4S j5       r\S\SS4S j5       rS\
4S jrS\S\\
   4S jr\S\S\\
   S\\
   4S j5       r\S\R0                  S\R0                  S\
S\\R0                  \R0                  4   4S j5       rSrg)PagedAttentionCache   d   Nconfiggeneration_configdevicedtypenum_requestslayer_device_maptp_sizerO   c                 "   [        USS5      c  UR                  OUR                  U l        U R                  nUb   US:  a  X-  S:w  a  [        SU SU S35      e[	        US5      (       a  UR
                  OUR                  UR                  -  U l        UR                  U l        [        US	S
5      n	[        USS5      n
[        USS5      n[        USS5      nU	b  Uc:  [        UR                  U
U R
                  U R                  U R                  UUU	S9u  p[        R                  SU	 SU
 SU 35        Xl        Xl        Xl        XU R                  U R
                  4U l        X@l        X0l        / U l        / U l        [)        UR                  5       H  nUb  Xm   OUn[*        R,                  " U R                  U R                   US9n[*        R,                  " U R                  U R                   US9n[*        R.                  R1                  U5        [*        R.                  R1                  U5        U R$                  R3                  U5        U R&                  R3                  U5        M     [5        [)        U	5      5      U l        0 U l        g)a  Initialize a paged attention cache for efficient memory usage.

Args:
    config: Model configuration
    generation_config: Generation configuration containing cache parameters
    device: Device for the cache tensors
    dtype: Data type for the cache tensors
    layer_device_map: Optional mapping of layer indices to devices
    initial_prompt_shapes: Optional sample prompts to help calculate optimal cache size
num_key_value_headsN   r   zNumber of key value heads z+ must be divisible by tensor parallel size .head_dim
num_blocksi   
block_size    
max_memory?max_batch_tokens   )rx   rv   
num_layers	num_headsmax_memory_percentrn   rw   zUsing calculated num_blocks=z, block_size=z, max concurrent requests rn   rm   )getattrnum_attention_headsrs   
ValueErrorhasattrrv   hidden_sizenum_hidden_layerscompute_optimal_blocksrL   loggerwarningr|   rx   rw   cache_shapern   rm   	key_cachevalue_cacherangetorchzeros_dynamomark_static_addressappendr   _free_blocks_block_tables)rR   rk   rl   rm   rn   ro   rp   rq   rs   rw   rx   r   r|   idxlayer_devicenew_layer_key_cachenew_layer_value_caches                    r/   __init__PagedAttentionCache.__init__   sy   . v4d;C &&++ 	 
 #667Q;",1 01D0EEpqxpyyz{   'vz::FOO@R@RV\VpVp@p 	 "(!9!9 .dC
.bA
$%6cJ"#46H#N!1!9+A!00%1122#5%	,(J 	*:,mJ<Oijzi{|	
 !1$$/T__dmm\
-//1112C4D4P+0V\L"'++d.>.>djjYe"f$)KK0@0@

[g$h! MM--.ABMM--.CDNN!!"56##$9: 3 "%
"3435r.   n_blocksr3   c                 ,   [        U R                  5      U:  a  g/ n[        U5       H,  nUR                  U R                  R	                  5       5        M.     X R
                  ;  a  / U R
                  U'   U R
                  U   R                  U5        U$ )z*Allocates n_blocks for a given request_id.F)rV   r   r   r   popleftr   r[   )rR   r   r3   	allocated_s        r/   allocate_blocks#PagedAttentionCache.allocate_blocks   s     t  !H,	xAT..6689 ! ///-/Dz*:&--i8r.   c                     XR                   ;   a7  U R                   R                  U5      nU R                  R                  U5        g[        R                  SU 35        g)z.Frees all blocks associated with a request_id.z6Attempted to free blocks for non-existent request_id: N)r   popr   r[   r   info)rR   r3   blocks_to_frees      r/   free_blocksPagedAttentionCache.free_blocks   sP     +++!//33J?N$$^4KKPQ[P\]^r.   c                 ,    [        U R                  5      $ )z,Returns the number of free blocks available.)rV   r   rQ   s    r/   get_num_free_blocks'PagedAttentionCache.get_num_free_blocks  s    4$$%%r.   c                 :    U R                   R                  U/ 5      $ )z&Returns the block table for a request.)r   getrR   r3   s     r/   get_block_table#PagedAttentionCache.get_block_table  s    !!%%j"55r.   statelogical_indicesc           	      :   UR                   nU R                  R                  U5      nU(       d  [        SU 35      eU R                  n/ nU HJ  nXu-  nXu-  n	U[        U5      :  a  [        SU SU SU 35      eXH   n
X-  U	-   nUR                  U5        ML     U$ )ax  
Maps logical sequence indices to physical cache indices using the block table, using PyTorch.

Args:
    request_id: The request ID.
    logical_indices: A list of logical indices.

Returns:
    A list of physical indices.

Raises:
    ValueError: If no block table is found for the request ID.
    IndexError: If a logical index maps to a block index that is out of bounds.
z!No block table found for request zLogical index z maps to block index z$ which is out of bounds for request )r3   r   r   r   rx   rV   
IndexErrorr   )rR   r   r   r3   block_tablerx   physical_indicesr   	block_idxblock_offsetphysical_block_numphysical_indexs               r/   _get_physical_indices)PagedAttentionCache._get_physical_indices  s      %%
((,,Z8@MNN__
"C)I+LC,, $SE)>yk J##-,0 
 "-!7/<|KN##N3 #  r.   
key_statesvalue_states	layer_idxc                 p   U R                   U R                  -  nU R                  U   R                  U R                  XpR
                  5      nU R                  U   R                  U R                  XpR
                  5      n	US   US S 2US S 24'   US   U	S S 2US S 24'   US S S 2US S 24   U	S S S 2US S 24   4$ )Nr   )rw   rx   r   viewrs   rv   r   )
rR   r   r   r   
read_indexwrite_indexkwargstotal_slotsk_cache_flatv_cache_flats
             r/   updatePagedAttentionCache.update4  s     oo7~~i055d6N6NP[]j]jk''	2778P8PR]_l_lm*4Q-QQ&'*6q/QQ&'D!Z23\$:WXBX5YYYr.   )r   r   rx   r   rm   rn   rv   r   r|   rw   r   rs   r   )r!   r"   r#   r$   r   float16r   r   rm   rn   r@   r
   dictr   r=   r   r   r?   r   r   r   r   rD   r   Tensortupler   r-   r    r.   r/   rh   rh      s    #]]OS!%R6 R6 ,R6 	R6
 {{R6 R6 #4U3c3I-J(J#KLR6 #R6 
R6h   c   _c _d _ _&S &6# 6$s) 6 % < % $s) % X\]`Xa %  % N ZLLZ llZ 	Z 
u||U\\)	*Z Zr.   rh   c                       \ rS rSrSrSS\S\4S jjr\S\	4S j5       r
\S\S	\\	   4S
 j5       r\S	\4S j5       r\SS\S\4S jj5       r\S\S	\\   4S j5       rSrg)	ScheduleriG  z
Abstract base class for scheduling requests in the continuous batch processor.
It is expected that cache allocation and scheduling logic will be implemented in subclasses.
cacheretain_cache_on_finishc                 V    0 U l         0 U l        [        5       U l        Xl        X l        g N)active_requestswaiting_requestsr   waiting_requests_orderr   r   )rR   r   r   s      r/   r   Scheduler.__init__M  s'    8:9;27'#
&<#r.   r   c                     gz"Add a request to the waiting list.Nr    )rR   r   s     r/   add_waiting_requestScheduler.add_waiting_requestT       	r.   token_budgetrO   c                     g r   r    )rR   r   s     r/   schedule_batchScheduler.schedule_batchY  s    r.   c                 d    [        U R                  5      =(       d    [        U R                  5      $ )z2Check if there are requests ready to be processed.)rV   r   r   rQ   s    r/   has_pending_requestsScheduler.has_pending_requests]  s%     4''(FC0E0E,FFr.   r3   evict_from_cachec                     g)z:Finish processing a request and free its allocated blocks.Nr    rR   r3   r   s      r/   finish_requestScheduler.finish_requestb  r   r.   c                 V    XR                   ;   a  U R                   U   R                  $ / $ r   )r   rH   r   s     r/   !get_active_request_static_outputs+Scheduler.get_active_request_static_outputsg  s*    ---''
3BBB	r.   )r   r   r   r   r   NFT)r!   r"   r#   r$   r%   rh   rf   r   r   rD   r   r@   r?   r   r   r   r=   r   r   r-   r    r.   r/   r   r   G  s    
=1 =4 =    3 43E   Gd G G     C DI  r.   r   c                       \ rS rSr\S\S\4S j5       r\" SS9S\S\S\\	   4S	 j5       r
\S\4S
 j5       r\S\S\\   4S j5       r\SS\	S\4S jj5       rSrg)FIFOSchedulerin  r   len_next_tokensc                    UR                  5       n[        UR                  5      U R                  R                  -  U-
  nXB:  d  [        UR                  5      S:X  ai  X$-
  S-   U R                  R                  -  S-   nU R                  R                  XQR                  5      nU(       d  gUR                  R                  U5        gNr   rt   FTrS   rV   rI   r   rx   r   r3   r[   rR   r   r   rS   	occupancyblocks_neededr   s          r/   _allocate_blocks_if_needed(FIFOScheduler._allocate_blocks_if_neededp       '')../$**2G2GG+U	&3u/E/E+F!+K-9A=$**BWBWW[\\M

22=BRBRSI"")))4r.   prepare_request	span_namer   "request_ids_to_remove_from_waitingc                 P   UR                   [        R                  :X  a  UR                  OUR                  n[        U5      U:  a  UR                   [        R                  :X  aI  XR                  UR                  '   [        R                  Ul         UR                  UR                  5        gUR                   [        R                  :X  a.  [        R                  Ul         UR                  Ul        / Ul        ggUR                   [        R                  :X  aI  XR                  UR                  '   [        R                  Ul         UR                  UR                  5        O3UR                   [        R                  :X  a  [        R                  Ul         XBS Ul        USU Ul        gz6Prepare a request for processing in the current batch.Nr:   r   r)   rG   r6   rV   r&   r   r3   r'   addr(   rR   r   r   r   request_tokenss        r/   _prepare_request_for_processing-FIFOScheduler._prepare_request_for_processing~  A    +0,,-:_:_*_E&&ejeueu 	 ~-||}4449>$$U%5%56,77266u7G7GH!F!FF,77#(#=#= -/* G ||}4449>$$U%5%56,==266u7G7GH!F!FF,==)7)FE&-m|<Er.   c                    U R                   (       a  UR                  U R                  ;   an  U R                  R                  UR                  5      nUR                  [        UR                  5      S Ul        UR                  Ul        UR                  Ul        XR                  UR                  '   U R                  R                  UR                  5        gr   r   r3   r   r   r6   rV   rF   rI   rJ   r   r   r   rR   r   	old_states      r/   r   !FIFOScheduler.add_waiting_request       &&5+;+;t?S?S+S,,001A1ABI$//I4M4M0N0PQE%.%?%?E"$-$=$=E!27e../##**5+;+;<r.   rO   c                   ^ ^
^ / n/ n/ mT R                   R                  5        Hc  nUR                  [        R                  :X  a  UR                  U5        UR                  [        R                  :X  d  MR  UR                  U5        Me     T R                   H!  nUR                  T R                  U   5        M#     X#-   n[        5       m
U H  nT R                  XAT
5        [        UR                  5      nT R                  U[        UR                  5      5      (       d'  [        T R                  R                  5      S:X  a    OQM{  [         S["        4U4S jj5       nU" U5        X-  n[         S["        4U
U 4S jj5       n	U	" U5        US:X  d  M    O   [%        T R                   Vs/ sH  oUT
;  d  M
  UPM     sn5      T l        T$ s  snf )Nr   r   c                 (   > TR                  U 5        g r   r   r   scheduled_requestss    r/   _add_to_scheduled_requests@FIFOScheduler.schedule_batch.<locals>._add_to_scheduled_requests      "))%0r.   c                 |   > U R                   nUTR                  ;   a  TR                  U	 TR                  U5        g g r   r3   r   r  r   req_idr   rR   s     r/   _remove_from_waiting_requestsCFIFOScheduler.schedule_batch.<locals>._remove_from_waiting_requests  =    ))T222--f56::6B 3r.   )r   valuesr:   r   r*   r   r)   r   r   setr  rV   r6   r   r   r   r   rD   r   rR   r   priority_statessecond_priority_statesr   r  
candidatesrequest_lenr  r  r   r  s   `         @@r/   r   FIFOScheduler.schedule_batch  s   .057))002E||}555&&u-||}DDD&--e4	 3 11F"))$*?*?*GH 2 %=
-0U*E00Fhie../K22s5++,  tzz../141, 1 1 'u-'LC\ C C *%0q 9  < ',"&"="=r"=OqAqV"=r'
# "! ss   )G	5G	r3   r   c                     U(       a9  U R                   R                  U5        XR                  ;   a  U R                  U	 g g g r   r   r   r   r   s      r/   r   FIFOScheduler.finish_request  ;    JJ"":.111((4 2 r.   r   Nr   r!   r"   r#   r$   r   rD   r@   r   r  r=   r  r   r?   r   rf   r   r-   r    r.   r/   r   r   n       s   '(=!=14=Z]^aZb= )=8 = = = 4"3 4"43E 4" 4"l 5 5 5 5r.   r   c                       \ rS rSr\S\S\4S j5       r\" SS9S\S\S\\	   4S	 j5       r
\S\4S
 j5       r\S\S\\   4S j5       r\SS\	S\4S jj5       rSrg)PrefillFirstScheduleri  r   r   c                    UR                  5       n[        UR                  5      U R                  R                  -  U-
  nXB:  d  [        UR                  5      S:X  ai  X$-
  S-   U R                  R                  -  S-   nU R                  R                  XQR                  5      nU(       d  gUR                  R                  U5        gr   r   r   s          r/   r   0PrefillFirstScheduler._allocate_blocks_if_needed  r   r.   r   r   r   r   c                 P   UR                   [        R                  :X  a  UR                  OUR                  n[        U5      U:  a  UR                   [        R                  :X  aI  XR                  UR                  '   [        R                  Ul         UR                  UR                  5        gUR                   [        R                  :X  a.  [        R                  Ul         UR                  Ul        / Ul        ggUR                   [        R                  :X  aI  XR                  UR                  '   [        R                  Ul         UR                  UR                  5        O3UR                   [        R                  :X  a  [        R                  Ul         XBS Ul        USU Ul        gr   r   r  s        r/   r  5PrefillFirstScheduler._prepare_request_for_processing  r  r.   c                    U R                   (       a  UR                  U R                  ;   an  U R                  R                  UR                  5      nUR                  [        UR                  5      S Ul        UR                  Ul        UR                  Ul        XR                  UR                  '   U R                  R                  UR                  5        gr   r  r	  s      r/   r   )PrefillFirstScheduler.add_waiting_request  r  r.   rO   c                   ^ ^
^ / n/ n/ mT R                   R                  5        He  nUR                  [        R                  :X  a  UR                  U5        M4  UR                  [        R                  :X  d  MT  UR                  U5        Mg     T R                   H!  nUR                  T R                  U   5        M#     X#-   n[        5       m
U H  nT R                  XAT
5        [        UR                  5      nT R                  U[        UR                  5      5      (       d'  [        T R                  R                  5      S:X  a    OQM{  [         S["        4U4S jj5       nU" U5        X-  n[         S["        4U
U 4S jj5       n	U	" U5        US:X  d  M    O   [%        T R                   Vs/ sH  oUT
;  d  M
  UPM     sn5      T l        T$ s  snf )Nr   r   c                 (   > TR                  U 5        g r   r  r  s    r/   r  HPrefillFirstScheduler.schedule_batch.<locals>._add_to_scheduled_requests:  r  r.   c                 |   > U R                   nUTR                  ;   a  TR                  U	 TR                  U5        g g r   r  r  s     r/   r  KPrefillFirstScheduler.schedule_batch.<locals>._remove_from_waiting_requestsB  r  r.   )r   r  r:   r   r)   r   r*   r   r   r  r  rV   r6   r   r   r   r   rD   r   r  s   `         @@r/   r   $PrefillFirstScheduler.schedule_batch  s   .057))002E||}DDD&&u-!7!77&--e4	 3 11F"))$*?*?*GH 2 %=
-0U*E00Fhie../K22s5++,  tzz../141, 1 1 'u-'LC\ C C *%0q 9  < ',"&"="=r"=OqAqV"=r'
# "! ss   +G7Gr3   r   c                     U(       a9  U R                   R                  U5        XR                  ;   a  U R                  U	 g g g r   r%  r   s      r/   r   $PrefillFirstScheduler.finish_requestT  r'  r.   r(  Nr   r)  r    r.   r/   r,  r,    r*  r.   r,  c                     [         R                  R                  5       (       a~  [         R                  " S5      n [         R                  R	                  U 5      R
                  n[         R                  R                  U 5      n[         R                  R                  U 5      nO[         R                  R                  R                  5       (       a  [         R                  R                  R                  5       (       aX  [         R                  " S5      n [         R                  R                  5       nU[         R                  R                  5       -
  nSnO[         R                  " S5      n S nSnSnXX#4$ )Ncudampsr   cpu)r   r<  is_availablerm   get_device_propertiestotal_memorymemory_reservedmemory_allocatedbackendsr=  is_builtdriver_allocated_memoryrecommended_max_memory)rm   rA  reserved_memoryallocated_memorys       r/   get_device_and_memoryrJ  \  s    zz  f%zz77?LL**44V< ::66v>				(	(	*	*u~~/A/A/J/J/L/Le$yy88:'%))*J*J*LL e$BBr.   T
standaloner{   c                 p   [        5       u  pp[        U	[        X5      -
  U-  5      n[        R                  " / US9R                  5       nSU-  U-  U-  U-  nUb  Xn-  n[        XU -  U -  S-  -  5      nUS::  a  [        R                  S5        Sn[        SU5      n[        UU -  U-  S-   S5      nUU4$ )	N)rn   r      r   z0you are trying to generate a bit too many tokensry   @   rt   )	rJ  r@   maxr   tensorelement_sizer   r   min)max_num_tokensrx   rv   r   r~   r   rw   rn   rm   totalreservedr   available_memory
dtype_sizebytes_per_token max_possible_concurrent_requestsmax_concurrent_tokensoptimal_num_blockss                     r/   r   r   t  s     *?)@&F8EC	$<<@RRSb.;;=J)mh.;jHO+5+G('*~=NRSST($ (1,IJ+-($DE4~E*TXYY[]^444r.   c                   L   \ rS rSr% \R
                  \S'   \R
                  \S'   \R
                  \S'   \R
                  \S'   \R
                  \S'   \\S'   \\S'   \R
                  \S	'   \R
                  \S
'   \R
                  \S'   \\	\
\   4   \S'   \\S'   Sr\\S'   Srg)PagedAttentionArgsi  	input_idsattention_maskposition_idscumulative_seqlens_qcumulative_seqlens_kmax_seqlen_qmax_seqlen_kr   r   logits_indicesblock_tablesr   F	use_cacher    N)r!   r"   r#   r$   r   r   r>   r@   r   r=   r?   rh   rh  rf   r-   r    r.   r/   r^  r^    s    ||LL ,,,,&,,&LL sDI~&&Itr.   r^  c                    U SS  U S S :  nUSS  US S :  n[        UR                  5       UR                  5       5      nU S US-    n US US-    nU S   nUS   n[        R                  " XPR                  S9n[        R                  " XaR                  S9n[        R
                  " XpSS  SS9n	[        R
                  " XSS  SS9n
U	S S 2S 4   U
S S S 24   :H  nU SS  U S S -
  S:H  ) U SS  -  n[        R
                  " X|SS9S S 2S 4   U
:H  n[        R                  " [        R                  " XVU	R                  S9SS9R                  5       nUR                  X-  S5        U$ )Nrt   rM   rm   T)rightFdiagonal)
rS  sumr   arangerm   	bucketizetriuonesrf   masked_fill_)rb  rc  valid_docs_qvalid_docs_knum_valid_docstotal_qtotal_k	q_indices	k_indices	q_doc_ids	k_doc_idsdoc_mask	is_causalapply_causalcausal_masks                  r/   create_document_maskr    s    (+.B3B.GGL'+.B3B.GGL))+\-=-=-?@N 00D.12DE/0D.12DE"2&G"2&GW-H-HIIW-H-HII	+C4PI	+C5QID!YtQw%77H 'qr*-A#2-FF!KLOcdedfOggI??9tDQWMQZZL**UZZAQAQR]^_ddfK<5>Or.   c                      \ rS rSr  S#S\S\S\S\R                  S\R                  S\	R                  S\R                  S	\R                  S
\S\S\4S jjr\" SS9S 5       r\\R&                  " 5       S 5       5       rS\4S jrS r\S 5       r\S\4S j5       r\S 5       r\S 5       r\S 5       r\S\S\4S j5       r\S 5       r \S\4S j5       r!\S 5       r"\S  5       r#S!r$g")$ContinuousBatchProcessori  r   rk   rl   input_queueoutput_queue
stop_eventmodel_devicemodel_dtype	scheduler	streamingmanual_evictionc                    Xl         X l        X0l        X@l        XPl        X`l        Xpl        Xl        Xl        Xl	        Xl
        / U l        UR                  U l        [        UR                  5      U l        U R                  5         [         R"                  " U R                  R$                  5      U l        [)        SS9U l        g)a  Initialize the continuous batch processor.

Args:
    cache: The paged attention cache to use
    generation_config: The generation configuration
    input_queue: Queue for incoming requests
    output_queue: Queue for outgoing results
    stop_event: Event to signal processing should stop
    model_device: Device for model inputs/outputs
    model_dtype: Data type for model inputs/outputs
    streaming: Whether to stream tokens as they're generated
Tskip_special_tokensN)r   rk   rl   r  r  r  r  r  r  r  r  requests_in_batchr|   r   metricssetup_static_tensorsr   from_pretrained_name_or_path	tokenizerr   decode_stream)rR   r   rk   rl   r  r  r  r  r  r  r  r  s               r/   r   !ContinuousBatchProcessor.__init__  s    4 
!2&($(&"".57 !& 6 66u7M7MN!!#0@@AZAZ[)dCr.   TrK  c                 "   U R                   nU R                  R                  U R                  R                  -  n[        R
                  U R                  S.nX0l        [        R                  " SU440 UD6U l	        [        R                  " SU440 UD6U l
        [        R                  " SSX4U R                  U R                  S9U l        [        R                  " US-   440 UD6U l        [        R                  " US-   440 UD6U l        [        R                  " U440 UD6U l        [        R                  " U440 UD6U l        [        R"                  " U4S40 UD6U l        SU l        SU l        [        R"                  " SU4S40 UD6U l        g )Nr   rt   rM   r   )r|   r   rw   rx   r   int32r  tensor_metadatar   r_  ra  r  r`  rb  rc  r   r   fullrf  rd  re  
output_ids)rR   Tmax_token_budgetr  s       r/   r  -ContinuousBatchProcessor.setup_static_tensors  sT   !!::004::3H3HH$)KK4;L;LM.aV??!KKAB/B#kk1't/?/?HYHY
 %*KKQ$LO$L!$)KKQ$LO$L! ;;t??++'7&9M_M#jj!rE_E**aVRC?Cr.   c                 L   U R                   R                  5         U R                  R                  5         U R                  R	                  [
        R                  " U R                  5      R                  5        U R                  R                  5         U R                  R                  5         U R                  R	                  S5        U R                  R	                  S5        U R                  R	                  S5        SU l        SU l        U R                   R                  5         g)z(Reset static tensors for the next batch.rM   r   N)r_  zero_ra  r`  fill_r   finfor  rS  rb  rc  r   r   rf  rd  re  r  rQ   s    r/   reset_static_tensors-ContinuousBatchProcessor.reset_static_tensors  s     	!!!%++d.>.>"?"C"CD!!'')!!'')r"b!!!"%r.   rO   c                 &   U R                   U R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  U R                  U R                  R                  U R                  SS.$ )z2Get model keyword arguments for the current batch.F)r_  ra  r`  rb  rc  r   r   rf  rd  re  rg  r   rh  )r_  ra  r`  rb  rc  r   r   rf  rd  re  r   r   rQ   s    r/   get_model_kwargs)ContinuousBatchProcessor.get_model_kwargs  s{      --"11$($=$=$($=$=++//"11 -- -- JJ44ZZ
 	
r.   c           	          SU R                    SU R                   SU R                  R                   SU R                  R                   S3	U R                  5       R                  5       -   $ )Nz%ContinuousBatchProcessor(input_queue=z, output_queue=z, active_requests=z, waiting_requests=))r  r  r  r   r   r  ra   rQ   s    r/   ra   !ContinuousBatchProcessor.__repr__.  s    3D4D4D3E_UYUfUfTggyz~  {I  {I  {Y  {Y  zZ  Zm  nr  n|  n|  nM  nM  mN  NO  P##%..01	
r.   c                    U R                   R                  5       (       d]   U R                   R                  5       nUc  M?  U R                  R	                  U5        U R                   R                  5       (       d  M\  gg! [
        R                   a     g[         aN  n[        R                  SU 3SS9  [        5       R                  S5      nUb  U R                  X!5         SnANSnAff = f)z?Pull new requests from the input queue and add to waiting list.NzError processing new request: Texc_infor   )r  empty
get_nowaitr  r   queueEmpty	Exceptionr   r9   localsr   _handle_request_error)rR   r   es      r/   _get_new_requests*ContinuousBatchProcessor._get_new_requests4  s     ""((**9((335=2259 ""((** ;;  9=aSADQ&,hll7&;$..q8	9s$   A>  A> >C+	C+AC&&C+r   c                    [         R                  Ul        [        U5      Ul        [        UR                  [        5      (       a+  U R                  R                  UR                  5      Ul	        O/ Ul	        U R                  R                  UR                  UR                  5        U R                  R                  UR                  5       5        g)z(Handle general request processing error.N)r   r,   r:   r=   r9   
isinstancer3   r  r   rH   r  record_request_completionr;   r  putrd   )rR   r9   r   s      r/   r  .ContinuousBatchProcessor._handle_request_errorF  s     %++%j e&&,,#'>>#S#STYTdTd#eE #%E ..u/A/A5CSCSTe88:;r.   c                 p   U R                  5         U R                  R                  5       (       d  gU R                  R	                  [        U R                  R                  5      [        U R                  R                  5      5        U R                  R                  U R                  5      U l
        U R                  (       d  gU R                  5         / n/ n/ n/ nS/nS/n/ nU R                  R                  U R                  5        U R                   GHP  nUR                  n	UR                  U	5        UR                  n
[        U	5      nX-   n[!        [#        U5      5      nXS nU R$                  R'                  X5      nX* S nUR                  U5        UR                  U5        UR                  U5        UR)                  US   U-   5        UR)                  US   U-   5        [        UR*                  5      S:X  a  UR)                  US   S-
  5        [-        U R.                  U5      U l        [-        U R0                  U5      U l        U=R                  U-  sl        GMS     [2        R5                  S[        U R                  5       S[        U R                  R                  5       S[        U R                  R                  5       SUS    S	US    S
U R$                  R7                  5        35        U R9                  UUUUUUU5        U R                  R;                  U R$                  5        g)z=Prepare tensors and metadata for the next model forward pass.Nr   rM   rt   zScheduled: z, Waiting: z
, Active: z	. cum Q: z
. cum KV: z, free blocks: )r  r  r   r  record_queue_metricsrV   r   r   r   r|   r  r  record_batch_metricsr6   r[   rJ   r?   r   r   r   r   rG   rP  rd  re  r   r   r   _build_tensorsrecord_kv_cache_memory_metrics)rR   ra  r_  r   r   rb  rc  rf  r   next_input_idspast_lengthquery_length
key_lengthcache_indexpositions_to_addread_indiceswrite_indicess                    r/   prepare_next_batch+ContinuousBatchProcessor.prepare_next_batchU  sE    	 ~~2244))#dnn.L.L*MsSWSaSaSrSrOst!%!>!>t?T?T!U%% 	!!#	
 !s !s))$*@*@A++E"--N^,//K~.L%3JuZ01K*<8::;;EOL(8M 01l+}- ''(<R(@<(OP ''(<R(@:(MN5--.!3%%&:2&>&BC #D$5$5| DD #D$5$5z BD!!\1!!+ ,. 	#d4456k#dnnFeFeBfAggqruvz  wE  wE  wU  wU  sV  rW  W`  au  vx  ay  `z  zD  EY  Z\  E]  D^  ^m  nr  nx  nx  nL  nL  nN  mO  P	
 	  	
 	33DJJ?r.   c           
      ~   [        [        R                  40 U R                  D6nU" U5      U R                  S S 2S [        U5      24'   U" U5      U R                  S S 2S [        U5      24'   U" U5      U R                  S [        U5      & U" U5      U R                  S [        U5      & U" U5      U R                  S [        U5      & U" U5      U R                  S [        U5      & U" U5      U R                  S [        U5      & [        R                  " U R                  5      R                  n	U R                  R                   S:w  a  [#        [        U5      S-
  5       H  n
XZS-      XZ   -
  XjS-      Xj   -
  :  a/  XZS-      XZ   -
  S:  a  XjS-      XZS-      XZ   -
  -
  S-   nXU
   -
  nOSn[%        XZ   XZS-      5      n[%        Xj   XjS-      5      n[        R&                  " [        R(                  " U R*                  SX4   R,                  U	U R                  U R.                  S9US9nXR*                  SX4'   M     g g )Npaged_attentionrt   .r   rl  )r	   r   rQ  r  r_  rV   ra  r   r   rb  rc  rf  r  r  rS  rk   _attn_implementationr   slicerq  r  r`  shaper  )rR   r_  ra  r   r   rb  rc  rf  	to_tensor	min_valueirm  query_range	key_rangemasks                  r/   r  'ContinuousBatchProcessor._build_tensors  sb    ELLAD,@,@A	.7	.Bq*C	N**+4=l4K!0s<0001/8/E+3{+,-6z-B)#j/*AJK_A`!!"=C(<$=>AJK_A`!!"=C(<$=>5>~5N1c.12KK 0 0155	;;++/@@334q89(Q/2F2II*q514H4KKL,U36J6MMQRR -U37KPQE7RUiUl7lmpqq   (q*AAH H#$8$;=QVWRW=XY!"6"9;OTUPU;VW	zzJJ++C,GHNN!"..#00	 & DH##C$?@1 : Ar.   c                     U R                   b    U R                   R                  5       S   nU$ SS/nU$ ! [         a    SS/n U$ f = f)Nr   rt   )r  tolistr  )rR   outs     r/   _syncContinuousBatchProcessor._sync  s`    ??&oo,,.q1
 
 a&C
	  !f 
	s   4 AAtokenc                 z   U R                   (       ab  U R                  R                  U R                  UR                  S   5      Ul        U R                  R                  UR                  5       5        gUR                  [        R                  :X  a*  U R                  R                  UR                  5       5        gg)zCSend output to the queue based on streaming mode and request state.rM   N)r  r  stepr  rH   r<   r  r  rd   r:   r   r+   )rR   r   r  s      r/   _maybe_send_output+ContinuousBatchProcessor._maybe_send_output  s     >>#1166t~~uG[G[\^G_`E!!%"<"<">?\\]333!!%"<"<">? 4r.   c                 h   U R                  5       n/ n[        U R                  5       GH]  u  p4UR                  n[	        UR
                  5      S:X  a  U R                  R                  UR                  UR                  5        [        R                  Ul        XR                  U      nU/Ul        UR                  U5      (       at  U R                  R                  UR                  UR                  5        U R                   R#                  UR                  U R$                  (       + S9  UR'                  U5        U R)                  XF5        GM'  UR                  [        R*                  :X  d  GMH  [        R,                  Ul        GM`     U R.                  R1                  5       S:X  a  [3        S5      eg)z0Update request states based on generated tokens.r   )r   zNo more free blocksN)r  	enumerater  r3   rV   rG   r  record_ttft_metricr;   r   r*   r:   rf  r6   r^   r  r  r   r  r   r  r(   r)   r   r   r   )rR   
out_tokensfinished_request_idsr  r   r  r  s          r/   update_batch%ContinuousBatchProcessor.update_batch  sO    ZZ\
!!$"8"89HA%%F5--.!3//0B0BEDTDTU,55"#6#6q#9:$)7 **511LL::5;M;MuO_O_`NN11%2B2BZ^ZnZnVn1p(//7''5!?!??,DD : ::))+q0233 1r.   c                 6    U R                   R                  5       $ )z2Check if there are any active or waiting requests.)r  r   rQ   s    r/   r   -ContinuousBatchProcessor.has_pending_requests  s     ~~2244r.   c                     U R                   nU H9  nU R                  X5        U R                  R                  UR                  5        M;     g)z&Handle errors during batch processing.N)r  r  r  r   r3   )rR   r9   failed_reqsreqs       r/   handle_batch_error+ContinuousBatchProcessor.handle_batch_error  s=     ,,C&&u2NN))#..9 r.   c                    [        U R                  R                  R                  5       5      nU H9  nU R	                  X5        U R                  R                  UR                  5        M;     [        U R                  R                  R                  5       5       H9  nU R                  R                  R                  U5      nU R	                  X5        M;     U R                  R                  R                  5         g)zlFail all active requests with the given error.

Args:
    error: The error to report in the failure message
N)r?   r  r   r  r  r   r3   r   keysr   r   clear)rR   r9   requestsr   r  s        r/   fail_all_requests*ContinuousBatchProcessor.fail_all_requests  s     66==?@E&&u4NN))%*:*:; 
 4>>::??ABFNN3377?E&&u4 C
 	--335r.   )r`  r   rk   rc  rb  r  rl   r_  r  rf  r  r|   re  rd  r  r  r  r  r  ra  r   r  r  r  r  r  r  r   N)FF)%r!   r"   r#   r$   rh   r   r   r  Queue	threadingEventr   rm   rn   r   rf   r   r   r  no_gradr  r^  r  ra   r  rD   r  r  r  r  r@   r  r  r   r  r  r-   r    r.   r/   r  r    s      %/D"/D !/D ,	/D
 [[/D kk/D OO/D ll/D [[/D /D /D /Db tD D& 
]]_    
"4 
&
 9 9" <, < < <@ <@| ,H ,H\   @ @S @ @ 4 4* 5d 5 5 : : 6 6r.   r  )fifoprefill_firstc            	          \ rS rSrSr   S*S\S\S\4S jjr\S 5       r	S r
S+S
\S\\   4S jjrS,S\\   4S jjr S-S\\   S\\   S\\   S\4S jjrS\\\      4S jrS,S\\   4S jjrS r\S 5       r\S\4S j5       r\" SS9S 5       r\" SS9S 5       r\" SS9S\4S  j5       rS! r\" S"S9S.S\S#\4S$ jj5       r\" S%S9S& 5       r\S\\   4S' j5       r \S\4S( j5       r!S)r"g	)/ContinuousBatchingManageri  zManager for handling continuous batching of generation requests.

This class provides the user interface for submitting generation requests,
retrieving results, and managing the background generation thread.
rl   r  r  c                    UR                  5       U l        Uc  UR                  OUnX l        [        R                  " US9U l        [        R                  " 5       U l        [        R                  " 5       U l	        XPl
        [        USS5      U l        SU l        SU l        [        R                  " 5       U l        SU R                  R                  l        [        USS5      U l        U R                  R'                  U5      U l        [        USS5      U l        [        US	S5      U l        X0l        SU l        [3        SS
9U l        g)a#  Initialize the continuous batching manager.

Args:
    model: The language model for generation
    generation_config: Configuration for generation parameters
    max_queue_size: Maximum size of the request queue (0 = unlimited)
    streaming: Whether to stream tokens as they are generated
N)maxsizelog_prob_generationFr   	do_sampleTuse_cuda_graphprofiler  )evalmodelrl   r  r  r  r  r  r  r  r  r   r   _generation_thread_request_counterLock_request_locktop_pr  _get_logits_processorlogit_processorr  r  r  batch_processorr   r  )rR   r  rl   r  max_queue_sizer  s         r/   r   "ContinuousBatchingManager.__init__  s
     ZZ\
7H7PE33Vg!2 ;;~>!KKM#//+"#*+<>SUZ#[ "& !&^^--1

$$* !2KF#zz??@QR%&79I4P0)UC.CG)dCr.   c                 `   U R                   b5  U R                   R                  5       (       a  [        R                  S5        g[        R
                  " 5       U l        [        R                  " U R                  S9U l         U R                   R                  5         [        R                  S5        g)z'Start the background generation thread.Nz"Manager thread is already running.)targetz$Continuous batching manager started.)r  is_aliver   r   r  r  _result_queuer  Thread_run_generation_loopstartr   rQ   s    r/   r  ContinuousBatchingManager.startC  sz     "".43J3J3S3S3U3UNN?@"[[]"+"2"2$:S:S"T%%':;r.   c                 `    U R                   SL=(       a    U R                   R                  5       $ )z5Check if the background generation thread is running.N)r  r  rQ   s    r/   
is_running$ContinuousBatchingManager.is_runningO  s'    &&d2Yt7N7N7W7W7YYr.   Nblocktimeoutc                    U R                   c  [        R                  S5        gU R                  R	                  5       (       d/  U R                  R                  5         [        R                  S5        U(       a  U R                  U5        gg)zSignal the background thread to stop.

Args:
    block: Whether to wait for the thread to stop
    timeout: Maximum time to wait for the thread to stop
NzManager not started.z'Stopping continuous batching manager...)r  r   r   r  is_setr  r   join)rR   r  r  s      r/   stopContinuousBatchingManager.stopS  sd     ""*NN12%%''OO!KKABIIg r.   c                     U R                   bk  U R                   R                  US9  U R                   R                  5       (       a  [        R	                  S5        g[        R                  S5        SU l         gg)zjWait for the background thread to finish.

Args:
    timeout: Maximum time to wait for the thread to stop
Nr  z2Generation thread did not exit after join timeout.z$Continuous Batching Manager stopped.)r  r  r  r   r   r   )rR   r  s     r/   r  ContinuousBatchingManager.joine  sc     "".##(((9&&//11STBC*.' /r.   r_  r3   rL   rO   c                    Uc9  U R                      SU R                   3nU =R                  S-  sl        SSS5        Uc  U R                  R                  OUn[	        U[        U5      [        U5      UU R                  R                  S9nU R                  R                  USSS9  [        R                  SU S	35        U$ ! , (       d  f       N= f)
zAdd a new generation request to the queue.

Args:
    input_ids: Input token IDs to use as prompt
    request_id: Optional custom request ID (auto-generated if None)
    **kwargs: Additional generation parameters

Returns:
    str: The request ID
Nreq_rt   )r3   r6   rF   rL   rN   T
   r  r  zAdded request z
 to queue.)r	  r  rl   rL   rD   r?   rN   r  r  r   debug)rR   r_  r3   rL   r   s        r/   add_request%ContinuousBatchingManager.add_requests  s     ###D$9$9#:;
%%*% $ CQBX//>>^l!I O)//<<
 	U$;~j\<=# $#s   %C
Cinputsc                 `    [        U5       H  u  p4SU 3nU R                  " U4SU0UD6  M!     g )N
batch_req_r3   )r  r*  )rR   r,  r   r  r_  r  s         r/   add_requests&ContinuousBatchingManager.add_requests  s7    %f-LA!!%FYD6DVD .r.   c                    U R                   c   U R                  R                  5       (       a  g U R                  R                  SUS9n[        R                  SUR                   35        U$ ! [        R                   a     gf = f)zRetrieve one result from the output queue.

Args:
    timeout: Maximum time to wait for a result

Returns:
    Optional[Dict]: The result data or None if timeout
NTr(  zRetrieved result for request )	r  r  r  r   r   r)  r3   r  r  )rR   r  results      r/   
get_result$ContinuousBatchingManager.get_result  s}     ""*t/@/@/F/F/H/H	&&**w*GFLL89J9J8KLMM{{ 		s   =A- -BBc              #   n  #    U R                   b  U R                   R                  5       (       d  U R                  R                  5       (       df  U R	                  SS9nUb  Uv   U R                   b!  U R                   R                  5       (       a  MD  U R                  R                  5       (       d  Me  gg7f)z.Iterate over results as they become available.Ng?r#  )r  r  r  r  r3  )rR   r2  s     r/   __iter__"ContinuousBatchingManager.__iter__  s      ##/D4K4K4T4T4V4V^b^o^o^u^u^w^w__S_1F!	 ##/D4K4K4T4T4V4V^b^o^o^u^u^w^ws   BB5B53B5c                    [         R                  R                  U R                  R                  S9nUR                  [         R                  R                  5       5        [         R                  R                  U5         U R                  U5        S S S 5        [         R                  R                  5       R                  U5        [         R                  R                  5       U l
        [         R                  R                  U R                  US9   U R                  U5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nrj  )stream)r   r<  Streamr  rm   wait_streamcurrent_streamr9  _generation_step	CUDAGraphgraph)rR   r  r9  s      r/   warmup ContinuousBatchingManager.warmup  s    ""$***;*;"<5::4467ZZv&!!/2 ' 	

!!#//7ZZ))+
ZZdjj8!!/2 98 '& 98s   >D+D<+
D9<
E
r  c                 @   UR                  5       n[        R                  " 5          U R                  U5      nU R                  (       a  UR
                  R                  U5        U R                  X#5      nU R                  X5        SSS5        g! , (       d  f       g= f)z6Perform a single generation step. This is cuda graphedN)	r  r   r  _model_forwardr   output_probscopy__process_logit_sample)rR   r  
batch_datalogitsprobss        r/   r=  *ContinuousBatchingManager._generation_step  sn     %557
]]_((4F'',,226:''
;ELL0 __s   A B
Bmodel_forwardr   c                 :    U R                   " S0 UD6R                  $ )Nr    )r  rI  )rR   rH  s     r/   rC  (ContinuousBatchingManager._model_forward  s    zz'J'...r.   logit_processingc                     [        U R                  S5      (       a"  U R                  R                  US   US   5        U R                  US   U5      $ )Nset_continuous_batching_contextrf  rb  r_  )r   r  rQ  )rR   rH  rI  s      r/   rF  (ContinuousBatchingManager._process_logit  sW     4'')JKK  @@+,j9O.P ##J{$;VDDr.   samplingc                    U R                   (       aF  [        R                  R                  USS9n[        R
                  " US   SS9R                  S5      nO[        R                  " USS9nUR                  R                  U5        g )NrM   )dimr   rt   )num_samples)
r  nn
functionalsoftmaxr   multinomialsqueezeargmaxr  rE  )rR   r  rJ  next_tokenss       r/   rG  !ContinuousBatchingManager._sample  sg    >>MM))%R)8E++E!H!DLLQOK,,u"5K""((5r.   c                 &   Sn [        U R                  R                  U R                  U R                  R                  U R                  R
                  [        U R                  R                  5      [        U R                  SS5      S9nSn[        U R                  S5      (       aL  [        R                  U R                  R                  5      nUc  [        R                  SU S35        [         nO[         n[#        UU R                  R                  U R                  U R                  U R$                  U R&                  U R                  R                  U R                  R
                  U" X R(                  5      U R*                  U R(                  5      nXl        SnU R&                  R/                  5       (       a  UR1                  5       (       aR  U R3                  X5        U(       a  S	nU R&                  R/                  5       (       d  M;  UR1                  5       (       a  MR  [        R;                  S5        g! [4         a2  n[        R7                  S
U 3SS9  U R9                  XQ5         SnANMSnAff = f! [        R;                  S5        f = f)z6Main processing loop running in the background thread.N_tp_size   )ro   rq   r  zScheduler 'z ' not found. Defaulting to FIFO.TFzError in generation loop: r  zGeneration loop finished.)rh   r  rk   rl   rm   rn   rV   r  r  r   r   SCHEDULER_MAPPINGr   r  r   r   r   r  r  r  r  r  r  r  r   _inner_generation_loopr  r9   _handle_critical_errorr   )rR   r  paged_attention_cacher  is_firstr  s         r/   r  .ContinuousBatchingManager._run_generation_loop  s    ,	5$7

!!&&

!!

   !1!1!7!78

J:%! It--{;;-11$2H2H2R2RS	$NN[;[#\] -I *	6%

!!&&  !!

!!

  /1E1EF$$O $3 H--//O4X4X4Z4Z++OF$H --//O4X4X4Z4Z KK34	  	<LL5aS9DLI'';;	< KK34s0   HH: H: :
I6(I1,I9 1I66I9 9Jgeneration_looprf  c           
      H   [         R                  R                  5       (       a  [         R                  R                  5         UR	                  5         [        5       u  p4pV[        R                  SU SU SU SU 35        [         R                  R                  5       (       a_  U R                  (       aN  U(       a  U R                  U5        OF[        U S5      (       a   U R                  5         O#U R                  U5        OU R                  U5        [         R                  R                  5       (       a  [         R                  R                  5         UR!                  5         g ! [         a2  n[        R                  SU 3SS9  UR                  U5         S nAg S nAff = f)	Nz[Memory] Device: z	, Total: z, Reserved: z, Allocated: r?  zModel forward pass failed: Tr  )r   r<  r?  synchronizer  rJ  r   r   r  r@  r   _graph_replayr  r9   r  r=  r  )rR   r  rf  rm   rU  rV  r   r  s           r/   rc  0ContinuousBatchingManager._inner_generation_loop  s7   ::""$$JJ""$**,-B-D*x'xy|H:Ubclbmno::""$$)<)<O,w''&&( %%o6!!/2::""$$JJ""$$$& ! LL#>qc!BTLR#66q9s   E% %
F!/(FF!graph_replayc                 8    U R                   R                  5         g r   )r?  replayrQ   s    r/   rk  'ContinuousBatchingManager._graph_replay.  s    

r.   c                     U R                   R                  5           U R                  R                  5       nUb  UR	                  X5        M0  ! [
        R                   a     Of = fUb  UR                  U5        gg)z:Handle critical errors that terminate the generation loop.N)r  r  r  r  r  r  r  r  )rR   r9   r  req_datas       r/   rd  0ContinuousBatchingManager._handle_critical_error2  s|     		++668".#99%J  {{ 		 &--e4 's   1A A$#A$c                     U R                   (       d  [        S5      eU R                  b&  U R                  R                  R	                  U5        gg)zSEvict a request from the cache. It is assumed that the request is already finished.z0Manual eviction is not enabled for this manager.N)r  RuntimeErrorr  r  r   r   s     r/   evict_request_from_cache2ContinuousBatchingManager.evict_request_from_cacheE  sD     ##QRR+  **99*E ,r.   )r  r  r	  r  r  r  r  rl   r?  r  r   r  r  r  r  r  r  r  r  )Fr   T)FNr   )NNr   )#r!   r"   r#   r$   r%   r   rf   r   r   r  r  r
   rA   r   r  r?   r@   r=   r*  r/  r1   r3  r6  r@  r  r=  rC  rF  rG  r  rc  rk  rd  rv  r-   r    r.   r/   r  r    s    !&"D ,"D 	"D "DH 	< 	<Z$ % $/HUO / gkc08V^_bVc	BE4S	? E(3C*D $ 
3 
3 10H 1 1 o&/ '/ ()E *E j!6'? 6 "6/5b '('6N 'Z^ ' )'0 n% & 5XF^=_ 5 5$ F3 F Fr.   r  c                       \ rS rSrSr    SS\\   S\S\S\S\	4
S	 jjr
\\R                  " 5         SS
\\\      S\\   S\S\\\      4S jj5       5       rSrg)ContinuousMixiniN  z?Mixin class for models to add continuous batching capabilities.Nrl   r  r  r  rO   c                 (   [        U S5      (       a"  [        U S5      (       a  [        U S5      (       d  [        S5      eUb  UOU R                  nUc  [        S5      eUR                  c  [
        R                  S5        SUl        [        U UUUUS9$ )	aV  Initialize a manager for continuous batching inference.

Args:
    generation_config: Custom generation configuration
    max_queue_size: Maximum size of the input request queue
    streaming: Whether to stream tokens as they are generated

Returns:
    `ContinuousBatchingManager`: The manager instance to add requests and retrieve results.
rk   rm   rn   z;Model must have 'config', 'device', and 'dtype' attributes.z8A GenerationConfig must be provided or set in the model.zE`eos_token_id` not set in GenerationConfig. Setting to -1 (disabled).rM   )r  rl   r  r  r  )r   AttributeErrorrl   r   rN   r   r   r  )rR   rl   r  r  r  
gen_configs         r/   init_continuous_batching(ContinuousMixin.init_continuous_batchingQ  s    " tX&&gdH.E.EWUY[bMcMc !^__*;*G&TMcMc
WXX""*NNbc&(J# )(+)
 	
r.   r,  progress_barc                    U(       d  / $ U R                  US9nUR                  5         0 n[        U5      n SSKJn  U" [
        /5         [        UU(       + SU S3SS9 n	UR                  " U40 UD6  Sn
X:  a  UR                  SS	9nU(       a  UR                  nUR                  [        R                  :X  a  XU'   U
S-  n
U	R                  S5        [
        R                  UR                  R                   R#                  UR$                  5      5        O+UR'                  5       (       d  [
        R)                  S
5        OX:  a  M  SSS5        SSS5        UR-                  SSS9  U$ ! , (       d  f       N(= f! , (       d  f       N1= f! [*         a!  n[
        R)                  SU 3SS9   SnANZSnAff = f! UR-                  SSS9  f = f)a  Generate sequences for a batch of prompts using continuous batching.

Args:
    inputs: List of input token sequences (prompts)
    generation_config: Optional generation configuration
    **kwargs: Additional generation parameters

Returns:
    `list[list[int]]`: A list containing the generated sequences (including prompt tokens
                        if not handled otherwise) for each input prompt, in the same order.
                        Returns an empty list `[]` for requests that failed.
)rl   r   )logging_redirect_tqdmzSolving z	 requestsrequest)rU  disabledescunitrt   r#  z*Generation thread terminated unexpectedly.NzError during batch generation: Tr  g      @r(  )r}  r  rV   tqdm.contrib.loggingr  r   r   r/  r3  r3   r:   r   r+   r   r)  r  r  decoder7   r  r9   r  r   )rR   r,  rl   r  r   managerresultsro   r  pbarfinished_countr2  r  r  s                 r/   generate_batchContinuousMixin.generate_batchv  s   * I //BS/T6{	2B&x0&!--#L>;"	
 ((:6:%&N(7!(!3!3A!3!>!%+%6%6F%}}0F0FF28 .! 3 $A"LL)@)@)J)J)Q)QRXRiRi)jk#*#5#5#7#7 &-Y Z % )7 14 LLtSL15  10.  	OLL:1#>LN	O LLtSL1sf   F 
E= C(E,
E=F ,
E:	6E==
FF 
F< F 
F9F4/F< 4F99F< <Gr    )NFr   F)NT)r!   r"   r#   r$   r%   r
   r   rf   r@   r  r}  r   r   inference_moder?   r  r-   r    r.   r/   ry  ry  N  s    I 9= %#
#$45#
 #
 	#

 #
 
##
J 
 9=!	9T#Y9 $$459 	9 
d3i9  9r.   ry  )7r  r  rB   abcr   r   collectionsr   dataclassesr   r   enumr   	functoolsr	   typingr
   r   r   torch.nnrW  tokenizers.decodersr   r   configuration_utilsr   generation.configuration_utilsr   tokenization_utils_fastr   utils.loggingr   utils.metricsr   r   r   r   	getLoggerr!   r   r1   rD   rh   r   r   r,  rJ  r   r   r^  r  r  rb  r  ry  r    r.   r/   <module>r     s       #  (   "   ,  2 = = # R R	D 	 
		8	$ ; ; ;* G
 G
 G
T mZ mZ mZ`$ $N s5I s5 s5l s5I s5 s5lC0 4 
--5 5>      < H6 H6 H6X
 *  sF sF sFl	c cr.   