
    Ph	6                         d dl Z d dlmZmZ d dlmZ ddlmZmZ ddl	m
Z
 ddlmZmZ  G d	 d
e      Z e        G d de             Z e        G d de             ZeedZy)    N)ABCabstractmethod)deque   )attach_tracertraced   )PagedAttentionCache)RequestStateRequestStatusc                   :   e Zd ZdZddedefdZedefd       Z	e
dedee   fd	       Zedefd
       Zeddedefd       Zededee   fd       Zedefd       Zed        Zededefd       Zedededefd       Z ed      dededee   fd       Zy)	ScheduleraA  
    Abstract base class for scheduling requests in the continuous batch processor. Schedulers manage the lifecycle of
    requests from when they are added to the waiting queue to when they are scheduled for processing. Different
    schedulers implement different strategies for prioritizing and batching requests.
    cacheretain_cache_on_finishc                     i | _         i | _        t               | _        || _        || _        t        j                         | _        t               | _
        y N)active_requestswaiting_requestsr   waiting_requests_orderr   r   	threadingLock_cancellation_lockset_requests_to_cancel)selfr   r   s      o/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/generation/continuous_batching/scheduler.py__init__zScheduler.__init__   sD    8:9;27'#
&<#"+.."2-0U     statec                    | j                   r|j                  | j                  v rn| j                  j                  |j                        }|j                  t        |j                        d |_        |j                  |_        |j                  |_        || j                  |j                  <   | j                  j                  |j                         y)z#Adds a request to the waiting list.N)r   
request_idr   pop
prompt_idslenfull_prompt_idsallocated_blocksposition_offsetr   r   append)r   r   	old_states      r   add_waiting_requestzScheduler.add_waiting_request(   s     &&5+;+;t?S?S+S,,001A1ABI$//I4M4M0N0PQE%.%?%?E"$-$=$=E!27e../##**5+;+;<r   token_budgetreturnc                      y)a@  Schedules requests for the next batch based on available token budget. This method selects which requests
        should be processed in the current batch, considering the token budget and the scheduler's prioritization rules.
        The token_budget is the maximum number of tokens that can be processed in this batch.N )r   r+   s     r   schedule_batchzScheduler.schedule_batch3   s    
 	r   c                 Z    t        | j                        xs t        | j                        S )z3Checks if there are requests ready to be processed.)r$   r   r   )r   s    r   has_pending_requestszScheduler.has_pending_requests:   s%     4''(FC0E0E,FFr   r!   evict_from_cachec                 x    |r8| j                   j                  |       || j                  v r| j                  |= yyy)zCompletes processing of a request and optionally frees its allocated cache blocks. This method is called
        when a request has finished generation or encountered an error.
        N)r   free_blocksr   )r   r!   r2   s      r   finish_requestzScheduler.finish_request?   s?    
 JJ"":.T111((4 2 r   c                 T    || j                   v r| j                   |   j                  S g S )z,Gets generated tokens for an active request.)r   static_outputsr   r!   s     r   !get_active_request_static_outputsz+Scheduler.get_active_request_static_outputsI   s.     ---''
3BBB	r   c                 |    | j                   5  | j                  j                  |       ddd       y# 1 sw Y   yxY w)z!Marks a request for cancellation.N)r   r   addr8   s     r   set_request_cancellationz"Scheduler.set_request_cancellationP   s-     $$$$((4 %$$s   2;c                 z   | j                   5  | j                  D ]|  }|| j                  v r| j                  |= || j                  v r| j                  |= || j                  v r| j                  j                  |       | j                  j                  |       ~ t               | _        ddd       y# 1 sw Y   yxY w)z=Remove all cancelled requests from active and waiting queues.N)	r   r   r   r   r   remover   r4   r   r8   s     r   clear_cancelled_requestsz"Scheduler.clear_cancelled_requestsV   s     $$"66
!5!55,,Z8!6!66--j9!<!<<//66zB

&&z2 7 (+uD$ %$$s   BB11B:c                 ^    || j                   v xs || j                  vxr || j                  vS )z2Checks if a request has been cancelled or removed.)r   r   r   r8   s     r   request_is_cancelledzScheduler.request_is_cancelledd   s:     T555 
d222^zI^I^7^	
r   len_next_tokensc                 X   |j                         }|j                  | j                  j                  z  |z
  }||k  s|j                  dk(  r`||z
  dz   | j                  j                  z  dz   }| j                  j	                  ||j
                        }|y|xj                  |z  c_        y)a  Allocate additional cache blocks for a request if the currently allocated blocks are insufficient to
        accommodate the next tokens. It calculates how many blocks are needed based on the request's current
        cache occupancy and the number of tokens to be processed. The allocation itself is done by the CacheAllocator
        objects. Returns a boolean indicating if the allocation was successful or not.
        r   r	   FT)current_lenr&   r   
block_sizeallocate_blocksr!   )r   r   rB   rD   	occupancyblocks_needed	allocateds          r   _allocate_blocks_if_neededz$Scheduler._allocate_blocks_if_neededk   s     '')**TZZ-B-BB[P	&%*@*@A*E-	9A=$**BWBWW[\\M

22=%BRBRSI ""i/"r   prepare_request)	span_name"request_ids_to_remove_from_waitingc                 J   |j                   t        j                  k(  r|j                  n|j                  }t        |      |k  r|j                   t        j                  k(  rJ|| j                  |j                  <   t        j                  |_         |j                  |j                         y|j                   t        j                  k(  r.t        j                  |_         |j                  |_        g |_        yy|j                   t        j                  k(  rJ|| j                  |j                  <   t        j                  |_         |j                  |j                         n2|j                   t        j                  k(  rt        j                  |_         ||d |_        |d| |_        y)z7Prepares a request for processing in the current batch.N)statusr   SPLIT_PENDING_REMAINDERremaining_prompt_idsr#   r$   PENDINGr   r!   
PREFILLINGr;   PREFILLING_SPLIT)r   r   r+   rM   request_tokenss        r   _prepare_request_for_processingz)Scheduler._prepare_request_for_processing~   sG    +0,,-:_:_*_E&&ejeueu 	 ~-||}4449>$$U%5%56,77266u7G7GH!F!FF,77#(#=#= -/* G ||}4449>$$U%5%56,==266u7G7GH!F!FF,==)7)FE&-m|<Er   N)F)T)__name__
__module____qualname____doc__r
   boolr   r   r   r*   r   intlistr/   r1   strr5   r9   r<   r?   rA   rJ   r   rV   r.   r   r   r   r      sv   31 34 3 = = = 3 43E   Gd G G 5 5 5 5 C DI   53 5 5
 - - 
s 
t 
 
  s W[  $ '(=!=14=Z]^aZb= )=r   r   c                   R     e Zd ZdZd	dededef fdZede	de
e   fd       Z xZS )
FIFOScheduleraK  This scheduler processes requests in the order they arrive, meaning decoding requests has priority over
    prefilling requests. Additionally, it includes a safety margin mechanism to prevent cache exhaustion. By default,
    when 80% of the cache is full, new requests will not be scheduled to prioritize decoding active requests.r   r   safety_marginc                 4    t         |   ||       || _        y)a[  Initializes the FIFO scheduler. The safety margin is the percentage of free blocks under which we stop
        scheduling new prefill requests, so safety_margin = 0.1 means that when there is less than 10% of free blocks,
        or equivalently when more than 90% of blocks are already allocated, we stop scheduling new prefill requests.
        N)superr   ra   )r   r   r   ra   	__class__s       r   r   zFIFOScheduler.__init__   s    
 	 67*r   r+   r,   c                 X    g }g }g  j                   j                         D ]n  }|j                  t        j                  k(  r|j                  |       |j                  t        j                  t        j                  fv s^|j                  |       p  j                  D ]   }|j                   j                  |          " ||z   }t                j                   j                  j                  z  }|D ]  } j                  j                         }||k  }	|	r!r|j                  t        j                  k7  r n j                  ||       t!        |j"                        }
 j%                  |t!        |j"                              s%t!         j                  j&                        dk(  r nJt(        dt*        ffd       } ||       ||
z  }t(        dt*        f fd       } ||       |dk(  s n t-         j                  D cg c]	  }|vs| c}       _        S c c}w )Nr   r   c                 (    j                  |        y r   r(   r   scheduled_requestss    r   _add_to_scheduled_requestsz@FIFOScheduler.schedule_batch.<locals>._add_to_scheduled_requests       "))%0r   c                 x    | j                   }|j                  v rj                  |= j                  |       y y r   r!   r   r;   r   req_idrM   r   s     r   _remove_from_waiting_requestszCFIFOScheduler.schedule_batch.<locals>._remove_from_waiting_requests   =    ))T222--f56::6B 3r   )r   valuesrO   r   DECODINGr(   rP   rT   r   r   r   ra   r   
num_blocksget_num_free_blocksrV   r$   r#   rJ   _free_blocksr   r   r   )r   r+   priority_statessecond_priority_statesr   ro   
candidatessafety_marginsnum_free_blocksoutside_safety_marginrequest_lenrj   rp   rM   ri   s   `            @@r   r/   zFIFOScheduler.schedule_batch   s   .057))002E||}555&&u-|| E E}GeGeff&--e4	 3 11F"))$*?*?*GH 2 %'==
-0U*++djj.C.CCE"jj<<>O$3n$D!$);P]PfPf@f00Fhie../K22s5++, tzz../141, 1 1 'u-K'LC\ C C *%0q E  H ',"&"="=r"=OqAqV"=r'
# "! ss   	H'H')Fg?)rW   rX   rY   rZ   r
   r[   floatr   r   r\   r]   r   r/   __classcell__)rd   s   @r   r`   r`      sL    q+1 +4 +hm + ;"3 ;"43E ;" ;"r   r`   c                   0    e Zd ZdZededee   fd       Zy)PrefillFirstSchedulerzScheduler that prioritizes split prefill requests over decoding requests. This scheduler ensures that split
    prefill requests (which are continuations of partially processed prompts) are completed before processing new
    decoding requests.r+   r,   c                    
 g }g }g  j                   j                         D ]o  }|j                  t        j                  t        j
                  fv r|j                  |       A|j                  t        j                  k(  s_|j                  |       q  j                  D ]   }|j                   j                  |          " ||z   }t               
|D ]  } j                  ||
       t        |j                        } j                  |t        |j                              s%t         j                  j                         dk(  r nJut"        dt$        ffd       } ||       ||z  }t"        dt$        f
 fd       }	 |	|       |dk(  s n t'         j                  D cg c]	  }|
vs| c}       _        S c c}w )Nr   r   c                 (    j                  |        y r   rg   rh   s    r   rj   zHPrefillFirstScheduler.schedule_batch.<locals>._add_to_scheduled_requests  rk   r   c                 x    | j                   }|j                  v rj                  |= j                  |       y y r   rm   rn   s     r   rp   zKPrefillFirstScheduler.schedule_batch.<locals>._remove_from_waiting_requests  rq   r   )r   rr   rO   r   rT   rP   r(   rs   r   r   r   rV   r$   r#   rJ   r   rv   r   r   r   )r   r+   rw   rx   r   ro   ry   r}   rj   rp   rM   ri   s   `         @@r   r/   z$PrefillFirstScheduler.schedule_batch   s   .057))002E|| > >@e@eff&&u-!7!77&--e4 3 11F"))$*?*?*GH 2 %'==
-0U*E00Fhie../K22s5++, tzz../141, 1 1 'u-K'LC\ C C *%0q 9  < ',"&"="=r"=OqAqV"=r'
# "! ss   '	G1GN)	rW   rX   rY   rZ   r   r\   r]   r   r/   r.   r   r   r   r      s.     5"3 5"43E 5" 5"r   r   )fifoprefill_first)r   abcr   r   collectionsr   utils.metricsr   r   r   r
   requestsr   r   r   r`   r   SCHEDULER_MAPPINGr.   r   r   <module>r      sw     #  2 & 1A= A=H I"I I" I"Z ;"I ;" ;"~ * r   