
    h                       d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&Z&d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9 ddl1m:Z:  e8       rd dl;Z;d dl&m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZB  e,       rd dlCZC e0       rd dlDmEZE  e.       xr  e+       xr  e/       xr  e-       ZFeFrd dlGZGd dlHmIZImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[ d dl\m]Z] d dl^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZm d dlnmoZo d d lpmqZqmrZrmsZs  G d! d"eod#$      Zt G d% d&e]d#$      Zu G d' d(eSd#$      Zv eret      Zw ereu      Zx erev      Zyh d)Zzh d*Z{h d+Z| e9j                  e~      Zd,d-d.d/iZ eej                               Zd0Z G d1 d2ej
                        Zd3efd4Zd5ed6d7d8d7fd9Z G d: d;      Z G d< d=      Ze G d> d?             Z G d@ dAe:      Ze~dBk(  r e       Zej                          yy)C    N)ArgumentParser	Namespace)AsyncGenerator	GeneratorIterable)asynccontextmanager)	dataclassfield)BytesIO)Thread)Optional	TypedDictUnion)
model_info)HF_HUB_OFFLINE)DecodeStream)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                       e Zd ZU dZeed<   y))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__     \/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/commands/serving.pyrN   rN   {       	 rX   rN   F)totalc                       e Zd ZU dZeed<   y)+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rO   NrP   rW   rX   rY   r]   r]      rZ   rX   r]   c                   4    e Zd ZU dZeed<   eed<   dZeed<   y)%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerO   FstreamN)	rQ   rR   rS   rT   bytesrV   rU   ra   boolrW   rX   rY   r_   r_      s    	 rX   r_   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopre   audiorf   logprobsmetadata	functions
modalities
predictionrl   rm   rn   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   rg   rh   languager{   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                       e Zd ZdZdZdZdZy)ModalityLLMVLMSTTTTSN)rQ   rR   rS   r   r   r   r   rW   rX   rY   r   r      s    
C
C
C
CrX   r   argsc                     t        |       S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommand)r   s    rY   serve_command_factoryr      s     rX   reqmodel_generation_configr'   returnc                 `   | j                  d      "t        di t        j                  | d         }nt	        j
                  |      } |j                  di |}|j                         D ]  \  }}|	t        |||        | j                  d      t        | d         |_
        | j                  d      t        | d         |_
        | j                  d      t        | d         |_        | j                  d      
| d   |_        | j                  d      
| d   |_        | j                  d      +t        | d         |_        t        | d         dk(  rd	|_        | j                  d
      t        | d
         |_        | j                  d      t%        j&                  | d          |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rO   max_output_tokens
max_tokensfrequency_penalty
logit_biasrr   temperatureg        Ftop_pseedrW   )getr'   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   torchmanual_seed)r   r   kwargsrO   non_standard_kwargskvs          rY   !create_generation_config_from_reqr      s   . ww"#/,Ttzz#>Q:R/ST MM*AB2+22<V<#))+1=%q!, ,
 ww"#/+.s3F/G+H( ww|(+.s</@+A(
ww"#//4S9L5M/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A%]#$+*/'
www#"'G"5
wwv"#f+&rX   c                       e Zd ZdZd Zd Zy)	ToolStatez7Lightweight class to keep track of the tool call state.c                 $    | j                          y N)resetselfs    rY   __init__zToolState.__init__'  s    

rX   c                 <    d| _         d| _        d| _        d| _        y)z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    rY   r   zToolState.reset*  s!     %%*"!"rX   N)rQ   rR   rS   rT   r   r   rW   rX   rY   r   r   $  s    ArX   r   c            	       L    e Zd ZdZ	 ddddedeed      fdZd	 Zd
 Z	d Z
d Zy)
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr(   timeout_seconds	processor)r   r   c                     || _         t        |j                        | _        || _        || _        t        j                  | j
                  | j                        | _	        | j                  j                          y r   )r   rU   name_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerr   )r   r   r   r   s       rY   r   zTimedModel.__init__8  s[     
 !3!34".ood&:&:D<P<PQrX   c                     | j                   j                          t        j                  | j                  | j
                        | _         | j                   j                          y)z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   s    rY   reset_timerzTimedModel.reset_timerE  s@    ood&:&:D<P<PQrX   c                 0   t        | d      r| j                  }| `| `d| _        d| _        t        j                          t
        j                  j                         rt
        j                  j                          | j                  j                          yyy)z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   cudais_availableempty_cacher   r   r   s    rY   delete_modelzTimedModel.delete_modelK  sr    4!djj&<
DJ!DNJJL zz&&(

&&( KK  '=!rX   c                     | j                          t        j                  | j                   d| j                   d       y )Nz was removed from memory after z seconds of inactivity)r   loggerinfor   r   r   s    rY   r   zTimedModel.timeout_reached[  s7    t))**I$J^J^I__uvwrX   c                 <    t        | d       xs | j                  du S )z)Check if the instances have been deleted.r   N)r   r   r   s    rY   
is_deletedzTimedModel.is_deleted_  s     4))?TZZ4-??rX   r   )rQ   rR   rS   rT   r   r   r   r   r   r   r   r   rW   rX   rY   r   r   2  sP     SW	   E"MNO	! x@rX   r   c                      e Zd ZU dZ edddi      Zeed<    edddi      Ze	ed	<    ed
dg dd      Z
ee	   ed<    eddg dd      Zee	   ed<    edddi      Zeed<    ed
ddi      Zee	   ed<    edddi      Zeed<    edddi      Zeed<    eddddgd      Ze	ed<    edddi      Zeed<    eddd i      Ze	ed!<    ed"dd#i      Zeed$<    ed%dd&i      Zeed'<    ed(dd)i      Ze	ed*<    ed
dd+i      Zee   ed,<    eddd-i      Zeed.<    eddd/i      Zeed0<    ed
dd1i      Zee	   ed2<   d3 Zy
)4ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    Fhelpz8Whether to use continuous batching for chat completions.)defaultru   continuous_batchingautozfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   bfloat16float16float32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypez2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                     | j                   ^| j                  | j                   | _        y| j                   | j                  k7  r&t        d| j                    d| j                   d      yy)z(Only used for BC `torch_dtype` argument.Nz`torch_dtype` z and `dtype` zn have different values. `torch_dtype` is deprecated and will be removed in 4.59.0, please set `dtype` instead.)r   r   
ValueErrorr   s    rY   __post_init__zServeArguments.__post_init__  su     'zz!!--
!!TZZ/ $T%5%5$6mDJJ< PM M  0 (rX   )rQ   rR   rS   rT   r
   r   rc   rV   r   rU   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rW   rX   rY   r   r   d  sT    !&TU!   >
FC  "'WA
"K#  !PA
E8C=  $)] ^t  */ r
*#  efL$  efL$   %UFZhmotgu=vww!&uHm?n!o$o kV=c4deD#edf6W-XYD#Y\]M3  &*d!eIs  #(([\#L(3-  &
K  #B
d  "'2
"K# 
rX   r   c                   J   e Zd Zedefd       ZdefdZdede	ddd	e
fd
ZdefdZdefdZdefdZ	 	 	 	 	 	 	 	 d1dedee   dee   dee   dee   deed      dee   dee   defdZdddefdZd Zej2                  deeeef      fd       Zdededeedf   fd Zedd!defd"       Zed#efd$       Z dede!eddf   fd%Z"dede!eddf   fd&Z#dede!eddf   fd'Z$dede%fd(Z&ededed)   fd*       Z'd+edefd,Z(d-efd.Z)d-ede*d!ef   fd/Z+d-ede*d!e,f   fd0Z-y)2r   parserc                 d    t         f}| j                  d|      }|j                  t               y)z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsr   )r   r  serve_parsers      rY   register_subcommandz ServeCommand.register_subcommand  s3     *+((/(R!!'<!=rX   r   c           	      0   t         st        d      || _        | j                  j                  | _        | j                  rt        j                         }| j                  j                  )|| j                  _        t        j                  d|        t        j                         }| j                  j                  |vr)t        d| d| j                  j                   d| d      | j                  j                  | _        | j                  j                  )t        j                  | j                  j                         t!        j"                  d      }|j%                  t         j&                  | j                  j(                  j+                                   t!        j"                  d      }|j%                  t         j&                  | j                  j(                  j+                                   i | _        d | _        d | _        d | _        d | _        y )	NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`z-No attn_implementation passed, defaulting to z"Continuous batching only supports z as attn_implementation, got z#Try setting `--attn_implementation=`transformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorr   r   use_continuous_batchingr)    default_attention_implementationr   r   r   #supported_attention_implementationsr   r   r   r   r   r"   
get_loggersetLevel
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   r   default_attn_implsupported_attn_impltransformers_logger	cb_loggers         rY   r   zServeCommand.__init__  s   +s 
 	'+yy'D'D$'' 9 Z Z \yy,,40A		-KL]K^_`";"_"_"ayy,,4GG 89L8MMjyy4459:K9LAO 
  990099!!-dii445 &00@$$W%7%7		8K8K8Q8Q8S%TU&&'TU	7--dii.A.A.G.G.IJK 57X\0 "!rX   requestschema	validatorrK   unused_fieldsc                 0   t         j                  d|        t        |j                               }|j                  }||z
  }|r(t         j                  d|        t        dd|       | j                  j                  rB	 |j                  |       ||z  }	|	r(t         j                  d|	        t        dd|	       yy# t        $ rF}t         j                  d|j                                 t        d|j                               d}~ww xY w)a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeys__mutable_keys__errorr-   r   r   validate_pythonrL   errors)
r   r  r  r  r  
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             rY   _validate_requestzServeCommand._validate_request  s   . 	+G956 (
//$}4LL;O;LMNC:Z[jZk8lmm99%%H))'2 (2M'A$'=>V=WXY# #.LMeLf,g  ( & # H1!((*>?#AHHJGGHs   C 	DADDc                 F    | j                  |t        t        t               y N)r  r  r  r  )r/  rN   response_validatorUNUSED_RESPONSE_FIELDSr   r  s     rY   validate_response_requestz&ServeCommand.validate_response_requestA  s!    <(0	 	 	
rX   c                 F    | j                  |t        t        t               y r1  )r/  r]   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr4  s     rY    validate_chat_completion_requestz-ServeCommand.validate_chat_completion_requestI  s!    >*7	 	 	
rX   c                 F    | j                  |t        t        t               y r1  )r/  r_   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr4  s     rY   validate_transcription_requestz+ServeCommand.validate_transcription_requestQ  s!    8-5	 	 	
rX   N
request_idcontentr   rolefinish_reason
tool_callsr7   decode_stream	tokenizerr   c	                     | |||j                  |j                  |      }t        |t        t	        j                               |t        t        |||      d|      gdd      }	d|	j                  d	       d
S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        )r?  r@  rB  r   )deltaindexrA  r   zchat.completion.chunk)idcreatedr   r   system_fingerprintobjectdata: Texclude_none

)step
_tokenizerr4   r   timer5   r6   model_dump_json)
r   r>  r?  r   r@  rA  rB  rC  rD  chunks
             rY   build_chat_completion_chunkz(ServeCommand.build_chat_completion_chunkY  s    D $)<AV#(()=)=wGG#		$% '!#-
 "/
  "*!
$ --4-@AFFrX   responserJ   c                 .    d|j                  d       dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        rL  TrM  rO  )rS  )r   rV  s     rY   build_response_eventz!ServeCommand.build_response_event  s"     00d0CDDIIrX   c                     t         dt        f fd       }t        |      } j                  r2|j                  t        dgddgdg       t
        j                  d       dd	lm} |j                  d
      d|dt        f fd       }|j                  d      dt        f fd       }|j                  d      d|f fd       }|j                  d      |j                  d       fd              }|j                  d      d        }|j                  d      d|fd       }	t        j                  | j                   j"                   j                   j$                   j                   j&                         y)a  
        Setup and run the FastAPI server for transformers serve.

        Models will be loaded and unloaded automatically based on usage and a timeout.

        The server will expose the following endpoints:
        - POST /v1/chat/completions: Generates chat completions.
        - POST /v1/responses: Generates responses.
        - POST /v1/audio/transcriptions: Generates transcriptions from audio.
        - GET /v1/models: Lists available models for 3rd party tools.

        Requires FastAPI and Uvicorn to be installed.
        appc                   K   d  j                   j                         D ]  }|j                           j                  j                  j	                  dd       y y w)NT   blocktimeout)r  valuesr   r  rr   )rZ  r   r   s     rY   lifespanz"ServeCommand.run.<locals>.lifespan  s[     ++224""$ 577C88==DRS=T Ds   A A#)ra  *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsr  bodyc                     j                  |       j                  r'j                  || j                  j                        }nj                  |      }t        |d      S Nr  text/event-stream
media_type)r9  r  #continuous_batching_chat_completionstater>  generate_chat_completionr0   )r  rh  outputr   s      rY   chat_completionz)ServeCommand.run.<locals>.chat_completion  sW    11$1?++AA$H`H`a66t<$V8KLLrX   z/v1/responsesc                 d    j                  |        j                  |       }t        |d      S rj  )r5  generate_responser0   )r  rr  r   s     rY   	responsesz#ServeCommand.run.<locals>.responses  s2    **7*;++G4F$V8KLLrX   z/v1/audio/transcriptionsc           
        K   | j                         4 d {   }t        |d   j                          d {   |d         }t        j	                  d|d   j
                   d|d   j                   d|d   j                  dz  dd	       d d d       d {    j                  
       j                  |      }t        |d      S 7 7 7 8# 1 d {  7  sw Y   HxY ww)Nr`   r   )r`   r   zReceived file: z; MIME type: z; size:    z.2fz KiBrk  rl  rm  )formr_   readr   r#  filenamecontent_typesizer=  generate_transcriptionr0   )r  ry  parsed_requestrr  r   s       rY   audio_transcriptionsz.ServeCommand.run.<locals>.audio_transcriptions  s      ||~~!F#F|0022w-"
 %d6l&;&;%<M$v,JcJcId e!&\..5c:$@ &~ ///G00@F$V8KLL &2 &~~~sU   C+CC+CC
ACC+C5C+CC+C(CC($C+z
/v1/modelsc                  <    t        d j                         d      S )Nlist)rK  data)r/   get_gen_modelsr   s   rY   get_all_modelsz(ServeCommand.run.<locals>.get_all_models  s      64;N;N;P QRRrX   z/healthc                      t        ddi      S )Nstatusok)r/   rW   rX   rY   healthcheckz%ServeCommand.run.<locals>.healthcheck  s    4 011rX   httpc                    K   | j                   j                  t              xs t        t	        j
                               }|| j                  _         ||        d {   }||j                   t        <   |S 7 wr   )headersr   X_REQUEST_IDrU   uuiduuid4rp  r>  )r  	call_nextr>  rV  s       rY   get_or_set_request_idz/ServeCommand.run.<locals>.get_or_set_request_id  s]      ,,\:Oc$**,>OJ'1GMM$&w//H-7H\*O 0s   AA9A7A9)r   r   r   N)r   r,   r   add_middlewarer.   r   warning_oncefastapirg  postdictoptionsr   
middlewareuvicornrunr   r   r   r   )
r   ra  rZ  rg  rs  rv  r  r  r  r  s
   `         rY   r  zServeCommand.run  s    
	U 	U 
	U x( "e"&"e"e   g 	$	(	)	MW 	MD 	M 
*	M 
/	"	Mt 	M 
#	M 
,	-	M 	M 
.	M" 
\	"			S 
 
#	S 
		2 
	2 
		 	 
 	 	Cdiinn499>>TYYM`M`arX   c           	         g d}t         rQ|D cg c]E  }|dt        j                  j                         j                         |j	                  d      d   dG c}S |D cg c]  }t        |       }}|D cg c]5  }|j                  d|j                  j                         |j                  d7 c}S c c}w c c}w c c}w )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructr   /r   )rH  rK  rI  owned_by)	r   datetimenow	timestampsplitr   rH  
created_atauthor)r   modelsr   model_infoss       rY   r  zServeCommand.get_gen_models   s    
  $ $E  %'00446@@B %C 0 3	 $  ;AA&:e,&KA ) )E  ((%$//99; %	 )  Bs   A
B5 B:8:B?r   c           	         	
  j                  |d         		 j                  k7  }	 _        |r0 j                  $ j                  j                  dd       d _         j	                  	      \  }}t        |d      r|j                  n|t        ||j                  j                  j                  ddd	       j                  K|j                  d
       _        t                j                  _         j                  j                          |j                  |d   dd      j!                  |j"                        }	 fd
 
fd} ||d   |      S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   r]  rD  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rO   	streamingmessagespt)return_tensorsadd_generation_promptc              3     K   	 j                  | d       j                  j                  |       D ]\  }|j                  t        j
                  k(  rj                  | d        y j                  | |j                  d   |       ^ y # t        $ rT}t        j                  t        |             j                  j                  |        dt        |       d Y d }~y d }~ww xY ww)	N	assistantr@  r   rr   rA  r   )r>  r?  r   rC  rD  data: {"error": ""})rU  r  request_id_iterr  r*   FINISHEDgenerated_tokens	Exceptionr   r'  rU   cancel_request)r>  rC  resultr-  model_id_and_revisionr   rD  s       rY   stream_chat_completionzPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completion^  s     7 66z[p6qq"FFVVWabF}}(>(>>">>&*0"7 ?  
 ">>'1$*$;$;B$?"7*7&/ ?   c"  7SV$88GG
S*3q6(#667s<   C6A(B -C6.'B C6	C3A
C.)C6.C33C6c                  K   	 t        | j                         d      }j                  j                  | |j                        } ||      D ]$  }| t        j                  d       d {    & y 7 # t
        j                  $ r7 j                  j                  |       t        j                  d| d       Y y w xY ww)NF)r>  r   r   zRequest z was cancelled.)r   tolistr  add_requestr   asynciosleepCancelledErrorr  r   warning)_inputsr>  rC  rT  rO   r   r  s       rY   cancellation_wrapperzNServeCommand.continuous_batching_chat_completion.<locals>.cancellation_wrapperz  s     G ,W^^-=u E!EEQQ
CTCcCc R 
 4JNEK!--*** O*)) G88GG
S*_EFGs<   CA+A: 0A81A: 7C8A: :ACCCCr   )process_model_namer  r  rr   load_model_and_processorr   rD  r   rO   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   )r   r   r>  must_discard_cacher   r   inputsr  rO   r  r  rD  s   `       @@@@rY   ro  z0ServeCommand.continuous_batching_chat_completion.  sa    !% 7 7G E2dooE/77C88==DRS=T;?8889NOy+29k+JI''PY	=$)$;$;"//"//
 33;7<7U7U"3t 8V 8D4 H[G\D44D44::< ..s:tko.pssLL
	78	G $F1Iz::rX   r(   c                     | j                   j                  }|t        j                         v rt        j
                  }|S |t        j                         v rt        j                  }|S t        d|       )NzUnknown modality: )		__class__rQ   r   r`  r   r   r   r   r   )r   model_classnamemodalitys      rY   get_model_modalityzServeCommand.get_model_modality  sm    //22HOOQQ||H   A H H JJ||H  1/1BCDDrX   r  c           	         g }| D ]  }|d   g d}|t         j                  k(  rmt        |d   t              r|d   }nMt        |d   t              r:g }|d   D ]  }|d   dk(  s|j                  |d          ! dj                  |      }|d<   n(|t         j                  k(  rt        |d   t              r|d   j                  d|d   d       n|d   D ]  }|d   dk(  r|d   j                  |        |d   dk(  s)d	|d   d
   v rt        j                  dd|d   d
         }t        j                  t        t        j                  |                  }t        j                   dd      }	|	j"                  }
|j%                  |	j"                         n|d   d
   }
|d   j                  d|
d        |j                  |        |S )Nr@  r@  r?  r?  typerd    )r  rd   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   
isinstancerU   r  appendjoinr   resubr+   openr   r  	b64decodetempfileNamedTemporaryFilenamesave)r  r  processor_inputsmessageparsed_messageparsed_contentr?  
image_datar  r`   r  s              rY   *get_processor_inputs_from_inbound_messagesz7ServeCommand.get_processor_inputs_from_inbound_messages  s   G&-fo"EN8<<' gi0#6%,Y%7N	 2D9%'N#*9#5"6?f4*11'&/B $6 &)XXn%=N,:y)X\\) gi0#6"9-44fgV_N`5ab#*9#5"6?f4*95<<WE$V_;'7;+?+FF-/VV4LbRYZeRfglRm-n
(-

76;K;KJ;W3X(Y'/'B'B&Y^'_&*ii %

499 5&-k&:5&A*95<<gVY=Z[ $6  ##N3O  P  rX   c                      j                   j                   j                   j                  |d<   |d   }|d   d   dk(  ry j                  |d          j                  k7  } _         j	                        \  } j                        } j                  ||      }dt        D ]/  }|j                  j                  d   j                         v s-| n |j                  |d|j                  d	      d
dd      }|j                  j                        }|j                  dd      d}	dj                  j                  d   j                         v rd}	t        ||	d      }
t!        |j"                        }d} j%                  |      r=|s; j&                  j)                         }|d   j*                  d   |kD  r j&                  }i ||
|d|d fd} ||
      S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r  r@  r  r   Ttoolsr  )r  r  r  return_dicttokenizer>  req_0gptossFskip_special_tokensskip_promptr   	input_ids)streamerrO   return_dict_in_generatepast_key_valuesc              3   n  K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}	 |j	                          t               }j                  d	
       | D ]#  }dj                   j                  d   j                         v r|j                  d      }||z  }|r||v rd}LM|j                         t           d   k(  rd|_
        u|j                         t           d   k(  r(|j                          j                  |d d       |j                  r@|xj                  |z  c_        |j                  sYt        j                  d|j                        }	|	|	j!                  d      }	d|_        t#        t%        |	      dd|dz         }
n|dk(  rHd|j                  vrX|xj&                  |j)                  d      z  c_        |xj&                  |j)                  d      z  c_        |j&                  dk  r&dj+                  |j-                  d      d d       dz   }t#        t%        |      dd      }
j                  |d |
g       |dk7  sj                  ||       & j                  |d       |j+                          |j+                          y # t.        $ r9}t0        j3                  t5        |             d t5        |       d! Y d }~Nd }~ww xY w# |j+                          w xY ww)"NFr  r   T<|channel|>final<|message|>c                  L     j                   di | }|j                  _        y NrW   generater  r  r   generate_outputr   r   s     rY   generate_with_cachezbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cache  $    "0%..":6":%4%D%D"rX   targetr   r   r  r  
<|return|>r   r   rB  )r>  r@  rA  r   z\"name\": \"(.*?)\"r#   )r  function
_tool_call)r  rG  r  rH  z"arguments": {{})	arguments)r  rG  r  )r>  r@  rB  r   )r?  r   rr   r  r  r  )configarchitecturesr  r   r   r   rU  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr7   r8   r   countr  r  r  r   r'  rU   )r  _request_id
filter_cotcot_trace_endr  threadresults
tool_stater  	tool_nametoolr-  generation_kwargsr   r  r>  r   tool_model_familys               rY   r  zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion  sG     J M5<<55a8>>@@!
 =E #6?PQFGg&[
 66z[p6qq&F5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G "(G3).J$$ )4!<<>->?P-QRY-ZZ:>J7$ "<<>->?P-QRW-XX&,,."&"B"B+6%).:&;	 #C #  %%66&--7- $.#C#C,.II6LjN_N_,`	#,#4$,090BICG
 @':-Hi-X*+)3'2\'A	(" $*R<$, $4:;L;L#L$, !+ < <S@Q Q < * < <S@Q Q <#-#?#?!#C-/WWV\\#5Fs5K-Ls-RF':-HSY-Z*+)3(" #'"B"B+6Ttf\q #C #  % |">>'?T ?  i 'n 66{RX`u6vv   7SV$*3q6(#667
 sC   AL5H>K ?K 
L5	L$/LL  LL   L22L5)r   r   r  r  r  r  r  _MODELS_WITH_TOOL_SUPPORTr  r  r  r  r   r  r   r    r   rO   is_continuationr  get_seq_lengthshape)r   r   r  r  r   r  r  supported_model_familiesr  r  generation_streamerrO   r  seq_lenr  r"  r   r  r>  r#  s   `              @@@@@rY   rq  z%ServeCommand.generate_chat_completion  s"    99  ,9900CL9<Z B<;. $ 7 7G E2dooE/889NOy**51JJ8U]^ !(A$'5<<+E+Ea+H+N+N+PP$<! )B .."&'''" / 
 5<<(WW\73
 #u||11!4::<<"'2 3

 >c[`[r[rs$-?((779Gk"((,w6 $ 2 2

+!2'+,
x	 x	t &&9:FFrX   c                 <     j                  d          j                  k7  } _         j                        \  }t        d   t              r'dv r	dd   dgng }|j                  dd   d       nt        d   t              r8dv r.d   d   d   dk7  rdd   dgd   }nYd   }d   |d   d	<   nHd   }nBt        d   t              r$dv r	dd   dgng }|j                  d          nt        d
      |j                  |dd      }|j                  j                        }j                  dd      d}dj                  j                  d   j                         v rd}t!        ||d      }t#        j$                        }d} j'                        r=|s; j(                  j+                         }	|d   j,                  d   |	kD  r j(                  }|t/        j0                  |      ||d|d fd}
 |
|      S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  re   r   r@  r?  z%inputs should be a list, dict, or strTr  )r  r  rp   r  r  Fr  r  Nr  r  )r  attention_maskr  rO   r  r  c              3   .	  K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}d}d}	 |j	                          t        j
                         }	t        d|t        d	 |	d
j                  d      dddiidg g j                  dd      dj                  d                  }
|dz  }j                  |
       t        d|t        d	 |	dj                  d      dddiidg g j                  dd      dj                  d                  }|dz  }j                  |       t        d||t        d dddg             }|dz  }j                  |       t        dd |||t        dd g !      "      }|dz  }j                  |       d }| D ]~  }dj                   j                  d   j                         v r|j                  d#      }||z  }|r
||v rd}d }MNt!        d$d ||||d d%d&g'      }|dz  }j                  |        t#        d(d ||d|d d%d&g)      }|dz  }j                  |       t%        d*d |||t        d|j&                  g !      "      }|dz  }|dz  }j                  |       t)        d+||t        d dd,d|j*                  gg -            }|dz  }|dz  }j                  |       t-        d.|t        d	 |	d,j                  d      dddii|j.                  gdg j                  dd      dj                  d      /            }|dz  }j                  |       |j1                          |j1                          y # t2        $ r}t4        j7                  d0t9        |              t;        d1|t9        |      2      }|dz  }j                  |       t=        d3|t        d	 	d4j                  d      dddiig dg ddj                  d      t?        d5t9        |      6      7            }|dz  }j                  |       Y d }~d }~ww xY w# |j1                          w xY ww)8NFr  r   Tr   c                  L     j                   di | }|j                  _        y r  r  r  s     rY   r  zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cache  r  rX   r	  zresponse.createdresp_queuedr-  formatr  rd   rV  r   r   ru   )rH  r  r  r   r-  rd   rK  r  rr  r   rl   ru   )r  sequence_numberrV  r#   zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )rH  r  r  r@  r?  )r  r5  output_indexitemzresponse.content_part.addedoutput_textr   )r  rd   annotations)r  item_idr5  r8  content_indexpartr  zresponse.output_text.deltagX@)tokenlogprob)r  r<  r5  r8  r=  rF  rt   zresponse.output_text.done)r  r<  r5  r8  r=  rd   rt   zresponse.content_part.donezresponse.output_item.done	completed)rH  r  r  r@  r?  r;  zresponse.completed)rH  r  r  r   r-  rd   rr  rK  r  r   rl   ru   z"Exception in response generation: r'  )r  r5  r  zresponse.failedfailedserver_error)coder  )rH  r  r  r   r-  rd   rr  rK  r  r   rl   ru   r'  ) r  r  r  r   r   rR  r>   r:   r   rX  rB   rC   rE   r<   rF   r  rG   rH   r=   rd   rD   r>  r;   r9  r  r  r   r'  rU   r@   rA   r?   )r  r  r  r  r  r  r5  r8  r=  r  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedr  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr-  error_eventresponse_failedr"  r   r  r   r>  r   s                           rY   stream_responsez7ServeCommand.generate_response.<locals>.stream_response  s     J M5<<55a8>>@@!
 =E #6?PQFOLMM!YY[
 $8+$3%":,/#-'3%(WW^%<&(89) !,/GG4I5,Q$*!$!4$ $  1$//0@AA'>/$3%":,/#-,3%(WW^%<&(89) !,/GG4I5,Q$*!$!4($$  1$//0DEE .J5$3!-.!*.Y}[fpr	.*  1$//0JKK /L6":,/$3!-"/+RUWX/+  1$//0KLL &F5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G "(G3).J&(G$$1G9"&zl 3(7%1&3$,.4"@!A2. $q(O334NOO3 '8 -B4":,/$3!-"# (*t<=-)  1$//0IJJ .J5":,/$3!-"/+E^EcEcqst.*  1$"//0JKK -H4$3!-.!*.&*(!;!@!@ A$&	-)  1$!//0IJJ &<-$3%":,/#-*3%(WW^%<&(89 9 > >?) ,/GG4I5,Q$*!$!4&"$  1$//0BCCJ I  !AA#a&JK0 $3F
  1$//<<"5*$3%":,/#-'3%(WW^%<&(89!) ,1$*!$!4+!/$'F#,  1$//@@C!AH s>   ARMN% R%	Q=.CQ83R  8Q==R   RR)r  r  r  r  rU   r  r  r  r   r  r  r   r   r  r  r  r    r   rO   r%  r  r&  r'  r   	ones_like)r   r   r  r   r  r  r)  rO   r  r*  rP  r"  r   r  r>  s   ``         @@@@rY   ru  zServeCommand.generate_response  s]    !% 7 7G E2dooE/889NOyc'lC(M[_bMbxC4GHIhjFMM6c'lCDGd+$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd+M[_bMbxC4GHIhjFMM#g,'DEE..vTbf.g5<<(WW3W=
 #u||11!4::<<"'2 3

 >c[`[r[rs$-?((779Gk"((,w6 $ 2 2 #oof5+!2'+,
`	 `	D 2J??rX   c                 (  
 t               st        d      | j                  |d         }| j                  |      \  t	        j
                  dd      }t        |j                        }j                  j                  }t        j                  |d         }t        j                  ||d      \  }} ||d	      j                  j                        

d
   j                  j                         
d
<   ||dd
fd}	 |	       S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  r`   )srmonor  )sampling_rater  input_features)r  rO   r  c               3      K    j                   di } j                  | j                  d      d   }t        |      }|j	                  d        y w)NT)r  r   )rd   rM  rW   )r  batch_decode	sequencesr1   rS  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorr"  s      rY   _generate_transcriptionzDServeCommand.generate_transcription.<locals>._generate_transcription  sg     0K00U<UCTUM!0!=!=m>U>Uko!=!pqr!s)/ABM"222EFGs   AA)r   r  r  load_audio_model_and_processorr    rD  r   rO   feature_extractorrU  ior   librosaloadr  r   r   )r   r   r  r)  rO   model_sampling_rateaudio_bytesaudio_array_r`  r]  r^  r_  r"  s             @@@@rY   r~  z#ServeCommand.generate_transcription  s"    $%o  !% 7 7G E'+'J'JK`'a$_2%%4T
 >)F)F

 .??MMjjV- k6IPTUQ&{BUfjknn
 *66F)G)J)J;K\K\)]%& ,!2'+
	H '((rX   c                 >   |j                  d      xs |j                  d      }d}| j                  d}n`t        | j                        t        |      k\  rd}n<t        t        | j                              D ]  }| j                  |   ||   k7  sd} n || _        |S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  r,  TF)r   r  lenrange)r   r   r  req_continues_last_messagesis        rY   r%  zServeCommand.is_continuation  s     77:&:#'''*:&*# %*/'##$H5*/' 3t1123%%a(HQK727/ 4
 &**rX   r&   c                     | j                   r:t        d| j                  | j                  | j                  | j                        }|S | j
                  rt        d      }|S d}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   bnb_4bit_compute_dtyper   bnb_4bit_use_double_quantbnb_4bit_quant_storage)r   N)r   r&   r   r   r   r   )r   quantization_configs     rY   get_quantization_configz$ServeCommand.get_quantization_config   sr     "4!'+zz$($<$<*.*C*C'+zz# #" "4!# #" #'""rX   model_idc                 p    | j                   j                  | j                   j                  }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        @z@main)r   r   )r   ru  s     rY   r  zServeCommand.process_model_name  s<     99  ,yy,,H(?O5!!rX   r  c                    | j                   }t        j                  d|        d|v r|j                  dd      \  }}n|d}}t	        j
                  |||j                        }|j                  dv r|j                  nt        t        |j                        }| j                  |      }||j                  |d|j                  d}|||d
<   t        j
                  |fi |}	t        t        |	j                  d         }
 |
j
                  |fi |}t        |dd	      |j                  |j                         }|j"                  j$                  d	u xr |j"                  j&                  dk(  }|j"                  j$                  d	uxr |j"                  j$                  dk  }|s|rd|j"                  _        t        j                  d|        ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading rw  r#   main)revisionr   )r   Nr   )rz  r   r   
device_mapr   Nrs  r   hf_device_map   rx  zLoaded model )r   r   r   r  r%   from_pretrainedr   r   getattrr   rt  r   r   r	  r  r  r   rO   r   
max_length)r   r  r   ru  rz  data_processorr   rs  model_kwargsr  architecturer   has_default_max_lengthhas_short_max_new_tokenss                 rY   _load_model_and_data_processorz+ServeCommand._load_model_and_data_processor.  s    yyh4567''!6!<!<S!!DHh!6hH&66"44
 #jjN:

tzz@Z"::4@ !#'#;#; !%!7!7
 *2EL./++HEE|V-A-A!-DE,,,XFF5/408HHT[[)E ##22d:gu?V?V?a?aeg?g 	 ##22$>p5CZCZCiCilpCp 	! "%=59E##2m$9#:;<n$$rX   c                    || j                   vs| j                   |   j                         rG| j                  |      \  }}t        || j                  j
                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r  r   r  r   r   r   r   r   r   )r   r  r   r   s       rY   r  z%ServeCommand.load_model_and_processork  s     !(:(::d>P>PQf>g>r>r>t#BBCXYE98B $		 7 7#9D45 i	 45AAC&&'<=CCE**+@AKKIirX   c                    || j                   vs| j                   |   j                         rG| j                  |      \  }}t        || j                  j
                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   r  r^  r_  s       rY   ra  z+ServeCommand.load_audio_model_and_processor  s     !(:(::d>P>PQf>g>r>r>t+/+N+NOd+e(K8B $		 7 7)9D45 O++	 45AAC,,-BCIIK"001FGQQOO++rX   )r   NNNNNNN).rQ   rR   rS   staticmethodr   r  r   r   r  r   r$  r/  r5  r9  r=  rU   r   r   r  r   r   rU  rX  r  	functoolscacheanyr  r   ro  r   r  r  r   rq  ru  r~  rc   r%  rt  r  r  tupler  r   ra  rW   rX   rY   r   r     s   	>N 	> 	>+^ +Z// / !	/
 /b
 

 

d 
 !%#"'+<@047;6G6G #6G }	6G
 sm6G  }6G T"7896G  -6G 346G 
6GpJ[ JS J ]b~ __+T#s(^ 4 + +ZZ;t Z; Z;Q_`cei`iQj Z;x 	"3 	 	 	 + x +  + ZGGD GGYsD$5O GGRc@T c@iT4.H c@J	.)$ .)9S$_3M .)`+4 +D +< #n #BV9W # #8"3 "3 "";%C ;%z %( 	 "99	: 6,C ,ERcesRsLt ,rX   r   __main__)r  r  r   r  enumr  r   rc  r   r  r  r   rR  r  argparser   r   collections.abcr   r   r   
contextlibr   dataclassesr	   r
   r   r   typingr   r   r   huggingface_hubr   huggingface_hub.constantsr   tokenizers.decodersr   r	  &transformers.models.auto.modeling_autor   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r   r    utilsr!   r"   r$   r   r%   r&   r'   r(   generation.continuous_batchingr)   r*   rd  PILr+   r
  r  r  r,   r-   fastapi.middleware.corsr.   fastapi.responsesr/   r0    openai.types.audio.transcriptionr1   .openai.types.audio.transcription_create_paramsr2   openai.types.chatr3   'openai.types.chat.chat_completion_chunkr4   r5   r6   r7   r8   *openai.types.chat.completion_create_paramsr9   openai.types.responsesr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   -openai.types.responses.response_create_paramsrI   pydanticrJ   rK   rL   rN   r]   r_   r2  r7  r;  r3  r8  r<  r  rQ   r   r  r  r%  r$  r  Enumr   r   r  r   r   r   r   r   r   r  rW   rX   rY   <module>r     sr         	 	  	     . ? ? * (   - - & 4 ,    0 (   Z k 4 6k;O;QkViVk   .6A>\<  [    " \@@4QY^ 6U]b 0MUZ  %%NO&'RS)*OP %!.# 
		H	%
   !!2!7!7!9: tyy 	 8	8/8 	8v /@ /@d n n nbG,- G,T& zNE	IIK rX   