o
    
sho                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZmZ d dlmZmZ d dlmZ d dl
mZ d dlmZmZ d dlmZ d d	lmZ d dlZd d
lm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 ddl)m2Z2 e0 rd dl3Z3d dlm4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z: e$ rd dl;Z;e( rd dl<m=Z= e& oe# oe' oe% Z>e>rd dl?Z?d dl@mAZAmBZB d dlCmDZD d dlEmFZFmGZG d dlHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZg d dlhmiZimjZjmkZk G dd  d egd!d"ZlG d#d$ d$eUd!d"ZmG d%d& d&eKd!d"ZnejelZoejemZpejenZqh d'Zrh d(Zsh d)Zte1uevZwd*d+d,d-iZxeyexz Z{G d.d/ d/ej|Z}d0efd1d2Z~d3ed4d5d6d5fd7d8ZG d9d: d:ZG d;d< d<ZeG d=d> d>ZG d?d@ d@e2ZevdAkre Ze  dS dS )B    N)ArgumentParser	Namespace)	GeneratorIterable)	dataclassfield)BytesIO)Thread)OptionalUnion
model_infoHF_HUB_OFFLINE)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                   @      e Zd ZU dZeed< dS ))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__ rT   rT   [/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/transformers/commands/serving.pyrK   w      
 rK   F)totalc                   @   rJ   )+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with an additional field for the generation config (as a json string).
        rL   NrM   rT   rT   rT   rU   rX   ~   rV   rX   c                   @   s2   e Zd ZU dZeed< eed< dZee	 ed< dS )%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerL   FstreamN)
rN   rO   rP   rQ   bytesrS   rR   r[   r
   boolrT   rT   rT   rU   rY      s
   
 rY   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopr_   audior`   logprobsmetadata	functions
modalities
predictionrf   rg   rh   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   ra   rb   languageru   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendc                   @   s   e Zd ZdZdZdZdZdS )ModalityLLMVLMSTTTTSN)rN   rO   rP   r   r   r   r   rT   rT   rT   rU   r      s
    r   argsc                 C   s   t | S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommand)r   rT   rT   rU   serve_command_factory   s   r   reqmodel_generation_configr$   returnc                 K   sZ  |  ddurtdi t| d }nt|}|jdi |}| D ]\}}|dur3t||| q%|  ddurBt	| d |_
|  ddurPt	| d |_
|  ddur^t| d |_|  ddurj| d |_|  ddurv| d |_|  ddurt| d |_t| d d	krd
|_|  ddurt| d |_|  ddurt| d  |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rL   Nmax_output_tokens
max_tokensfrequency_penalty
logit_biasrl   temperatureg        Ftop_pseedrT   )getr$   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   torchmanual_seed)r   r   kwargsrL   non_standard_kwargskvrT   rT   rU   !create_generation_config_from_req   s6   


r   c                   @   s    e Zd ZdZdd Zdd ZdS )	ToolStatez7Lightweight class to keep track of the tool call state.c                 C   s   |    d S N)resetselfrT   rT   rU   __init__!  s   zToolState.__init__c                 C   s   d| _ d| _d| _d| _dS )z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   rT   rT   rU   r   $  s   
zToolState.resetN)rN   rO   rP   rQ   r   r   rT   rT   rT   rU   r     s    r   c                	   @   sJ   e Zd ZdZ	ddddedeed  fdd	Zd
d Zdd Z	dd Z
dS )
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr%   timeout_seconds	processor)r   r   c                 C   s>   || _ t|j| _|| _|| _t| j| j| _	| j	
  d S r   )r   rR   name_or_path_name_or_pathr   r   	threadingTimer_delete_model_timerr   )r   r   r   r   rT   rT   rU   r   2  s   zTimedModel.__init__c                 C   s*   | j   t| j| j| _ | j   dS )z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   rT   rT   rU   reset_timer?  s   
zTimedModel.reset_timerc                 C   sj   t | dr1| jdur3| `| `d| _d| _t  tj r"tj  t	
| j d| j d dS dS dS )z>Delete the wrapped model and processor and clean up resources.r   Nz was removed from memory after z seconds of inactivity)hasattrr   r   gccollectr   cudais_availableempty_cacheloggerinfor   r   r   rT   rT   rU   r   E  s   

zTimedModel._delete_modelc                 C   s   t | d p
| jdu S )z)Check if the instances have been deleted.r   N)r   r   r   rT   rT   rU   
is_deletedV  s   zTimedModel.is_deletedr   )rN   rO   rP   rQ   r   r
   r   r   r   r   r   rT   rT   rT   rU   r   ,  s    	

r   c                   @   s  e Zd ZU dZedddidZeed< eddg d	d
dZe	e ed< eddg d	d
dZ
e	e ed< edddidZeed< edddidZe	e ed< edddidZeed< edddidZeed< eddddgd
dZeed< edddidZeed< edddidZeed< ed dd!idZeed"< ed#dd$idZeed%< ed&dd'idZeed(< eddd)idZe	e ed*< eddd+idZeed,< eddd-idZeed.< eddd/idZe	e ed0< d1d2 ZdS )3ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    autohelpzfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.)defaultro   deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   bfloat16float16float32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypeFz2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                 C   s(   | j dur| jdkr| j | _dS dS dS )z(Only used for BC `torch_dtype` argument.Nr   )r   r   r   rT   rT   rU   __post_init__  s   zServeArguments.__post_init__)rN   rO   rP   rQ   r   r   rR   rS   r   r
   r   r   r]   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rT   rT   rT   rU   r   [  s   
 
r   c                   @   s  e Zd ZedefddZdefddZdedd	d
dde	fddZ
defddZdefddZdefddZ						dHdee dee dee dee dee deed  defdd Zd!d"defd#d$Zd%d& Zejdd'deeeef  fd(d)Zd*edeeddf fd+d,Zedd-defd.d/Zed0efd1d2Zd*edeeddf fd3d4Zd*edeeddf fd5d6Zd*edeeddf fd7d8Zd*ede fd9d:Z!ededed; fd<d=Z"d>edefd?d@Z#dAefdBdCZ$dAede%d-e&f fdDdEZ'dAede%d-e(f fdFdGZ)dS )Ir   parserc                 C   s$   t f}| jd|d}|jtd dS )z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsr   )r   r   serve_parserrT   rT   rU   register_subcommand  s   z ServeCommand.register_subcommandr   c                 C   s   t std|| _| jjdk| _| jj| _| jjd ur"t| jj t	
d}|t	j| jj   t	
d}|t	j| jj   i | _d | _d | _d | _d | _d S )NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`
sdpa_pagedtransformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorr   r   use_continuous_batchingr   r   r   r   r   
get_loggersetLevel
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   r   transformers_logger	cb_loggerrT   rT   rU   r     s$   



zServeCommand.__init__requestschema_TypedDictMeta	validatorrH   unused_fieldsc           
   
   C   s   t d|  t| }|j}|| }|r(t d|  tdd| d| jjriz|	| W n t
yQ } zt d|   td| dd}~ww ||@ }	|	rkt d|	  tdd|	 ddS dS )a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`_TypedDictMeta`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeys__mutable_keys__errorr*   r   r   validate_pythonrI   errors)
r   r   r   r  r  
input_keyspossible_keysunexpected_keyseunused_fields_in_requestrT   rT   rU   _validate_request  s.   

zServeCommand._validate_requestc                 C      | j |tttd d S N)r   r   r  r  )r  rK   response_validatorUNUSED_RESPONSE_FIELDSr   r   rT   rT   rU   validate_response_request!     
z&ServeCommand.validate_response_requestc                 C   r  r  )r  rX   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr  rT   rT   rU    validate_chat_completion_request)  r  z-ServeCommand.validate_chat_completion_requestc                 C   r  r  )r  rY   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr  rT   rT   rU   validate_transcription_request1  r  z+ServeCommand.validate_transcription_requestr   N
request_idcontentr   rolefinish_reason
tool_callsr4   r   c              
   C   sF   t |tt |tt|||dd|dgddd}d|jdd	 d
S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        )r   r!  r#  r   )deltaindexr"  r   zchat.completion.chunk)idcreatedr   r   system_fingerprintobjectdata: Texclude_none

)r1   r   timer2   r3   model_dump_json)r   r  r   r   r!  r"  r#  chunkrT   rT   rU   build_chat_completion_chunk9  s$    
z(ServeCommand.build_chat_completion_chunkresponserG   c                 C   s   d|j dd dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        r*  Tr+  r-  )r/  )r   r2  rT   rT   rU   build_response_eventm  s   z!ServeCommand.build_response_eventc                    s   t  } jr|jtdgddgdgd td ntd |ddtf fdd	}|d
dtf fdd}ddlm	} |dd|f fdd}|
d|d fdd}tj| jj jj jjd d S )N*T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.zaSome apps may require CORS. Consider launching the server with `--enable-cors` if you see errors.z/v1/chat/completionsr   c                    s4    j | d  jr | }n | }t|ddS Nr   text/event-stream
media_type)r  r   #continuous_batching_chat_completiongenerate_chat_completionr-   r   outputr   rT   rU   chat_completion  s
   
z)ServeCommand.run.<locals>.chat_completionz/v1/responsesc                    s"    j | d  | }t|ddS r9  )r  generate_responser-   r@  r   rT   rU   	responses  s   
z#ServeCommand.run.<locals>.responsesr   )Requestz/v1/audio/transcriptionsc              
      s   |   4 I d H 5}t|d  I d H |d d}td|d j d|d j d|d jd dd	 W d   I d H  n1 I d H sDw   Y   j|d
  	|}t
|ddS )NrZ   r   )rZ   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr:  r;  r<  )formrY   readr   r  filenamecontent_typesizer  generate_transcriptionr-   )r   rG  parsed_requestrA  r   rT   rU   audio_transcriptions  s   (

z.ServeCommand.run.<locals>.audio_transcriptionsz
/v1/modelsc                      s   t d  dS )Nlist)r)  data)r,   get_gen_modelsrT   r   rT   rU   get_all_models  s   z(ServeCommand.run.<locals>.get_all_models)r   r   r   )r)   r   add_middlewarer+   r   warning_oncepostdictfastapirE  optionsr   uvicornrunr   r   r   r   )r   apprB  rD  rE  rN  rR  rT   r   rU   rZ  }  s4   	"zServeCommand.run)maxsizec                 C   s6   g d}t rdd |D S dd |D }dd |D S )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructc                 S   s.   g | ]}|d t j   |dd dqS )r   /r   r&  r)  r'  owned_by)datetimenow	timestampsplit.0r   rT   rT   rU   
<listcomp>  s    z/ServeCommand.get_gen_models.<locals>.<listcomp>c                 S   s   g | ]}t |qS rT   r   rd  rT   rT   rU   rf    s    c                 S   s$   g | ]}|j d |j |jdqS )r   r^  )r&  
created_atrb  authorrd  rT   rT   rU   rf    s    r   )r   modelsmodel_infosrT   rT   rU   rQ    s   	
zServeCommand.get_gen_modelsr   c                    s    d jk}_|r!jdur!jjddd d_\}}t|dr0|jn|}t|j|j	|j
ddd	dd
dd
 jdu rZ|j dd_t j_j  |jd ddd|j} fdd}||d S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   )blocktimeout	tokenizerFr    rF  
   fifo)	r   eos_token_idpad_token_id	use_cache
num_blocks
block_sizer   max_batch_tokens	scheduler)rL   	streamingmessagespt)return_tensorsadd_generation_promptc              
   3   s
   z`j j| d jd}d}j|ddV  j D ]A}|j|kr%qdd ur7|s7|jtjkr5qd}|jtjkr?dnd }|jtjkrTj||dV   W d S j||j	d	V  qW d S  t
y } ztt| d
t| dV  W Y d }~d S d }~ww )Nr  )r  r   F	assistantr!  r   Trl   r"  r   )r  r   r   data: {"error": ""})r   add_requestr   r   r1  r  statusr'   FINISHED
next_token	Exceptionr   r	  rR   )_inputsr  queue_is_flushedresultr"  r  rL   model_id_and_revisionr   r   rT   rU   stream_chat_completion  s:   


 zPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completionr   )process_model_namer   r   rl   load_model_and_processorr   rm  r   rL   rp  rq  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   )r   r   must_discard_cacher   r   rm  inputsr  rT   r  rU   r>    s@   




$z0ServeCommand.continuous_batching_chat_completionr%   c                 C   sB   | j j}|t v rtj}|S |t v rtj}|S td| )NzUnknown modality: )		__class__rN   r   valuesr   r   r   r   
ValueError)r   model_classnamemodalityrT   rT   rU   get_model_modalityE  s   zServeCommand.get_model_modalityr  c                 C   s~  g }| D ]}|d g d}|t jkrEt|d tr|d }n"t|d tr@g }|d D ]}|d dkr:||d  q+d|}||d< nr|t jkrt|d tr^|d d|d d nY|d D ]T}|d dkrr|d | qb|d dkrd	|d d
 v rt	dd|d d
 }t
tt|}tjddd}	|	j}
||	j n|d d
 }
|d d|
d qb|| q|S )Nr!  r!  r   r   typer^    )r  r^   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   
isinstancerR   rO  appendjoinr   resubr(   openr   r  	b64decodetempfileNamedTemporaryFilenamesave)rx  r  processor_inputsmessageparsed_messageparsed_contentr   
image_datar  rZ   r  rT   rT   rU   *get_processor_inputs_from_inbound_messagesQ  s@   




z7ServeCommand.get_processor_inputs_from_inbound_messagesc                    sZ  j jdurj j|d< |d }|d d dkrdS |d jk}_\}}||}dtD ]}|jj	d 
 v rO| nq?|j|d|d	d
ddd}|j}|ddd}	djj	d 
 v rxd}	t||	dd}
t|jd}d}|r|sj}i ||
|d|d  fdd}||
S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   rx  r!  r|  r   Ttoolsry  )r{  r  rz  return_dicttokenizer  req_0gptossFskip_special_tokensskip_promptr   )streamerrL   return_dict_in_generatepast_key_valuesc              
   3   s   d}d }dj jd  v rd}d}fdd}t| d}d	}z.z|  t }jd
dV  | D ]}dj jd  v rQ|drQ|d td  }||7 }|r_||v r^d}q7q7d ur|	 t
 d krrd|_q7|	 t
 d kr|  j|d ddV  q7|jr| j|7  _|jstd|j}	|	d u rq7|	d}	d|_tt|	ddd|d d}
n<|d	krq7d|jvrq7| j|d7  _| j|d8  _|jdk rd	|dd d d }tt|dddd}
j|d |
gdV  q7|d	krj||dV  q7j|dd V  |  W n# tyC } ztt| d!t| d"V  W Y d }~nd }~ww W |  d S W |  d S |  w )#NFr  r   T<|channel|>final<|message|>c                         j di | }|j_d S NrT   generater  r   r   generate_outputr   r   rT   rU   generate_with_cache     zbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cachetargetr   r   r|  r}  
<|return|>r   r   r#  )r  r!  r"  r   z\"name\": \"(.*?)\"r    )r  function
_tool_call)r  r%  r  r&  z"arguments": {{})	arguments)r  r%  r  )r  r!  r#  r   )r   r   rl   r~  r  r  )configarchitecturesr   r	   r   r   r1  endswithlenstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr4   r5   r   countr  rc  r  r   r	  rR   )r  _request_id
filter_cotcot_trace_endr  threadresults
tool_stater  	tool_nametoolr  generation_kwargsr   r  r  r   tool_model_familyrT   rU   r    s   






zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion)r   r   r  r   r  r  r  _MODELS_WITH_TOOL_SUPPORTr  r  r   r  r   r  r   r   r   rL   is_continuationr   )r   r   rx  r  r   r  r  supported_model_familiesr  r  generation_streamerrL   r   r  rT   r  rU   r?    sb   



{z%ServeCommand.generate_chat_completionc           
         s   d jk}_\}td tr6dv r)dd dgng }|dd d nUtd trjdv red d d dkrXdd dgd }n3d }d |d d	< n&d }n!td trdv r}dd dgng }|d  ntd
|j	|ddd}|
j}ddd}djjd  v rd}t||dd}tjd}d}r|sǈj}|t|||d|d  fdd}	|	|S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  r_   r   r!  r   z%inputs should be a list, dict, or strTry  )r{  rz  rj   r  r  Fr  r  N)r  attention_maskr  rL   r  r  c                 3   sF   d}d }dj jd  v rd}d}fdd}t| d}d}d}d}zz|  t }	td	|td
 |	dddddiidg g dddddd}
|d7 }	|
V  t
d|td
 |	dddddiidg g dddddd}|d7 }	|V  td||td dddg dd}|d7 }	|V  tdd |||td d!g d"d#}|d7 }	|V  d!}| D ]F}dj jd  v r|d$r|d td$  }||7 }|r||v rd}d!}qqtd%d ||||d!d&d'gd(}|d7 }	|V  qtd)d ||d|d!d&d'gd*}|d7 }	|V  td+d |||td |jg d"d#}|d7 }|d7 }	|V  td,||td dd-d|jgg d.d}|d7 }|d7 }	|V  td/|td
 |	d-ddddii|jgdg ddddd0d}|d7 }	|V  |  W nc ty } zVtd1t|  td2|t|d3}|d7 }	|V  td4|td
 |	d5ddddiig dg dddt d6t|d7d8d}|d7 }	|V  W Y d }~nd }~ww W |  d S W |  d S |  w )9NFr  r   Tr  c                     r  r  r  r  r  rT   rU   r    r  zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cacher  zresponse.createdresp_queuedr  formatr  r^   r2  ry   r   ro   )r&  rg  r  r   r  r^   r)  r  rA  ry   rf   ro   )r  sequence_numberr2  r    zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r|  )r&  r  r  r!  r   )r  r  output_indexitemzresponse.content_part.addedoutput_textr   )r  r^   annotations)r  item_idr  r  content_indexpartr  zresponse.output_text.deltagX@)tokenlogprob)r  r  r  r  r  r$  rn   zresponse.output_text.done)r  r  r  r  r  r^   rn   zresponse.content_part.donezresponse.output_item.done	completed)r&  r  r  r!  r   r  zresponse.completed)r&  rg  r  r   r  r^   rA  r)  r  ry   rf   ro   z"Exception in response generation: r	  )r  r  r  zresponse.failedfailedserver_error)coder  )r&  rg  r  r   r  r^   rA  r)  r  ry   rf   ro   r	  )!r  r  r   r	   r   r.  r;   r7   r   r3  r?   r@   rB   r9   rC   r  r  rD   rE   r:   r^   rA   r  r8   r  r  r  r   r	  rR   r=   r>   r<   )r  r  r  r  r  r  r  r  r  rg  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedr  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr  error_eventresponse_failedr  r   r  r   r  r   rT   rU   stream_response  s  





	
	


%z7ServeCommand.generate_response.<locals>.stream_response)r  r   r  r  rR   r  rO  rV  r  r  r  r   r   r  r  r   r   r   rL   r  r   r   	ones_like)
r   r   r  r   r  r  r  rL   r   r  rT   r  rU   rC  G  sV   

	 
dzServeCommand.generate_responsec           
         s   t  std| |d }| |\tjddd}t|jd}jj	}t
|d }tj||dd\}}||dd	j  d
 j d
< ||dd fdd}	|	 S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  rZ   )srmonory  )sampling_raterz  input_features)r  rL   r  c                  3   sH    j di  } j| jddd }t|d}|jdd V  d S )NT)r  r   )r^   r+  rT   )r  batch_decode	sequencesr.   r/  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorr  rT   rU   _generate_transcription  s
   
zDServeCommand.generate_transcription.<locals>._generate_transcription)r   r   r  load_audio_model_and_processorr   rm  r   rL   feature_extractorr  ior   librosaloadr  r   r   )
r   r   r  r  rL   model_sampling_rateaudio_bytesaudio_array_r  rT   r  rU   rL  k  s2   z#ServeCommand.generate_transcriptionc                 C   sx   | dp	| d}d}| jdu rd}n#t| jt|kr d}ntt| jD ]}| j| || kr6d} nq'|| _|S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        rx  r  TNF)r   r   r  range)r   r   rx  req_continues_last_messagesirT   rT   rU   r    s   
zServeCommand.is_continuationr#   c                 C   s@   | j rtd| j| j| j| jd}|S | jrtdd}|S d}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   bnb_4bit_compute_dtyper   bnb_4bit_use_double_quantbnb_4bit_quant_storage)r   N)r   r#   r   r   r   r   )r   quantization_configrT   rT   rU   get_quantization_config  s    z$ServeCommand.get_quantization_configmodel_idc                 C   s*   | j jdur
| j j}d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        N@z@main)r   r   )r   r0  rT   rT   rU   r    s
   
zServeCommand.process_model_namer  c                 C   s>  | j }td|  d|v r|dd\}}n|d}}tj|||jd}|jdv r.|jntt	|j}| 
|}||j|d|jd}|d	urK||d
< tj|fi |}	tt|	jd }
|
j|fi |}t|dd	d	u rs||j}|jjd	u o~|jjdk}|jjd	uo|jjdk }|s|rd|j_td|  ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading r1  r    main)revisionr   )r   Nr   )r3  r   r   
device_mapr   Nr.  r   hf_device_map   rF  zLoaded model )r   r   r   rc  r"   from_pretrainedr   r   getattrr   r/  r   r   r   r  r  r   rL   r   
max_length)r   r  r   r0  r3  data_processorr   r.  model_kwargsr  architecturer   has_default_max_lengthhas_short_max_new_tokensrT   rT   rU   _load_model_and_data_processor  sB   

z+ServeCommand._load_model_and_data_processorc                 C   t   || j vs| j |  r#| |\}}t|| jj|d| j |< ||fS | j |   | j | j}| j | j}||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r   r   r?  r   r   r   r   r   r   )r   r  r   r   rT   rT   rU   r  $  s   
z%ServeCommand.load_model_and_processorc                 C   r@  )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        rA  rB  )r   r  r  r  rT   rT   rU   r  ?  s   
z+ServeCommand.load_audio_model_and_processor)r   NNNNN)*rN   rO   rP   staticmethodr   r   r   r   rV  r  r  r  r  r  r
   rR   rO  r1  r3  rZ  	functools	lru_cacheanyrQ  r   r>  r   r  r  r?  rC  rL  r]   r  r/  r  r?  tupler   r  r   r  rT   rT   rT   rU   r     s~     
1


4
A-Y- I  &0=

r   __main__)r  r   r`  enumrD  r   r!  r   r  r  r   r.  argparser   r   collections.abcr   r   dataclassesr   r   r   r	   typingr
   r   huggingface_hubr   huggingface_hub.constantsr   r   &transformers.models.auto.modeling_autor   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   r!   r   r"   r#   r$   r%   generation.continuous_batchingr&   r'   r"  PILr(   r   rY  rW  r)   r*   fastapi.middleware.corsr+   fastapi.responsesr,   r-    openai.types.audio.transcriptionr.   .openai.types.audio.transcription_create_paramsr/   openai.types.chatr0   'openai.types.chat.chat_completion_chunkr1   r2   r3   r4   r5   *openai.types.chat.completion_create_paramsr6   openai.types.responsesr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   -openai.types.responses.response_create_paramsrF   pydanticrG   rH   rI   rK   rX   rY   r  r  r  r  r  r  r   rN   r   r  rO  r  r  Enumr   r   rV  r   r   r   r   r   r   rZ  rT   rT   rT   rU   <module>   s    	D


	
;/g         
