
    3j$                         S r SSKrSSKrSSKrSSKrSSKJr  SSKrSSKJ	r	  SSK
Jr  SSKJr  \	R                  " \5      r " S S	\\R$                  5      r " S
 S5      rS\l         g)z+
CLI entry point for `transformers serve`.
    N)	Annotated)logging)is_serve_available   )set_torch_seedc                        \ rS rSrSrSrSrSrg)ReasoningMode#   onoffauto N)__name__
__module____qualname____firstlineno__ONOFFAUTO__static_attributes__r       P/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/cli/serve.pyr	   r	   #   s    	B
CDr   r	   c            3       X   \ rS rSrSSSSS\R
                  SSSSSSSSSSSSSSSS4S	\\S-  \R                  " S
S94   S\\
\R                  " SS94   S\\S-  \R                  " SS94   S\\
\R                  " SS94   S\\S-  \R                  " SS94   S\\\R                  " SS94   S\\S-  \R                  " SS94   S\\\R                  " SS94   S\\S-  \R                  " SS94   S\\
\R                  " SS94   S\\\R                  " SS94   S \\S-  \R                  " S!S94   S"\\S-  \R                  " S#S94   S$\\S-  \R                  " S%S94   S&\\S-  \R                  " S'S94   S(\\
S-  \R                  " S)S94   S*\\\R                  " S+S94   S,\\\R                  " S-S94   S.\\
\R                  " S/S94   S0\\\R                  " S1S94   S2\\S-  \R                  " S3S94   S4\\
\R                  " S5S6S794   S8S4.S9 jjrS: rS; rS< rS=rg)>Serve)   NFr   i,  	localhosti@  warningforce_modelz*Model to preload and use for all requests.)helpcontinuous_batchingzMEnable continuous batching with paged attention. Configure with --cb-* flags.attn_implementationz2Attention implementation (e.g. flash_attention_2).compilez*Enable torch.compile for faster inference.quantizationz.Quantization method: 'bnb-4bit' or 'bnb-8bit'.	reasoningu   Reasoning mode. 'auto' uses the chat template default. Only applies to models that support reasoning via their chat template (e.g. Qwen3, Gemma 4) — for other models this flag has no effect.chat_template_kwargszDefault JSON kwargs forwarded to apply_chat_template (e.g. '{"enable_thinking": true}'); per-request chat_template_kwargs override these.devicez4Device for inference (e.g. 'auto', 'cuda:0', 'cpu').dtypez2Override model dtype. 'auto' derives from weights.trust_remote_codezTrust remote code when loading.model_timeoutzGSeconds before idle model is unloaded. Ignored when force_model is set.cb_block_sizez6KV cache block size in tokens for continuous batching.cb_num_blocksz2Number of KV cache blocks for continuous batching.cb_max_batch_tokensz1Maximum tokens per batch for continuous batching.cb_max_memory_percentz/Max GPU memory fraction for KV cache (0.0-1.0).cb_use_cuda_graphz+Enable CUDA graphs for continuous batching.hostzServer listen address.portzServer listen port.enable_corszEnable permissive CORS.	log_levelz'Logging level (e.g. 'info', 'warning').default_seedzDefault torch seed.non_blockingTz1Run server in a background thread. Used by tests.)hiddenr   returnc           
      \   [        5       (       d  [        S5      eSS KnSSKJn  SSKJn  SSKJn  SSK	J
n  SSKJn  SS	KJn  SS
KJn  Ub  [#        U5        [$        R&                  " S5      nUR)                  [$        R*                  UR-                  5          5        U" UU	U
UUUUS9U l        SSKJn   UUUUUS.R5                  5        V!V"s0 s H  u  n!n"U"c  M  U!U"_M     n#n!n"U#(       a  U " S0 U#D6OS n$U" UUU$S9U l        U(       aB  [8        R:                  " U5      n[=        U[>        5      (       d  [@        RB                  " S5      eO0 nU[D        RF                  :X  a  SUS'   OU[D        RH                  :X  a  SUS'   U" U R.                  U R6                  US9U l%        U" U R.                  U R6                  S9U l&        U" U R.                  U R6                  US9U l'        U" U R.                  U R6                  5      U l(        U" U R.                  U RJ                  U RL                  U RN                  U RP                  U R6                  US9n%URS                  U%UUSS9n&URU                  U&5      U l+        U(       a  U RY                  5         g U RV                  R[                  5         g s  sn"n!f )NzRMissing dependencies for serving. Install with `pip install transformers[serving]`r   r   )ChatCompletionHandler)CompletionHandler)ModelManager)ResponseHandler)build_server)TranscriptionHandler)GenerationStatetransformers)r&   r'   r(   r!   r#   r)   r   )ContinuousBatchingConfig)
block_size
num_blocksmax_batch_tokensmax_memory_percentuse_cuda_graph)r    r"   	cb_configz,--chat-template-kwargs must be a JSON objectTenable_thinkingF)model_managergeneration_stater%   )rH   rI   )completion_handlerresponse_handlertranscription_handlerrI   r1   info)r/   r0   r2   r   ).r   ImportErroruvicornserving.chat_completionr8   serving.completionr9   serving.model_managerr:   serving.responser;   serving.serverr<   serving.transcriptionr=   serving.utilsr>   r   r   
get_loggersetLevel
log_levelslower_model_managerr?   r@   items_generation_statejsonloads
isinstancedicttyperBadParameterr	   r   r   _chat_handler_completion_handler_response_handler_transcription_handlerConfigServerserverstart_serverrun)'selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   rO   r8   r9   r:   r;   r<   r=   r>   transformers_loggerr@   kv	cb_kwargsrF   appconfigs'                                          r   __init__Serve.__init__*   s   D "##rssB9750?2 #<( &00@$$W%7%7	8I%JK*/ 3%'#
 	:
 ,+$7&;"3 eg

1  AqD 	 

 >G,9y9D	!0 3"
  #'::.B#C 2D99(()WXX : $& (((6: !23-+++6; !232--!33!5
 $5--!33$
 
 "1--!33!5"
 ';4;N;NPTPfPf&g##77!33"&"="=!33#
 $TVLnnV,KKOOC

s   
J(J(c                 |   ^  U 4S jn[         R                  " USSS9T l        T R                  R                  5         g )Nc                     > [         R                  " 5       n [         R                  " U 5        U R                  TR                  R                  5       5        g )N)asyncionew_event_loopset_event_looprun_until_completerj   serve)looprm   s    r   _run Serve.start_server.<locals>._run   s:    ))+D""4(##DKK$5$5$78r   zuvicorn-threadF)targetnamedaemon)	threadingThread_threadstart)rm   r~   s   ` r   rk   Serve.start_server   s2    	9
 !''t:JSXYr   c                 8    U R                   R                  5         g)z$Clear all loaded models from memory.N)r[   shutdownrm   s    r   reset_loaded_modelsServe.reset_loaded_models   s    $$&r   c                 "   U R                   R                  5         U R                  R                  5         U R                  (       a  U R                  R	                  5       (       d  g SU R
                  l        U R                  R                  SS9  g )NT   )timeout)r]   r   r[   r   is_aliverj   should_exitjoinr   s    r   kill_serverServe.kill_server   sc    '')$$&||4<<#8#8#:#:"&!$r   )rd   re   r]   r[   rf   r   rg   rj   )r   r   r   r   r	   r   r   strrb   ArgumentboolOptionintfloatrt   rk   r   r   r   r   r   r   r   r   )   s    qu
  di   lrpvch      LWIMUZbkX\ AcsTz5>>?k+llmc 'LLmnp
	c '$J*^__
c 43_!``ac  $J*Z[[
c LL/	
c2 ($JLLm
3cD #u||1ghhiEcF tU\\7k%llmGcH %T5<<=^+_%_`IcJ !#lmm
KcR !$J*bcc
ScX !$J*^__
Yc^ '$J*]^^
_cd  )DL%,,,]^^ 
ecj %4K+XYY
kcr U\\/GHHIsct U\\/DEEFucv tU\\7P%QQRwcx S%,,4]"^^_ycz  d
ELL>S,T TU{c|  %,,d1dee
}cB 
CcJ'%r   r   u  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.
Models will be loaded and unloaded automatically based on usage and a timeout.


Endpoints:
    POST /v1/chat/completions — Chat completions (streaming + non-streaming).
    POST /v1/completions      — Legacy text completions from a prompt.
    GET  /v1/models           — Lists available models.
    GET  /health              — Health check.

Requires FastAPI and Uvicorn: pip install transformers[serving]
)__doc__rx   enumr^   r   typingr   rb   transformers.utilsr   transformers.utils.import_utilsr   rV   r   rW   r   loggerr   Enumr	   r   r   r   r   <module>r      sa          & > ) 
		H	%C y% y%xr   