
    3j"?              
          S SK r S SKrS SKrS SKJr  S SKJrJr  S SKJ	r	  S SK
r
S SKJs  Js  Js  Jr  S SKJrJr  S SKJrJr  S SKJr  S SKJr  S S	KJrJrJr  S S
KJ r J!r!J"r"  S SK#J$r$  S SK%J&r&  S SK'J(r(  S SK)J*r*  S SK+J,r,  / SQr-\*" \.5      r/\ " S S5      5       r0 " S S5      r1S\\2-  S-  S\3\	   S\24S jr4S\&S\5\2S-  \6S-  4   4S jr7S\0S\\2-  S-  S\3\	   S\8\6\	4   4S jr9g)    N)Callable)	dataclassfield)Any)get_default_numa_optionsjustknobs_check)eventsmetrics)
WorkerSpec)create_healthcheck_server)_AliveCallbackProxyLocalElasticAgentTORCHELASTIC_HEALTH_CHECK_PORT)DefaultLogsSpecs	LogsSpecsSignalException)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)NumaOptions)LaunchConfigelastic_launchlaunch_agentc                      \ rS rSr% Sr\\S'   \\S'   \\S'   Sr\S-  \S'   Sr	\
\S	'   S
r\
\S'   Sr\
\S'   Sr\
\S'   \" \S9r\\
\4   \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\
\S'   Sr\
S-  \S'   \" \S9r\\
\
4   \S'   Sr\
S-  \S'   Sr\
\S'   Sr\S-  \S'   Sr\
\S '   Sr\\
   S-  \S!'   Sr \\
   S-  \S"'   S#r!\"\S$'   Sr#\S-  \S%'   S& r$S'r%g)(r   .   a  
Creates a rendezvous config.

Args:
    min_nodes: Minimum amount of nodes that the user function will
                    be launched on. Elastic agent ensures that the user
                    function start only when the min_nodes amount enters
                    the rendezvous.
    max_nodes: Maximum amount of nodes that the user function
                    will be launched on.
    nproc_per_node: On each node the elastic agent will launch
                        this amount of workers that will execute user
                        defined function.
    rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
    rdzv_endpoint: The endpoint of the rdzv sync. storage.
    rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
    rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
        to be removed in future versions, see the note below. The default timeout is 900 seconds.
    run_id: The unique run id of the job (if not passed a unique one will be
            deduced from run environment - flow workflow id in flow - or auto generated).
    role: User defined role of the worker (defaults to "trainer").
    max_restarts: The maximum amount of restarts that elastic agent will conduct
                on workers before failure.
    monitor_interval: The interval in seconds that is used by the elastic_agent
                    as a period of monitoring workers.
    start_method: The method is used by the elastic agent to start the
                workers (spawn, fork, forkserver).
    metrics_cfg: configuration to initialize metrics.
    local_addr: address of the local node if any. If not set, a lookup on the local
            machine's FQDN will be performed.
    local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
    event_log_handler: name of the event logging handler as registered in
      `elastic/events/handlers.py <https://docs.pytorch.org/docs/stable/elastic/events.html>`_.
    duplicate_stdout_filters: If non-empty, duplicates stdout to a file containing only lines
                            that match _any_ of the filter strings.
    duplicate_stderr_filters: If non-empty, duplicates stderr to a file containing only lines
                            that match _any_ of the filter strings.
    virtual_local_rank: Enable virtual local rank mode for workers (defaults to False).
                       When enabled, LOCAL_RANK is set to 0 for all workers and
                       CUDA_VISIBLE_DEVICES is adjusted so each worker accesses its
                       assigned GPU at device index 0.
    shutdown_timeout: Time in seconds to wait for graceful shutdown of workers before
                    sending SIGKILL. Can also be set via TORCH_ELASTIC_SHUTDOWN_TIMEOUT
                    environment variable. Defaults to 30 seconds.


.. note::
    `rdzv_timeout` is a legacy argument that will be removed in future.
    Set the timeout via `rdzv_configs['timeout']`

	min_nodes	max_nodesnproc_per_nodeN
logs_specs run_iddefault_rolerolerdzv_endpointetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restartsg?monitor_intervalspawnstart_methodlog_line_prefix_templatemetrics_cfg
local_addrnullevent_log_handlernuma_optionszSIGTERM,SIGINT,SIGHUP,SIGQUITsignals_to_handleduplicate_stdout_filtersduplicate_stderr_filtersFvirtual_local_rankshutdown_timeoutc                    SnU R                   S:w  a  U R                   U R                  S'   OSU R                  ;  a  XR                  S'   U R                  c  [        5       U l        U R                  c~  [
        R                  R                  5       (       a[  [
        R                  R                  5       U R                  :X  a/  [        5       U l        [        R                  SU R                  5        U R                  c/  [        [        R                   R#                  SS5      5      U l        g U R                  S:  a  [%        SU R                   35      eg )	Ni  r*   timeoutzUsing default numa options = %rTORCH_ELASTIC_SHUTDOWN_TIMEOUT30r   z+shutdown_timeout must be non-negative, got )r+   r)   r    r   r6   torchcudais_availabledevice_countr   r   loggerinfor;   intosenvironget
ValueError)selfdefault_timeouts     X/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/distributed/launcher/api.py__post_init__LaunchConfig.__post_init__|   s   "+/+<+<Di(d///+:i( ??".0DO %

''))

'')T-@-@@ 8 :DKK94;L;LM   ($'

?F%D! ""Q&=d>S>S=TU  '    )r    r6   r;   )&__name__
__module____qualname____firstlineno____doc__rF   __annotations__r    r   r"   strr$   r%   r'   r   dictr)   r   r+   r-   r.   floatr0   r1   r2   r3   r5   r6   r   r7   r8   listr9   r:   boolr;   rN   __static_attributes__ rP   rM   r   r   .   s1   2h NN#'J	D 'FCD#M3L##(#>L$sCx.>L#L#!e!L#+/cDj/"'"=Kc3h=!Jd
!#s#'+L+$+<s<15d3i$.515d3i$.5$$#'cDj'rP   r   c                   <    \ rS rSrSrS\S\\-  S-  4S jrS r	Sr
g)	r      a  
Launches an torchelastic agent on the container that invoked the entrypoint.

    1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
       ``entrypoint`` can be a function or a command.
    2. The return value is a map of each worker's output mapped
       by their respective global rank.

Usage

::

def worker_fn(foo):
    # ...

def main():
    # entrypoint is a function.
    outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
    # return rank 0's output
    return outputs[0]

    # entrypoint is a command and ``script.py`` is the python module.
    outputs = elastic_launch(LaunchConfig, "script.py")(args)
    outputs = elastic_launch(LaunchConfig, "python")("script.py")
config
entrypointNc                     Xl         X l        g N_config_entrypoint)rK   r`   ra   s      rM   __init__elastic_launch.__init__   s    
 %rP   c                 V    [        U R                  U R                  [        U5      5      $ rc   )r   re   rf   rZ   )rK   argss     rM   __call__elastic_launch.__call__   s    DLL$*:*:DJGGrP   rd   )rQ   rR   rS   rT   rU   r   r   rW   rg   rk   r\   r]   rP   rM   r   r      s-    4&& sNT)&HrP   r   ra   rj   returnc                     [        U [        5      (       a  U R                  $ [        U [        5      (       a)  U [        R
                  :X  a  [        S U 5       S5      $ U $ g)a  Retrieve entrypoint name with the rule:
1. If entrypoint is a function, use ``entrypoint.__qualname__``.
2. If entrypoint is a string, check its value:
    2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
        which does not start with hifen letter (for example, "-u" will be skipped).
    2.2 otherwise, use ``entrypoint`` value.
3. Otherwise, return empty string.
c              3   :   #    U  H  oS    S:w  d  M  Uv   M     g7f)r   -Nr]   ).0args     rM   	<genexpr>'_get_entrypoint_name.<locals>.<genexpr>   s     >A#s   	r!   )
isinstancer   rQ   rW   sys
executablenext)ra   rj   s     rM   _get_entrypoint_namery      sR     *h''"""	J	$	$'>>CCrP   rdzv_parametersc                     U R                   S:w  a  gU R                  nUR                  5       nU(       d  [        S5      e[	        USS9u  p#US:X  a  [        SU S35      eX#4$ )Nstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr*   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstriprJ   r   )rz   r   master_addrmaster_ports       rM   _get_addr_and_portr      s~     (*''H~~HY
 	
  9PRSKb+H:5ST
 	
 %%rP   r`   c                 	   U R                   (       dD  [        [        R                  " 5       R                  5      n[
        R                  SU5        X0l         [        X5      n[
        R                  S0 SU_SU R                  _SU R                  _SU R                  _SU R                   _SU R                  _S	U R                  _S
U R                  _SU R                  _SU R                   _SU R"                  R$                  _SU R&                  _SU R(                  _SU R*                  _SU R,                  _SU R.                  _SU R0                  _5        [3        S"U R                  U R                  U R                   U R                  U R                  U R4                  S.U R                  D6n[7        U5      u  pgU R,                  [8        R:                  S'   S nS n	[8        R<                  " [>        5      n
U
bT  [A        SSS9(       aE   [C        5       n	[E        U	[	        U
5      SS9nURG                  5         [
        R                  SU
5        [K        U RL                  U R                  U[O        U5      [P        RR                  " U5      U R                  U R                   UUU R4                  U R(                  U R*                  U R.                  U R0                  U RT                  S9n[W        UU R"                  U RX                  U RZ                  U R\                  US 9nU	b  U	R_                  UR`                  5        Sn [b        Rd                  " [b        Rf                  " U R&                  5      5        URi                  5       n[j        Rl                  " URo                  5       U R(                  5        URq                  5       (       a  [s        UURt                  S!9eURv                  U(       a  URx                  R{                  5         $ $ ! [H         a    [
        R                  SSS9  S nS n	 GNf = f! [r         a    e [|         a3    Sn[j        Rl                  " UR                  5       U R(                  5        e [H         a1    [j        Rl                  " UR                  5       U R(                  5        e f = f! U(       a  URx                  R{                  5         f f = f)#Nz3config has no run_id, generated a random run_id: %saR  Starting elastic_operator with launch configs:
  entrypoint               : %(entrypoint)s
  min_nodes                : %(min_nodes)s
  max_nodes                : %(max_nodes)s
  nproc_per_node           : %(nproc_per_node)s
  run_id                   : %(run_id)s
  rdzv_backend             : %(rdzv_backend)s
  rdzv_endpoint            : %(rdzv_endpoint)s
  rdzv_configs             : %(rdzv_configs)s
  max_restarts             : %(max_restarts)s
  monitor_interval         : %(monitor_interval)s
  log_dir                  : %(log_dir)s
  metrics_cfg              : %(metrics_cfg)s
  event_log_handler        : %(event_log_handler)s
  numa_options             : %(numa_options)s
  signals_to_handle        : %(signals_to_handle)s
  duplicate_stdout_filters : %(duplicate_stdout_filters)s
  duplicate_stderr_filters : %(duplicate_stderr_filters)s
ra   r   r   r   r"   r'   r%   r)   r-   r.   log_dirr2   r5   r6   r7   r8   r9   )r~   r   r"   r   r   r3   TORCHELASTIC_SIGNALS_TO_HANDLEzNai_infra/pytorch_distributed:torchelastic_enable_healthcheck_before_rendezvousF)default<   )alive_callbackportr=   z>Started early health check server on port %s before rendezvousz)Failed to start early health check serverT)exc_info)r$   local_world_sizera   rj   rdzv_handlerr-   r.   r   r   r3   r5   r6   r8   r9   r:   )specr    r0   r1   r;   health_check_server)namefailuresr]   )@r"   rW   uuiduuid4rF   rD   warningry   rE   r   r   r   r'   r%   r)   r-   r.   r    root_log_dirr2   r5   r6   r7   r8   r9   r   r3   r   rG   rH   getenvr   r   r   r   start	Exceptionr   r$   tuplerdzv_registryget_rendezvous_handlerr:   r   r0   r1   r;   set_delegate_get_alive_timer
   initialize_metricsMetricsConfigrunr	   recordget_event_succeeded	is_failedr   r   return_valuesr   shutdownr   get_event_failed)r`   ra   rj   r"   entrypoint_namerz   r   r   r   alive_callback_proxyhealthcheck_portr   agentshutdown_rdzvresults                  rM   r   r      s   
 ==TZZ\%%&LfU*:<O
KK	F$	
/	
))	
 ))	
 f33		

 fmm	
 F//	
 V11	
 F//	
 F//	
  7 7	
 v((55	
 6--	
  !9!9	
 F//	
  !9!9	
  '(G(G!	
" '(G(G#	
'&P + ##%%}}""""$$ 

O  2/BK 4:3K3KBJJ/0 yy!?@#X)	(#6#8 ";3)*#
  %%'KKP  [[..4["99/J((00$$ 22((!'!@!@!'!@!@!44D$ $$((!'!@!@00/E '))%*?*?@M )""7#8#89K9K#LMe//163K3KL
 #$ 
 ## &&( G  	(NNFQUNV"&#' 	(l    e,,.0H0HI e,,.0H0HI &&( s-   AP :B'P- "P*)P*-B R--R0 0$S):rG   rv   r   collections.abcr   dataclassesr   r   typingr   r@   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistryr   torch._utils_internalr   r   torch.distributed.elasticr	   r
   *torch.distributed.elastic.agent.server.apir   :torch.distributed.elastic.agent.server.health_check_serverr   :torch.distributed.elastic.agent.server.local_elastic_agentr   r   r   )torch.distributed.elastic.multiprocessingr   r   r   0torch.distributed.elastic.multiprocessing.errorsr   $torch.distributed.elastic.rendezvousr   *torch.distributed.elastic.rendezvous.utilsr   'torch.distributed.elastic.utils.loggingr   torch.numa.bindingr   __all__rQ   rD   r   r   rW   rZ   ry   r   rF   r   rX   r   r]   rP   rM   <module>r      s#   
 
  $ (   E E K 5 A 
 
 N E P > * =	H	 i i iX$H $HNX^d%: $s) PS (&)&
3:sTz!"&&^)^)3%^) s)^) 
#s(^	^)rP   