
    +jC                       d dl mZ d dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZ erd dlmZ d dlmZmZ d dlZd dlZd dlZ G d	 d
ej                  Zd"dZd#dZd Zd#dZd$d#dZd%dZ G d dej                  Z G d dej                  Z G d dej                  Z  G d dej                  Z! G d de          Z"d&d!Z#dS )'    )annotationsN)cleandoc)TYPE_CHECKING)override)ComfyExtensionio)ModelPatcher)CLIPVAEc                  @    e Zd ZdZed             Zedd	            Zd
S )MultiGPUCFGSplitNodea	  
    Prepares model to have sampling accelerated via splitting work units.

    Should be placed after nodes that modify the model object itself, such as compile or attention-switch nodes.

    Other than those exceptions, this node can be placed in any order.
    c                   t          j        dddt          | j                  t           j                            d          t           j                            dddd          gt           j                                        g	          S )
NMultiGPU_WorkUnitszMultiGPU CFG Splitadvanced/multigpumodelmax_gpus      defaultminstepnode_iddisplay_namecategorydescriptioninputsoutputs)r   Schemar   __doc__ModelInputIntOutputclss    ;/home/wildlama/comfy/ComfyUI/comfy_extras/nodes_multigpu.pydefine_schemaz"MultiGPUCFGSplitNode.define_schema   sv    y(-( --w''ZBB
 !!
 
 
 	
    r   r	   r   intreturnio.NodeOutputc                n    t           j                            ||d          }t          j        |          S )NT)reuse_loaded)comfymultigpucreate_multigpu_deepclonesr   
NodeOutput)r'   r   r   s      r(   executezMultiGPUCFGSplitNode.execute-   s/    99%X\9]]}U###r*   N)r   r	   r   r+   r,   r-   __name__
__module____qualname__r!   classmethodr)   r4    r*   r(   r   r      sY          
 
 [
 $ $ $ [$ $ $r*   r   patcherr	   devicetorch.devicec           	         |                                  }t          j                            ||          }|dS t	          j        d| d| d| d           |                     |           dS )zJCast compute dtype to one the device supports; no-op if already supported.NzSelect Model Device: using z compute dtype on z (model weight dtype was z).)model_dtyper0   model_managementunet_manual_castlogginginfoset_model_compute_dtype)r;   r<   weight_dtype
cast_dtypes       r(   _force_supported_compute_dtyperG   3   s}    &&((L'88vNNJL~z~~V~~nz~~~##J/////r*   c                x    t          | j        d          s$| j        | j        _        | j        | j        _        dS dS )ae  Stash the original load/offload device on the underlying model.

    Stored on patcher.model (which is shared with the input patcher), so
    later "default" selections can recover the loader's original routing.
    Only the first Select on a given chain writes these attrs; subsequent
    deepclones inherit them onto their freshly-loaded model below.
    _select_base_load_deviceN)hasattrr   load_devicerI   offload_device_select_base_offload_device)r;   s    r(   _remember_base_devicesrN   =   sF     7="<== K181D.4;4J111K Kr*   c                ~    t          | d          r*t          |d          s| j        |_        | j        |_        dS dS dS )zICarry the loader-original device attrs onto the freshly-deepcloned model.rI   N)rJ   rI   rM   )	src_model	dst_models     r(   _propagate_base_devicesrR   J   s]    y455 VgiQk>l>l V-6-O	*090U	---V V V Vr*   c                    | j         |k    r	|| _        | S | j        }|                     |          } || _        t	          || j                   t          | d          r|                     | j                    | S )aq  Return a patcher whose actual model weights live on *target_load_device*.

    If *patcher* is already on *target_load_device* we just retarget the
    (already-cloned) patcher's metadata in place. Otherwise we call
    :meth:`ModelPatcher.deepclone_multigpu` to spawn a fresh model from
    the loader's ``cached_patcher_init`` factory -- the only safe way to
    move weights that may already be partially loaded onto another device.

    NOTE: reusing the input patcher's model when the requested device
    matches its current load_device is a deliberate fast path. Anything
    that has already mutated the original model (e.g. a prior KSampler
    invocation on the same model) will be observed here. This is by
    design and documented on the SelectXDeviceNode docstrings -- placing
    Select X Device after a node that consumes the same model is not
    recommended.
    )new_load_deviceregister_load_device)rK   rL   r   deepclone_multigpurR   rJ   rU   )r;   target_load_devicetarget_offload_devicerP   s       r(   _retarget_patcherrY   Q   s    " 000!6I((9K(LLG2GIw}555w.// :$$W%8999Nr*   c                *   t          |            | j        j        }||n| j        j        }|t	          | ||          S |j        dk    r:|                                 r|                     d          } || _        || _	        | S t	          | ||          S )a  Resolve the requested device and produce a patcher routed there.

    For "default" we restore the loader's original load/offload pair.
    For CPU we pin both load and offload to CPU (and, on a dynamic
    patcher, downgrade to a plain ModelPatcher so the dynamic-only
    code paths are bypassed).
    For an explicit GPU we keep the loader's original offload but
    target the requested load device; if that differs from the current
    load device the patcher is deepcloned onto the new device.
    NcpuT)disable_dynamic)
rN   r   rI   rM   rY   type
is_dynamicclonerK   rL   )r;   resolvedbase_offload_override	base_loadbase_offloads        r(   _apply_patcher_devicerd   o   s     7###6I,A,M((SZS`S|L )\BBB} 	:
 mmDm99G&!)Wh===r*   r   c                B   |                      d          }|sdS fd|D             }t          |          t          |          k    rTt          j        d d           |                     d|           t          | d          r|                                  dS dS dS )aM  Drop any multigpu clone whose load_device matches *primary_device*.

    Without pruning, MultiGPU CFG Split would have stacked a clone on
    the same device the primary now occupies (i.e. the workflow places
    MultiGPU CFG Split before Select Model Device). Keeps the clone set
    consistent with the new primary placement.
    r1   Nc                *    g | ]}|j         k    |S r:   )rK   ).0mprimary_devices     r(   
<listcomp>z-_prune_multigpu_collision.<locals>.<listcomp>   s%    NNNaam~.M.M.M.M.Mr*   z/Select Model Device: pruning MultiGPU clone on z* that now collides with the primary model.match_multigpu_clones)get_additional_models_with_keylenrB   rC   set_additional_modelsrJ   rk   )r   ri   multigpu_modelsfiltereds    `  r(   _prune_multigpu_collisionrq      s     :::FFO NNNN?NNNH
8}}O,,,,  B~  B  B  B  	C  	C  	C##J9995122 	*'')))))	 -,	* 	*r*   c                  Z    e Zd ZdZed             Zedd            Zeddd            ZdS )SelectModelDeviceNodea&  
    Place the diffusion model on a specific device (default / cpu / gpu:N).

    - "default" restores the device assigned by the loader (even after a
      prior Select Model Device call).
    - "cpu" pins both the load and offload device to CPU.
    - "gpu:N" pins the load device to the Nth available GPU; the offload
      device is restored to the loader's original choice.

    When the requested device differs from the device the input model is
    already on, a fresh model is spawned via the loader's reload factory
    (cached_patcher_init) so the new patcher owns independent weights on
    the new device. Loaders that don't support multigpu (no factory) will
    cause the node to pass through unchanged with a warning.

    If the workflow already has MultiGPU CFG Split applied and the chosen
    GPU collides with one of the existing multigpu clones, that clone is
    dropped so two patchers don't end up bound to the same device.

    When the selected device does not exist on the current machine
    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
    the node passes the model through unchanged and logs a message
    instead of failing.

    NOTE: Placing Select Model Device *after* a node that has already
    consumed the same model (e.g. a KSampler that ran on this model on
    the original device) is not recommended -- any state the prior
    consumer mutated on the original model will be observed when the
    selected device matches the original (fast path). Place Select Model
    Device before any consumer of the model.
    c                H   t          j        dddt          | j                  t           j                            d          t           j                            dt          j        	                                          gt           j        
                                g          S )NSelectModelDevicezSelect Model Devicer   r   r<   optionsr   )r   r    r   r!   r"   r#   Combor0   r@   get_gpu_device_optionsr%   r&   s    r(   r)   z#SelectModelDeviceNode.define_schema   s    y'.( --w''x1G1^1^1`1`aa
 !!
 
 
 	
r*   r   c                    dS NTr:   r'   r<   s     r(   validate_inputsz%SelectModelDeviceNode.validate_inputs   s	     tr*   r   r	   r<   strr,   r-   c                   |                                 }t          j                            |          }|0|dvr,t	          j        d| d           t          j        |          S 	 t          ||          }nC# t          $ r6}t	          j
        d| d           t          j        |          cY d }~S d }~ww xY w|%t          ||           t          ||j                   t          j        |          S )NNr   z'Select Model Device: requested device '+' not available, passing through unchanged.zHSelect Model Device: cannot retarget model, passing through unchanged. ())r_   r0   r@   resolve_gpu_device_optionrB   rC   r   r3   rd   RuntimeErrorwarningrG   rq   rK   )r'   r   r<   r`   es        r(   r4   zSelectModelDeviceNode.execute   s   )CCFKK.? ? ?Lv6vvvwww='''	()%::EE 	( 	( 	(Okghkkklll=''''''''	( *5(;;;%eU->???}U###s   'A8 8
B8+B3-B83B8Nr   )r   r	   r<   r~   r,   r-   r6   r7   r8   r!   r9   r)   r}   r4   r:   r*   r(   rs   rs      s         @ 
 
 [
    [
 $ $ $ $ [$ $ $r*   rs   c                  Z    e Zd ZdZed             Zedd            Zeddd            ZdS )SelectCLIPDeviceNodea  
    Place the CLIP text encoder on a specific device (default / cpu / gpu:N).

    - "default" restores the device assigned by the loader.
    - "cpu" pins both the load and offload device to CPU.
    - "gpu:N" pins the load device to the Nth available GPU.

    When the selected device does not exist on the current machine
    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
    the node passes the CLIP through unchanged and logs a message
    instead of failing.
    c                H   t          j        dddt          | j                  t           j                            d          t           j                            dt          j        	                                          gt           j        
                                g          S )NSelectCLIPDevicezSelect CLIP Devicer   clipr<   rv   r   )r   r    r   r!   Clipr#   rx   r0   r@   ry   r%   r&   s    r(   r)   z"SelectCLIPDeviceNode.define_schema   s    y&-( --f%%x1G1^1^1`1`aa
   
 
 
 	
r*   r   c                    dS r{   r:   r|   s     r(   r}   z$SelectCLIPDeviceNode.validate_inputs      tr*   r   r
   r<   r~   r,   r-   c                   |                                 }t          j                            |          }|0|dvr,t	          j        d| d           t          j        |          S 	 t          |j	        |          |_	        n/# t          $ r"}t	          j        d| d           Y d }~nd }~ww xY wt          j        |          S )Nr   z&Select CLIP Device: requested device 'r   zFSelect CLIP Device: cannot retarget CLIP, passing through unchanged. (r   )r_   r0   r@   r   rB   rC   r   r3   rd   r;   r   r   )r'   r   r<   r`   r   s        r(   r4   zSelectCLIPDeviceNode.execute  s    zz||)CCFKK.? ? ?Lu&uuuvvv=&&&	k0xHHDLL 	k 	k 	kOiefiiijjjjjjjj	k}T"""s   'B 
B.B))B.Nr   )r   r
   r<   r~   r,   r-   r   r:   r*   r(   r   r      s~          
 
 [
    [ 
# 
# 
# 
# [
# 
# 
#r*   r   c                  Z    e Zd ZdZed             Zedd            Zeddd            ZdS )SelectVAEDeviceNodea  
    Place the VAE on a specific device (default / gpu:N).

    - "default" restores the device assigned by the loader.
    - "gpu:N" pins the load device to the Nth available GPU; the offload
      device is set to the standard VAE offload device.

    CPU is intentionally not exposed in the UI for the VAE; if a workflow
    supplies "cpu" anyway (e.g. opened from another machine), the request
    is dropped with a log message and the VAE is passed through unchanged.

    When the selected device does not exist on the current machine
    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
    the node passes the VAE through unchanged and logs a message
    instead of failing.
    c                H   t          j        dddt          | j                  t           j                            d          t           j                            dt          j        	                                          gt           j        
                                g          S )NSelectVAEDevicezSelect VAE Devicer   vaer<   rv   r   )r   r    r   r!   Vaer#   rx   r0   r@   get_gpu_device_options_no_cpur%   r&   s    r(   r)   z!SelectVAEDeviceNode.define_schema,  s    y%,( --U##x1G1e1e1g1ghh
 
 
 
 	
r*   r   c                    dS r{   r:   r|   s     r(   r}   z#SelectVAEDeviceNode.validate_inputs<  r   r*   r   r   r<   r~   r,   r-   c                   t          j         |          }|j                                        |_        t          j                            |          }|0|dvr,t          j        d| d           t          j	        |          S |3|j
        dk    r(t          j        d           t          j	        |          S t          |d          s|j        |_        	 t          |j        |t          j                                                  |_        nC# t           $ r6}t          j        d| d	           t          j	        |          cY d }~S d }~ww xY w|j        j        |_        ||j        n||_        t          j	        |          S )
Nr   z%Select VAE Device: requested device 'r   r[   zLSelect VAE Device: CPU is not a supported choice, passing through unchanged._select_base_device)ra   zDSelect VAE Device: cannot retarget VAE, passing through unchanged. (r   )copyr;   r_   r0   r@   r   rB   rC   r   r3   r]   rJ   r<   r   rd   vae_offload_devicer   r   r   first_stage_model)r'   r   r<   r`   r   s        r(   r4   zSelectVAEDeviceNode.execute@  s    innk'')))CCFKK.? ? ?Lttttuuu=%%%HMU$:$:Lghhh=%%%s122 	1&)jC#	&/X&+&<&O&O&Q&Q  CKK  	& 	& 	&Ogcdggghhh=%%%%%%%%	&
 !$ 1080@S,,h
}S!!!s   8D 
E+E
E
ENr   )r   r   r<   r~   r,   r-   r   r:   r*   r(   r   r     s~         " 
 
 [
    [ " " " " [" " "r*   r   c                  B    e Zd ZdZed             Zeddd            ZdS )MultiGPUOptionsNodeaL  
    Select the relative speed of GPUs in the special case they have significantly different performance from one another.

    NOTE (not registered yet, see MultiGPUExtension.get_node_list below):
    The output GPUOptionsGroup is plumbed through create_multigpu_deepclones() and stored on
    model.model_options['multigpu_options'] via GPUOptionsGroup.register(), but the cond
    scheduler in comfy/samplers.py (calc_cond_batch_outer_multigpu) does NOT yet consult
    relative_speed when distributing conds across devices; it uses a uniform conds_per_device
    round-robin via next_available_device(). Before re-enabling this node, wire its
    relative_speed into the scheduler (e.g. via comfy.multigpu.load_balance_devices(),
    which already implements the proportional split) so the input actually affects work
    distribution.
    c                |   t          j        dddt          | j                  t           j                            dddd          t           j                            dd	d
d          t          j        d                              dd          gt          j        d                                          g          S )NMultiGPU_OptionszMultiGPU Optionsr   device_indexr   @   )r   r   maxrelative_speedg      ?g        g{Gz?r   GPU_OPTIONSgpu_optionsT)optionalr   )	r   r    r   r!   r$   r#   FloatCustomr%   r&   s    r(   r)   z!MultiGPUOptionsNode.define_schemam  s    y&+( --^QA2FF/#DQQ	-((..}t.LL 	-((//11
 
 
 	
r*   Nr   r+   r   floatr   comfy.multigpu.GPUOptionsGroupr,   r-   c                    |st           j                                        }n|                                }t           j                            ||          }|                    |           t          j        |          S )N)r   r   )r0   r1   GPUOptionsGroupr_   
GPUOptionsaddr   r3   )r'   r   r   r   opts        r(   r4   zMultiGPUOptionsNode.execute~  sm     	..88::KK%++--Kn''\R`'aa}[)))r*   N)r   r+   r   r   r   r   r,   r-   r5   r:   r*   r(   r   r   ^  s^          
 
 [
  	* 	* 	* 	* [	* 	* 	*r*   r   c                  &    e Zd Zedd            ZdS )MultiGPUExtensionr,   list[type[io.ComfyNode]]c                :   K   t           t          t          t          gS r   )r   rs   r   r   )selfs    r(   get_node_listzMultiGPUExtension.get_node_list  s       !! 	
 	
r*   N)r,   r   )r6   r7   r8   r   r   r:   r*   r(   r   r     s2        
 
 
 X
 
 
r*   r   r,   c                 "   K   t                      S r   )r   r:   r*   r(   comfy_entrypointr     s      r*   )r;   r	   r<   r=   )r;   r	   r   )r   r	   )r,   r   )$
__future__r   r   rB   inspectr   typingr   typing_extensionsr   comfy_api.latestr   r   comfy.model_patcherr	   comfy.sdr
   r   torchcomfy.model_managementr0   comfy.multigpu	ComfyNoder   rG   rN   rR   rY   rd   rq   rs   r   r   r   r   r   r:   r*   r(   <module>r      s   " " " " " "                     & & & & & & / / / / / / / / #000000""""""""         $ $ $ $ $2< $ $ $>0 0 0 0
K 
K 
K 
KV V V   <> > > > >>* * * *&F$ F$ F$ F$ F$BL F$ F$ F$R-# -# -# -# -#2< -# -# -#`A" A" A" A" A"", A" A" A"H** ** ** ** **", ** ** **Z	
 	
 	
 	
 	
 	
 	
 	
     r*   