
    
9jB                         S SK r S SKrS SKJr  S SKJrJrJrJr  S SK	J
r
JrJrJr  S SKJr  \(       a  S SKJr  \ " S S5      5       rg)	    N)	dataclass)TYPE_CHECKINGLiteralOptionalUnion)DeepSpeedSequenceParallelConfigDistributedTypeTorchContextParallelConfigTorchTensorParallelConfig)is_torch_version)Acceleratorc                   R   \ rS rSr% SrSr\\   \S'   Sr	\\   \S'   Sr
\\   \S'   Sr\\   \S'   Sr\S   \S	'   Sr\\   \S
'   Sr\S   \S'   Sr\S\4   \S'   Sr\S\4   \S'   Sr\S\4   \S'   SrS rS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!\S 5       r"\S 5       r#\S 5       r$\S 5       r%\S 5       r&\S 5       r'\S 5       r(S \)4S! jr*S,S \\)   4S" jjr+S#\,\,\S$4   \,\)S$4   4   4S% jr-S& r.S'\)S(\4S) jr/S-S* jr0S+r1g).ParallelismConfig!   aM  
A dataclass to configure parallelisms applied to the model. Inspired by torchtitan's `ParallelDims`
https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py

Args:
    dp_replicate_size (`int`, defaults to `1`):
        The size of the data parallel group. If `dp_replicate_size` is set to 1, the data parallel replication
        group will not be used.
    dp_shard_size (`int`, defaults to `1`):
        The size of the model shard group. If `dp_replicate_size > 1` and `tp_size > 1`, `dp_shard_size` must also
        be greater than 1, as composing DDP + TP is currently not supported.
    tp_size (`int`, defaults to `1`):
        The size of the tensor parallel group. If `tp_size` is set to `1`, the tensor parallel group will not be
        used.
    tp_handler (`~utils.TorchTensorParallelConfig`, defaults to `None`):
        The handler for the tensor parallel group.
    cp_size (`int`, defaults to `1`):
        The size of the context parallel group. Currently not supported, but reserved for future use and enabled
        for downstream libraries.
    cp_backend (`str`, defaults to `torch`):
        Which CP backend to use: `torch` (FSDP2)
    sp_size (`int`, defaults to `1`):
        The size of the sequence parallel group.
    sp_backend (`str`, defaults to `deepspeed`):
        Which SP backend to use:`deepspeed` (ALST/Ulysses)

You may obtain different distributed data parallel paradigms by configuring `dp_replicate_size` and `dp_shard_size`
together:
    - `dp_replicate_size == 1` and `dp_shard_size > 1`, we obtain Fully Sharded Data Parallel (FSDP).
    - `dp_replicate_size > 1` and `dp_shard_size > 1`, we obtain Hybrid Sharded Data Parallel (HSDP).
    - `dp_replicate_size > 1` and `dp_shard_size == 1` is an invalid configuration, to use pure DP, use
      `DistributedDataParallelKwargs` instead.

Ndp_replicate_sizedp_shard_sizetp_sizecp_sizetorch
cp_backendsp_size	deepspeed
sp_backend
tp_handler
cp_handler
sp_handlerc                    SU R                    SU R                   SU R                   SU R                   SU R                   SU R
                   SU R                   SU R                   S	U R                   S
U R                   S3$ )Nz'ParallelismConfig(
 	dp_replicate_size=z,
	dp_shard_size=z,
	tp_size=z,
	cp_size=z,
	cp_backend=z,
	sp_size=z,
	sp_backend=z,
	total_size=z
	tp_handler=z,
	cp_handler=z)
)
r   r   r   r   r   r   r   
total_sizer   r   selfs    W/home/wildlama/miniconda3/lib/python3.13/site-packages/accelerate/parallelism_config.py__repr__ParallelismConfig.__repr__U   s    ##'#9#9": ;#112 3 ' ' OO, - ' OO, - OO, - OO, - OO,C
1	
    c                    SS K nS/nUR                  U R                  R                  5        VVs0 s H;  u  p4X2;  d  M  U[	        US5      (       a  UR                  UR                  5      OU_M=     snn5        g s  snnf )Nr   device_mesh__dict__)copydeepcopyr'   itemshasattr)r    r(   _non_serializable_fieldskvs        r!   to_jsonParallelismConfig.to_jsond   ss    $1?  !MM//11DA4 N:0F0F4==,AM1	
s
   A;
3A;
c                 f    / nU R                   (       a  US/-  nU R                  (       a  US/-  nU$ )zENames of enabled dimensions across which data parallelism is applied.dp_replicatedp_shard)dp_replicate_enableddp_shard_enabledr    dimss     r!   dp_dim_namesParallelismConfig.dp_dim_namesq   s9     $$^$$D  ZL Dr$   c                     / nU R                   (       a  US/-  nU R                  (       a  US/-  nU R                  (       a  US/-  nU$ )z]Names of enabled dimensions which will receive the same batch (non-data parallel dimensions).tpcpsp)
tp_enabled
cp_enabled
sp_enabledr6   s     r!   non_dp_dim_names"ParallelismConfig.non_dp_dim_names{   sD     ??TFND??TFND??TFNDr$   c                 f    / nU R                   (       a  US/-  nU R                  (       a  US/-  nU$ )zlNames of enabled dimensions which will be flattened into a joint mesh across which is model sharded in FSDP.r3   r<   )r5   r?   r6   s     r!   dp_shard_cp_dim_names'ParallelismConfig.dp_shard_cp_dim_names   s5       ZL D??TFNDr$   c                     / nU R                   (       a  US/-  nU R                  (       a  US/-  nU R                  (       a  US/-  nU$ )z@Names of enabled dimensions across which loss should be averagedr2   r3   r<   )r4   r5   r?   r6   s     r!   dp_cp_dim_names!ParallelismConfig.dp_cp_dim_names   sK     $$^$$D  ZL D??TFNDr$   c                 D    / nU R                   (       a  US/-  nUS/-  nU$ )z^Names of enabled dimensions across which FSDP is applied, including data parallel replication.r2   dp_shard_cp)r4   r6   s     r!   fsdp_dim_names ParallelismConfig.fsdp_dim_names   s0     $$^$$Dr$   c                     U R                   U R                  -  U R                  -  U R                  -  U R                  -  $ )zSThe total size of the parallelism configuration, which is the product of all sizes.)r   r   r   r   r   r   s    r!   r   ParallelismConfig.total_size   s9     %%(:(::T\\IDLLX[_[g[gggr$   c                 N    U R                   U R                  -  U R                  -  $ )zhThe size of the non-data parallel dimensions, which is the product of tensor and context parallel sizes.)r   r   r   r   s    r!   non_data_parallel_size(ParallelismConfig.non_data_parallel_size   s      ||dll*T\\99r$   c                 4    U R                   U R                  -  $ )z_The size of the data parallel dimensions, which is the product of data parallel replication and)r   r   r   s    r!   data_parallel_size$ParallelismConfig.data_parallel_size   s     %%(:(:::r$   c                      U R                   S:  $ )zKTrue if data parallel replication is enabled, i.e. `dp_replicate_size > 1`.   )r   r   s    r!   r4   &ParallelismConfig.dp_replicate_enabled   s     %%))r$   c                      U R                   S:  $ )zDTrue if data parallel sharding is enabled, i.e. `dp_shard_size > 1`.rV   )r   r   s    r!   r5   "ParallelismConfig.dp_shard_enabled   s     !!A%%r$   c                      U R                   S:  $ )z:True if tensor parallelism is enabled, i.e. `tp_size > 1`.rV   )r   r   s    r!   r>   ParallelismConfig.tp_enabled        ||ar$   c                      U R                   S:  $ )z;True if context parallelism is enabled, i.e. `cp_size > 1`.rV   )r   r   s    r!   r?   ParallelismConfig.cp_enabled   r\   r$   c                      U R                   S:  $ )z;True if context parallelism is enabled, i.e. `sp_size > 1`.rV   )r   r   s    r!   r@   ParallelismConfig.sp_enabled   r\   r$   c                 4    U R                   U R                  -   $ )z$Names of all active mesh dimensions.)r8   rA   r   s    r!   active_mesh_dims"ParallelismConfig.active_mesh_dims   s       4#8#888r$   device_typec                    U R                   S:X  a  U R                  S:  a  g[        SS5      (       a  SSKJn  O[        S5      eU R                  5       n[        U5      S:X  a  gUu  pEU" UUUS	9nU R                  (       a  X`R                     R                  S
5        U R                  (       a  X`R                     R                  S5        U R                  (       a  X`R                     R                  S5        U$ )a  Builds a device mesh for the given device type based on the parallelism configuration.
This method will also create required joint meshes (e.g. `dp_shard_cp`, `dp_cp`, `dp`).

Args:
    device_type (`str`): The type of device for which to build the mesh, e
r   rV   Nz>=z2.2.0r   )init_device_meshz4Building a device_mesh requires to have torch>=2.2.0)mesh_dim_namesdprJ   dp_cp)r   r   r   torch.distributed.device_meshrf   RuntimeError	_get_meshlenr8   _flattenrD   rG   )r    rd   rf   meshrg   
mesh_shaper&   s          r!   build_device_mesh#ParallelismConfig.build_device_mesh   s     ??k)dllQ.>D'**FUVV~~t9>%)"&)

 ))*33D9%%223<<]K,,-66w?r$   c                    U R                   c0  Ub"  U R                  U5      U l         U R                   $ [        S5      eUb@  U R                   R                  U:w  a&  [        SU R                   R                   SU S35      eU R                   $ )Nz@You need to pass a device_type e.g cuda to build the device meshz4The device_mesh is already created with device type z@. However, you are trying to get a device mesh with device_type z<. Please check if you correctly initialized your device_mesh)r&   rq   
ValueErrorrd   )r    rd   s     r!   get_device_mesh!ParallelismConfig.get_device_mesh   s    #&#'#9#9+#F   !!cdd&##//;>$NtO_O_OkOkNl  mm  ny  mz  zv  w  r$   return.c                    ^ U R                    Vs0 s H  oU R                  U   _M     nn/ SQm[        UR                  5       U4S jS9n[	        [        U6 5      $ s  snf )zQGenerate mesh shape and dimension names for torch.distributed.init_device_mesh().)r2   r3   r<   r=   r;   c                 ,   > TR                  U S   5      $ )Nr   )index)x
mesh_orders    r!   <lambda>-ParallelismConfig._get_mesh.<locals>.<lambda>  s    :++AaD1r$   )key)rb   _sizessortedr*   tuplezip)r    parallelism	mesh_dimssorted_itemsr|   s       @r!   rl   ParallelismConfig._get_mesh  sf     OSNcNcdNc{$++k"::Nc	d D
OO2
 S,'(( es   Ac           
         U R                   c.  [        [        R                  R	                  SS5      5      U l         U R
                  c.  [        [        R                  R	                  SS5      5      U l        U R                  c.  [        [        R                  R	                  SS5      5      U l        U R                  c.  [        [        R                  R	                  SS5      5      U l        U R                  c%  [        R                  R	                  SS5      U l        U R                  c.  [        [        R                  R	                  SS5      5      U l	        U R                  c%  [        R                  R	                  S	S
5      U l
        U R                  S:  a  U R                  c  [        5       U l        U R                  S:  a  U R                  c  [        5       U l        Oq[        [        S9n[!        U R                  XR                     5      (       d=  [#        SU R                   SXR                      S[%        U R                  5       35      eU R                  S:  a  U R&                  c  [)        5       U l        U R                   S:  a  [#        SU R                    35      eU R
                  S:  a  [#        SU R
                   35      eU R                  S:  a  [#        SU R                   35      eU R                  S:  a  [#        SU R                   35      eS/nU R                  U;  a  [#        SU SU R                   35      eU R                  S:  a  [#        SU R                   35      eS
/nU R                  U;  a  [#        SU SU R                   35      eU R                  S:  a6  U R                  S:  a&  [#        SU R                   SU R                   S35      eU R                  S:  d  U R                  S:  a+  U R                   S:  a  U R
                  S:X  a  [#        S5      eU R                   U R
                  U R                  U R                  U R                  S.U l        g )N$PARALLELISM_CONFIG_DP_REPLICATE_SIZE1 PARALLELISM_CONFIG_DP_SHARD_SIZEPARALLELISM_CONFIG_TP_SIZEPARALLELISM_CONFIG_CP_SIZEPARALLELISM_CONFIG_CP_BACKENDr   PARALLELISM_CONFIG_SP_SIZEPARALLELISM_CONFIG_SP_BACKENDr   rV   )r   zParallelismConfig's cp_backend=z
 requires z, but cp_handler was set to z.dp_replicate_size must be at least 1, but got z*dp_shard_size must be at least 1, but got z$tp_size must be at least 1, but got z$cp_size must be at least 1, but got zcp_backend must be one of z
, but got z$sp_size must be at least 1, but got zsp_backend must be one of z[Context Parallelism (CP) and Sequence Parallelism (SP) are mutually exclusive. Got cp_size=z and sp_size=z+. Please set either cp_size=1 or sp_size=1.aC  Tensor/Context parallelism (tp/cp_size > 1) cannot be used with pure data parallelism (dp_replicate_size > 1 and dp_shard_size == 1). Please set dp_shard_size > 1 and dp_replicate_size == 1 to compose FSDP + TP/CP for 2D parallel, or set dp_replicate_size == 1 and dp_shard_size > 1 to compose HSDP + TP/CP for 3D parallel.)r2   r3   r;   r<   r=   )r   intosenvirongetr   r   r   r   r   r   r   r   r   r
   dict
isinstancert   typer   r   r   )r    cp_backends_config_mapvalid_cp_backendsvalid_sp_backendss       r!   __post_init__ParallelismConfig.__post_init__  s   !!)%(8^`c)d%eD"%!$RZZ^^4VX[%\!]D<<rzz~~.JCPQDL<<rzz~~.JCPQDL??" jjnn-LgVDO<<rzz~~.JCPQDL??" jjnn-LkZDO<<!&";"=<<!&"<">)-4*& "$//3I//3Z[[$9$//9J*Ukl{l{U|T}  ~Z  [_  `d  `o  `o  [p  Zq  r  <<!&"A"C!!A%MdNdNdMefgg!I$J\J\I]^__<<!CDLL>RSS<<!CDLL>RSS$I??"339:K9LJW[WfWfVghii<<!CDLL>RSS(M??"339:K9LJW[WfWfVghii <<!q 0#||nM$,, H<<  LL1q 0d6L6Lq6PUYUgUgklUlo  !22**,,,,,,
r$   r   sizec                     XR                   R                  5       ;   d"   SU R                   R                  5        35       eX R                   U'   [        X S3U5        g )NzParallelism must be one of _size)r   keyssetattr)r    r   r   s      r!   	_set_sizeParallelismConfig._set_size^  sV    kk..00d4OPTP[P[P`P`PbOc2dd0#'K U+T2r$   c                    [        5       nUR                  (       d  U R                  S:X  a  g U R                  S:X  a  U R                  SUR                  5        U R
                  S:X  a  U R                  S:  a  O@U R                  UR                  :w  a&  [        SU R                   SUR                   S35      eU R                  S:  aY  UR                  (       dH  UR                  (       d7  UR                  [        R                  :X  d  [        SUR                   S35      eU R                  R                  5        H7  u  p4US:X  d  M  [        X S	3S 5      c  M  UR                  S
U SU S35        M9     U(       a@  UR                   (       a.  ["        R$                  " SSR'                  U5      -   [(        5        g g g )NrV   r2   r   zParallelismConfig total_size (z ) does not match num_processes (zJ). Please adjust dp_replicate_size/ dp_shard_size/tp_size/cp_size/sp_size.zParallelismConfig is only compatible DistributedType.FSDP (version 2) or DistributedType.Multi{Device} or DistributedType.DEEPSPEED, but got ._handlerzParallelismConfig.z_handler is set, but z0_size is set to 1. This handler will be ignored.z.ParallelismConfig has the following warnings:

)setmulti_devicer   r   num_processesr   r   rt   is_fsdp2distributed_typer	   	DEEPSPEEDr   r*   getattraddis_main_processwarningswarnjoinUserWarning)r    accelerator	_warningsr   r   s        r!   _validate_accelerator'ParallelismConfig._validate_acceleratorc  s   E	''DOOq,@ ??aNN>;+D+DE ??k)dllQ.>__ 9 9900A B""-";";!< =9:  ??Q  ''++/H/HH b  cn  c  c  b@  @A  B  "&!2!2!4KqyWT](+CTJV(5J;-  XH  I "5 44MMADIIiDXX 59r$   )r   r   r   r   r&   r   r   r   r   r   r   r   )N)r   r   )2__name__
__module____qualname____firstlineno____doc__r   r   r   __annotations__r   r   r   r   r   r   r   r   r   r   r   r
   r   r   r&   r"   r/   propertyr8   rA   rD   rG   rK   r   rP   rS   r4   r5   r>   r?   r@   rb   strrq   ru   r   rl   r   r   r   __static_attributes__ r$   r!   r   r   !   s;   !F (,x}+#'M8C='!GXc]!!GXc]!#'J '!GXc]!'+J$+ :>Jd556=:>Jd667>?CJd;;<CK

   	 	   	 	   h h : : ; ; * * & &             9 9!S !F 8C=  )5sCx%S/!AB )J
X3S 3 3
+r$   r   )r   r   dataclassesr   typingr   r   r   r   accelerate.utils.dataclassesr   r	   r
   r   accelerate.utils.versionsr   
accelerater   r   r   r$   r!   <module>r      sI    
  ! : :  7 & l l lr$   