
    
3jT:                     |   S SK Jr  S SKJrJr  S SKrS SKJr  SSK	J
r
  \(       a   \
" \5      r\ " S S5      5       r\ " S S	5      5       r\" S
S9 " S S5      5       r\" S
S9 " S S5      5       r\\\-  \\\   -  \\S4   -  4   r\\\   -  \\S4   -  r\\\\-  4   rS\S\R2                  S\\   4S jrg)    )	dataclass)TYPE_CHECKINGLiteralN   )
get_loggerc                      \ rS rSr% SrSr\S-  \S'   Sr\S-  \S'   Sr	\
\S'   Sr\S	   \S
'   Sr\R                  R                   R"                  S-  \S'   Sr\
\S'   Sr\
\S'   Sr\\S'   Sr\\S'   Sr\R.                  \S'   Sr\R                  R                   R"                  \S'   Sr\R                  R                   R"                  \S'   Sr\R                  R                   R"                  \S'   Sr\R                  R                   R"                  \S'   Sr\\S'   Sr\\S'   S r\S\ \\4   4S j5       r!\S\ \"\"4   4S j5       r#S\S\S\R.                  S\R                  R                   R"                  4S jr$S r%g)!ContextParallelConfig)   a3
  
Configuration for context parallelism.

Args:
    ring_degree (`int`, *optional*, defaults to `1`):
        Number of devices to use for Ring Attention. Sequence is split across devices. Each device computes
        attention between its local Q and KV chunks passed sequentially around ring. Lower memory (only holds 1/N
        of KV at a time), overlaps compute with communication, but requires N iterations to see all tokens. Best
        for long sequences with limited memory/bandwidth. Number of devices to use for ring attention within a
        context parallel region. Must be a divisor of the total number of devices in the context parallel mesh.
    ulysses_degree (`int`, *optional*, defaults to `1`):
        Number of devices to use for Ulysses Attention. Sequence split is across devices. Each device computes
        local QKV, then all-gathers all KV chunks to compute full attention in one pass. Higher memory (stores all
        KV), requires high-bandwidth all-to-all communication, but lower latency. Best for moderate sequences with
        good interconnect bandwidth.
    convert_to_fp32 (`bool`, *optional*, defaults to `True`):
        Whether to convert output and LSE to float32 for ring attention numerical stability.
    rotate_method (`str`, *optional*, defaults to `"allgather"`):
        Method to use for rotating key/value states across devices in ring attention. Currently, only `"allgather"`
        is supported.
    ulysses_anything (`bool`, *optional*, defaults to `False`):
        Whether to enable "Ulysses Anything" mode, which supports arbitrary sequence lengths and head counts that
        are not evenly divisible by `ulysses_degree`. When enabled, `ulysses_degree` must be greater than 1 and
        `ring_degree` must be 1.
    ring_anything (`bool`, *optional*, defaults to `False`):
        Whether to enable "Ring Anything" mode, which supports arbitrary sequence lengths. When enabled,
        `ring_degree` must be greater than 1 and `ulysses_degree` must be 1.
    mesh (`torch.distributed.device_mesh.DeviceMesh`, *optional*):
        A custom device mesh to use for context parallelism. If provided, this mesh will be used instead of
        creating a new one. This is useful when combining context parallelism with other parallelism strategies
        (e.g., FSDP, tensor parallelism) that share the same device mesh. The mesh must have both "ring" and
        "ulysses" dimensions. Use size 1 for dimensions not being used (e.g., `mesh_shape=(2, 1, 4)` with
        `mesh_dim_names=("ring", "ulysses", "fsdp")` for ring attention only with FSDP).

Nring_degreeulysses_degreeTconvert_to_fp32	allgather)r   alltoallrotate_methodmeshFulysses_anythingring_anything_rank_world_size_device_mesh_flattened_mesh
_ring_mesh_ulysses_mesh_ring_local_rank_ulysses_local_rankc                    U R                   c  SU l         U R                  c  SU l        U R                   S:X  a  U R                  S:X  a  [        S5      eU R                   S:  d  U R                  S:  a  [        S5      eU R                  S:w  a  [	        SU R                   S35      eU R
                  (       a6  U R                  S:X  a  [        S5      eU R                   S:  a  [        S5      eU R                  (       a6  U R                   S:X  a  [        S	5      eU R                  S:  a  [        S
5      eU R
                  (       a  U R                  (       a  [        S5      eg g )N   zfEither ring_degree or ulysses_degree must be greater than 1 in order to use context parallel inferencezF`ring_degree` and `ulysses_degree` must be greater than or equal to 1.r   z=Only rotate_method='allgather' is supported for now, but got .zIulysses_degree must be greater than 1 for ulysses_anything to be enabled.z8ulysses_anything cannot be enabled when ring_degree > 1.zCring_degree must be greater than 1 for ring_anything to be enabled.z8ring_anything cannot be enabled when ulysses_degree > 1.z:ulysses_anything and ring_anything cannot both be enabled.)r   r   
ValueErrorr   NotImplementedErrorr   r   selfs    ]/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/_modeling_parallel.py__post_init__#ContextParallelConfig.__post_init__e   sO   # D&"#Dq T%8%8A%=x  a4#6#6#:eff,%OPTPbPbOccde    ""a' !lmm!# ![\\1$ !fgg""Q& ![\\  T%7%7YZZ &8     returnc                 2    U R                   U R                  4$ N)r   r   r"   s    r$   
mesh_shape ContextParallelConfig.mesh_shape   s      $"5"566r'   c                     g)z$Dimension names for the device mesh.ringulysses r"   s    r$   mesh_dim_names$ContextParallelConfig.mesh_dim_names   s     #r'   rank
world_sizedevicec           	         Xl         X l        X0l        X@l        U R                  U R
                  -  U:  a)  [        SU R
                   SU R                   SU S35      eU R                  S   R                  5       U l        U R                  S   U l	        U R                  S   U l
        U R                  R                  5       U l        U R                  R                  5       U l        g )NzThe product of `ring_degree` (z) and `ulysses_degree` (z") must not exceed the world size (z).r.   r/   r0   )r   r   r   r   r   r   r    _flattenr   r   r   get_local_rankr   r   r#   r4   r5   r6   r   s        r$   setupContextParallelConfig.setup   s    
%
!1!11J>01A1A0BBZ[_[n[nZo  pR  S]  R^  ^`  a   $zz*;<EEG**V,!ZZ	2 $ > > @#'#5#5#D#D#F r'   )r   r   r   r   r   r   r   r   r   r   r   )&__name__
__module____qualname____firstlineno____doc__r   int__annotations__r   r   boolr   r   r   torchdistributeddevice_mesh
DeviceMeshr   r   r   r   r   r6   r   r   r   r   r   r   r%   propertytupler+   strr2   r;   __static_attributes__r1   r'   r$   r	   r	   )   s   "H #Kt"!%NC$J% OT 6AM723A<@D%


'
'
2
2T
9@ #d"M4E3K GU\\ 6:E5((33:@DOU&&22==D;?J!!--88?>BM5$$00;;B c ##[: 7E#s(O 7 7 #c3h # #G# G3 G GEL]L]LiLiLtLt Gr'   r	   c            
       .   \ rS rSr% SrSr\S-  \S'   Sr\	\S'   Sr
\	\S'   Sr\R                  \S'   Sr\R                  R                   R"                  \S'   SS	.S
\	S\	S\R                  S\R                  R                   R"                  S-  4S jjrSrg)ParallelConfig   z
Configuration for applying different parallelisms.

Args:
    context_parallel_config (`ContextParallelConfig`, *optional*):
        Configuration for context parallelism.
Ncontext_parallel_configr   r   r   r   )r   r4   r5   r6   r   c                    Xl         X l        X0l        X@l        U R                  b  U R                  R                  XX45        g g r*   )r   r   r   r   rP   r;   r:   s        r$   r;   ParallelConfig.setup   s@     
%
''3((..tN 4r'   )r   r   r   r   )r=   r>   r?   r@   rA   rP   r	   rC   r   rB   r   r   rE   r6   r   rF   rG   rH   r;   rL   r1   r'   r$   rN   rN      s     =A2T9@E3K GU\\ 6:E5((33: AEOO O 	O ++66=O Or'   rN   T)frozenc                   L    \ rS rSr% Sr\\S'   Sr\S-  \S'   Sr\	\S'   S r
S	rg)
ContextParallelInput   a  
Configuration for splitting an input tensor across context parallel region.

Args:
    split_dim (`int`):
        The dimension along which to split the tensor.
    expected_dims (`int`, *optional*):
        The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
        tensor has the expected number of dimensions before splitting.
    split_output (`bool`, *optional*, defaults to `False`):
        Whether to split the output tensor of the layer along the given `split_dim` instead of the input tensor.
        This is useful for layers whose outputs should be split after it does some preprocessing on the inputs (ex:
        RoPE).
	split_dimNexpected_dimsFsplit_outputc                 V    SU R                    SU R                   SU R                   S3$ )NzContextParallelInput(split_dim=, expected_dims=z, split_output=))rW   rX   rY   r"   s    r$   __repr__ContextParallelInput.__repr__   sK    00@@PQUQcQcPddstx  uF  uF  tG  GH  I  	Ir'   r1   )r=   r>   r?   r@   rA   rB   rC   rX   rY   rD   r]   rL   r1   r'   r$   rU   rU      s,     N $M3:$L$Ir'   rU   c                   >    \ rS rSr% Sr\\S'   Sr\S-  \S'   S rSr	g)ContextParallelOutput   a  
Configuration for gathering an output tensor across context parallel region.

Args:
    gather_dim (`int`):
        The dimension along which to gather the tensor.
    expected_dims (`int`, *optional*):
        The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
        tensor has the expected number of dimensions before gathering.

gather_dimNrX   c                 <    SU R                    SU R                   S3$ )Nz!ContextParallelOutput(gather_dim=r[   r\   )rb   rX   r"   s    r$   r]   ContextParallelOutput.__repr__   s&    24??2CCSTXTfTfSgghiir'   r1   )
r=   r>   r?   r@   rA   rB   rC   rX   r]   rL   r1   r'   r$   r`   r`      s"    	 O $M3:$jr'   r`   .sizegroupr(   c           	         [         R                  " US9n[        [         R                  " US95      nSU;   a  SO[        R
                  R                  5       n[        U5       Vs/ s H'  n[        R                  " SU[        R                  S9PM)     nn[         R                  " U[        R                  " U /U[        R                  S9US9  U Vs/ s H  owS   R                  5       PM     nnU$ s  snf s  snf )zkGather the local size from all ranks.
size: int, local size return: list[int], list of size from all ranks
)rf   cpu)r   )r6   dtyper   )distget_world_sizerK   get_backendrE   acceleratorcurrent_acceleratorrangeemptyint64
all_gathertensoritem)re   rf   r5   comm_backendsgather_device_gathered_sizesss           r$   gather_size_by_commrz   %  s    " $$51J((u56M"m3E9J9J9^9^9`MZ_`jZklZkUVekk$}EKKPZkNlOOdVME ,::>adiik>N:  m ;s   %.C2C7)dataclassesr   typingr   r   rE   torch.distributedrF   rj   utilsr   r=   loggerr	   rN   rU   r`   dictrK   rB   listrJ   ContextParallelInputTypeContextParallelOutputTypeContextParallelModelPlanProcessGrouprz   r1   r'   r$   <module>r      sB  $ " )      
H	 pG pG pGf O O O@ $I I I0 $j j j0  #I#d+?&@@5I]_bIbCccc  2D9N4OORWXmorXrRss   %=@Y%Y YZ V c  $*;*;  S	  r'   