
    
3j,             
          S SK r S SKJr  S SKJr  S SKrS SKJr  SSKJ	r	J
r
  SSKJrJr  SSKJrJrJrJr  SS	KJrJr  SS
KJrJrJr  SSKJr  SSKJr  SSKJrJ r   SSK!J"r"  SSK#J$r$  \RJ                  " \&5      r'S\RP                  S\)\RP                  \RP                  4   S\RP                  4S jr*S\RP                  S\)\RP                  \RP                  4   S\RP                  4S jr+\ " S S\5      5       r, " S S\RZ                  5      r. " S S5      r/ " S S5      r0 " S S\R                  RZ                  \5      r1 " S S \RZ                  5      r2 " S! S"\RZ                  5      r3 " S# S$\"\	\\\\5      r4g)%    N)	dataclass)Any   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)
BaseOutputapply_lora_scaleis_torch_versionlogging   )ContextParallelInputContextParallelOutput)AttentionMixinAttentionModuleMixinFeedForward)dispatch_attention_fn)
CacheMixin))PixArtAlphaCombinedTimestepSizeEmbeddingsPixArtAlphaTextProjection)
ModelMixin)RMSNormxfreqsreturnc                 "   Uu  p#U R                  SS5      R                  S5      u  pE[        R                  " U* U/SS9R	                  S5      nU R                  5       U-  UR                  5       U-  -   R                  U R                  5      nU$ )Nr   )r   r   dim)	unflattenunbindtorchstackflattenfloattodtype)r   r   cossinx_realx_imag	x_rotatedouts           h/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_ltx2.pyapply_interleaved_rotary_embr0   &   s|    HC[[G,33B7NFfWf-26>>qAI779s?Y__.44
8
8
ACJ    c                 6   Uu  p#U R                   nSnU R                  S:w  aD  UR                  S:X  a4  UR                  u  pgpU R                  XhUS5      R	                  SS5      n SnU R                  S   n
U
S-  S:w  a  [        SU
 S	35      eU
S-  nU R                  " / U R                  S S QSPUP76 R                  5       nUS
S S2S S 24   nUS
SS 2S S 24   nUR                  S5      nUR                  S5      nX-  nUS
S S2S S 24   nUS
SS 2S S 24   nUR                  U* U5        UR                  UU5        UR                  " / UR                  S S QU
P76 nU(       a#  UR	                  SS5      R                  WWS5      nUR                  US9nU$ )NF   r      r   Tr   z6Expected x.shape[-1] to be even for split rotary, got ..r(   )
r(   ndimshapereshapeswapaxes
ValueErrorr&   	unsqueezeaddcmul_r'   )r   r   r)   r*   x_dtypeneeds_reshapebht_lastrsplit_xfirst_xsecond_xcos_usin_ur.   	first_out
second_outs                       r/   apply_split_rotary_embrN   .   s   HCggGMvv{sxx1}YY
aIIaAr"++Aq1 772;Dax1}QRVQWWXYZZ	A ii,",q,!,224Gc2A2qj!GsABz"HMM"EMM"E
/CC!QJIS!"aZJvx(w'
++
,syy"~
,t
,Cll1a ((Ar2
&&w&
CJr1   c                   .    \ rS rSr% SrS\S'   S\S'   Srg)AudioVisualModelOutputW   a  
Holds the output of an audiovisual model which produces both visual (e.g. video) and audio outputs.

Args:
    sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
        The hidden states output conditioned on the `encoder_hidden_states` input, representing the visual output
        of the model. This is typically a video (spatiotemporal) output.
    audio_sample (`torch.Tensor` of shape `(batch_size, TODO)`):
        The audio output of the audiovisual model.
ztorch.Tensorsampleaudio_sample N)__name__
__module____qualname____firstlineno____doc____annotations____static_attributes__rT   r1   r/   rP   rP   W   s    	   r1   rP   c                   4  ^  \ rS rSrSrSS\S\S\4U 4S jjjr   SS\R                  S	\
\\R                  4   S-  S
\S-  S\R                  S-  S\\R                  \R                  \R                  \R                  \R                  4   4
S jjrSrU =r$ )LTX2AdaLayerNormSingleh   a  
Norm layer adaptive layer norm single (adaLN-single).

As proposed in PixArt-Alpha (see: https://huggingface.co/papers/2310.00426; Section 2.3) and adapted by the LTX-2.0
model. In particular, the number of modulation parameters to be calculated is now configurable.

Parameters:
    embedding_dim (`int`): The size of each embedding vector.
    num_mod_params (`int`, *optional*, defaults to `6`):
        The number of modulation parameters which will be calculated in the first return argument. The default of 6
        is standard, but sometimes we may want to have a different (usually smaller) number of modulation
        parameters.
    use_additional_conditions (`bool`, *optional*, defaults to `False`):
        Whether to use additional conditions for normalization or not.
embedding_dimnum_mod_paramsuse_additional_conditionsc                    > [         TU ]  5         X l        [        XS-  US9U l        [
        R                  " 5       U l        [
        R                  " XR                  U-  SS9U l	        g )Nr   )size_emb_dimra   Tbias)
super__init__r`   r   embnnSiLUsiluLinearlinear)selfr_   r`   ra   	__class__s       r/   rg   LTX2AdaLayerNormSingle.__init__y   sX    ,<(:Vo
 GGI	ii/B/B]/RY]^r1   Ntimestepadded_cond_kwargs
batch_sizehidden_dtyper   c                     U=(       d    S S S.nU R                   " U40 UDX4S.D6nU R                  U R                  U5      5      U4$ )N)
resolutionaspect_ratiors   rt   )rh   rm   rk   )rn   rq   rr   rs   rt   embedded_timesteps         r/   forwardLTX2AdaLayerNormSingle.forward   sN     .[VZ1[ HHXu1Buzu{{499%678:KKKr1   )rh   rm   r`   rk   )   F)NNN)rU   rV   rW   rX   rY   intboolrg   r#   Tensordictstrr(   tuplerz   r[   __classcell__ro   s   @r/   r]   r]   h   s     	_c 	_3 	__c 	_ 	_ =A!%+/
L,,
L  U\\ 12T9
L $J	
L
 kkD(
L 
u||U\\5<<u||S	T
L 
Lr1   r]   c                   $   \ rS rSrSrSrSrS r    SSSS\R                  S\R                  S-  S	\R                  S-  S
\
\R                  \R                  4   S-  S\
\R                  \R                  4   S-  S\R                  4S jjrSrg)LTX2AudioVideoAttnProcessor   a-  
Processor for implementing attention (SDPA is used by default if you're using PyTorch 2.0) for the LTX-2.0 model.
Compared to the LTX-1.0 model, we allow the RoPE embeddings for the queries and keys to be separate so that we can
support audio-to-video (a2v) and video-to-audio (v2a) cross attention.
Nc                 <    [        SS5      (       a  [        S5      eg N<z2.0zlLTX attention processors require a minimum PyTorch version of 2.0. Please upgrade your PyTorch installation.r   r<   rn   s    r/   rg   $LTX2AudioVideoAttnProcessor.__init__   $    C''~  (r1   attnLTX2Attentionhidden_statesencoder_hidden_statesattention_maskquery_rotary_embkey_rotary_embr   c                    Uc  UR                   OUR                   u  pxn	Ub<  UR                  XHU5      nUR                  XqR                  SUR                   S   5      nUc  UnUR                  b  UR	                  U5      n
UR                  U5      nUR                  U5      nUR                  U5      nUR                  U5      nUR                  U5      nUbW  UR                  S:X  a  [        X5      n[        Xb  UOU5      nO+UR                  S:X  a  [        X5      n[        Xb  UOU5      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  SUR                  S45      n[        UUUUSSU R                  U R                   S9nUR#                  SS5      nUR%                  UR&                  5      nUR                  b\  UR                  SUR                  S45      nS	[(        R*                  " W
5      -  nX.R-                  S5      -  nUR#                  SS5      nUR.                  S
   " U5      nUR.                  S   " U5      nU$ )Nr   interleavedsplitr           F	attn_mask	dropout_p	is_causalbackendparallel_configr          @r   r4   )r9   prepare_attention_maskviewheadsto_gate_logitsto_qto_kto_vnorm_qnorm_k	rope_typer0   rN   r!   r   _attention_backend_parallel_configr%   r'   r(   r#   sigmoidr=   to_out)rn   r   r   r   r   r   r   rs   sequence_lengthrD   gate_logitsquerykeyvaluegatess                  r/   __call__$LTX2AudioVideoAttnProcessor.__call__   sV    $9#@MF[FaFa 	'
Q %!88ZdeN+00ZZ^MaMabdMefN ($1!*--m<K		-(ii-.		/0E"kk#'~~.4UM2+EK[ 7*.uG,SD^.dtuDJJ#34mmA

B/0DJJ#34-$++ 11	
 &--a3%((5*)33A

B7GHM%--44E)OOB,??M)11!Q7MA}5A}5r1   rT   NNNN)rU   rV   rW   rX   rY   r   r   rg   r#   r   r   r   r[   rT   r1   r/   r   r      s      6:.2EICGCC ||C  %||d2	C
 t+C  ell :;dBC ellELL89D@C 
C Cr1   r   c                   P   \ rS rSrSrSrSrS r      SSSS\R                  S\R                  S-  S	\R                  S-  S
\
\R                  \R                  4   S-  S\
\R                  \R                  4   S-  S\R                  S-  S\S-  S\R                  4S jjrSrg)LTX2PerturbedAttnProcessor   zh
Processor which implements attention with perturbation masking and per-head gating for LTX-2.X models.
Nc                 <    [        SS5      (       a  [        S5      eg r   r   r   s    r/   rg   #LTX2PerturbedAttnProcessor.__init__   r   r1   r   r   r   r   r   r   r   perturbation_maskall_perturbedr   c	                    Uc  UR                   OUR                   u  pnUb<  UR                  XJU	5      nUR                  XR                  SUR                   S   5      nUc  UnUR                  b  UR	                  U5      nUR                  U5      nUc  Ub  [        R                  " US:H  5      OSnU(       a  UnGOuUR                  U5      nUR                  U5      nUR                  U5      nUR                  U5      nUbW  UR                  S:X  a  [        X5      n[        Xb  UOU5      nO+UR                  S:X  a  [        X5      n[        Xb  UOU5      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  SUR                  S45      n[!        UUUUSSU R"                  U R$                  S9nUR'                  SS	5      nUR)                  UR*                  5      nUb)  UR'                  SS	5      n[        R,                  " XU5      nUR                  b]  UR                  SUR                  S45      nS
[        R.                  " W5      -  nUUR1                  S5      -  nUR'                  SS	5      nUR2                  S   " U5      nUR2                  S   " U5      nU$ )Nr   r   Fr   r   r   r   r   r   r   r4   )r9   r   r   r   r   r   r#   allr   r   r   r   r   r0   rN   r!   r   r   r   r%   r'   r(   lerpr   r=   r   )rn   r   r   r   r   r   r   r   r   rs   r   rD   r   r   r   r   r   s                    r/   r   #LTX2PerturbedAttnProcessor.__call__   s    $9#@MF[FaFa 	'
Q %!88ZdeN+00ZZ^MaMabdMefN ($1!*--m<K		/0 ARA^EII&71&<=diM!MIIm,E))12CKK&E++c"C+>>]28QE6/I^O_C ^^w.25KE0/I^O_C OOA

B'78E--DJJ#34COOA

B'78E1(// $ 5 5	M *11!Q7M),,U[[9M ,a+ %

5AR S*)33A

B7GHM%--44E)EOOB,??M)11!Q7MA}5A}5r1   rT   )NNNNNN)rU   rV   rW   rX   rY   r   r   rg   r#   r   r   r~   r   r[   rT   r1   r/   r   r      s      6:.2EICG15%)RR ||R  %||d2	R
 t+R  ell :;dBR ellELL89D@R !<<$.R d{R 
R Rr1   r   c                     ^  \ rS rSrSr\r\\/r             SS\	S\	S\	S\	S\
S	\S
\	S-  S\S\S\
S\S\S\4U 4S jjjr    SS\R                  S\R                  S-  S\R                  S-  S\\R                  \R                  4   S-  S\\R                  \R                  4   S-  S\R                  4S jjrSrU =r$ )r   iJ  z
Attention class for all LTX-2.0 attention layers. Compared to LTX-1.0, this supports specifying the query and key
RoPE embeddings separately for audio-to-video (a2v) and video-to-audio (v2a) cross-attention.
N	query_dimr   kv_headsdim_headdropoutre   cross_attention_dimout_biasqk_normnorm_epsnorm_elementwise_affiner   apply_gated_attentionc                   > [         TU ]  5         U	S:w  a  [        S5      eX@l        XB-  U l        Uc  U R                  OXC-  U l        Xl        Ub  UOUU l        X`l        XPl	        Xl
        X l        Xl        [        R                  R                  XB-  XS9U l        [        R                  R                  XC-  XS9U l        [        R                  R%                  XR                  US9U l        [        R                  R%                  U R                  U R
                  US9U l        [        R                  R%                  U R                  U R
                  US9U l        [        R                  R-                  / 5      U l        U R.                  R1                  [        R                  R%                  U R                  U R                  US95        U R.                  R1                  [        R                  R3                  U5      5        U(       a$  [        R                  R%                  XSS9U l        OS U l        Uc  U R7                  5       nU R9                  U5        g )Nrms_norm_across_headszIOnly 'rms_norm_across_heads' is supported as a valid value for `qk_norm`.epselementwise_affinerd   T)rf   rg   NotImplementedErrorhead_dim	inner_diminner_kv_dimr   r   use_biasr   out_dimr   r   r#   ri   r   r   r   rl   r   r   r   
ModuleListr   appendDropoutr   _default_processor_clsset_processor)rn   r   r   r   r   r   re   r   r   r   r   r   r   r   	processorro   s                  r/   rg   LTX2Attention.__init__S  s   " 	--%&qrr !).6.>DNNHDW":M:Y#6_h  
"hh&&x'7X&rhh&&x':&uHHOOI~~DOI	HHOOD$<$<d>O>OVZO[	HHOOD$<$<d>O>OVZO[	hh))"-588??4>>4<<h?WX588++G45 "'((//)/"ND"&D335I9%r1   r   r   r   r   r   r   c                    [        [        R                  " U R                  R                  5      R
                  R                  5       5      nUR                  5        VV	s/ s H  u  pX;  d  M  UPM     n
nn	[        U
5      S:  a:  [        R                  SU
 SU R                  R                  R                   S35        UR                  5        VVs0 s H  u  pX;   d  M  X_M     nnnU R                  " XX#XE40 UD6nU$ s  sn	nf s  snnf )Nr   zattention_kwargs z are not expected by z and will be ignored.)setinspect	signaturer   r   
parameterskeysitemslenloggerwarningro   rU   )rn   r   r   r   r   r   kwargsattn_parameterskrD   unused_kwargsws               r/   rz   LTX2Attention.forward  s     g//0G0GHSSXXZ['-||~R~tq9Q~R}!NN#M?2GH`H`HiHiGjj  A $*<<>J>41Q5I$!$>J!6HX
lr
  S
 Ks   D.DD"D)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )   r   @   r   TNTr   ư>Tr   FNr   )rU   rV   rW   rX   rY   r   r   r   _available_processorsr}   r&   r~   r   rg   r#   r   r   rz   r[   r   r   s   @r/   r   r   J  sn   
 98:TU
 *..(,&&+1&1& 1& 	1&
 1& 1& 1& !4Z1& 1& 1& 1& "&1& 1&  $1& 1&l 6:.2EICG||  %||d2 t+	
  ell :;dB ellELL89D@ 
 r1   r   c            7       H  ^  \ rS rSrSr            S9S\S\S\S\S\S\S	\S
\S\S\S\S\S\S\S\S\S\S\S\4&U 4S jjjr	\
S\R                  S\R                  S\S\\R                  S4   4S j5       r                S:S\R                  S\R                  S \R                  S!\R                  S\R                  S"\R                  S#\R                  S$\R                  S%\R                  S&\R                  S'\R                  S-  S(\R                  S-  S)\\R                  \R                  4   S-  S*\\R                  \R                  4   S-  S+\\R                  \R                  4   S-  S,\\R                  \R                  4   S-  S-\R                  S-  S.\R                  S-  S/\R                  S-  S0\R                  S-  S1\R                  S-  S2\R                  S-  S3\S4\S5\R                  S-  S6\S-  S\R                  46S7 jjrS8rU =r$ );LTX2VideoTransformerBlocki  am  
Transformer block used in [LTX-2.0](https://huggingface.co/Lightricks/LTX-Video).

Args:
    dim (`int`):
        The number of channels in the input and output.
    num_attention_heads (`int`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`):
        The number of channels in each head.
    qk_norm (`str`, defaults to `"rms_norm"`):
        The normalization layer to use.
    activation_fn (`str`, defaults to `"gelu-approximate"`):
        Activation function to use in feed-forward.
    eps (`float`, defaults to `1e-6`):
        Epsilon value for normalization layers.
r    num_attention_headsattention_head_dimr   	audio_dimaudio_num_attention_headsaudio_cross_attention_dimvideo_gated_attnvideo_cross_attn_adalnaudio_gated_attnaudio_cross_attn_adalnr   activation_fnattention_biasattention_out_biasr   r   r   perturbed_attnc                   > [         TU ]  5         UU l        U(       a  [        nO[        n[        UUUS9U l        [        UUUUUS UUUU	U" 5       S9U l        [        UUUS9U l	        [        UUUUUS UUUUU" 5       S9U l
        [        UUUS9U l        [        UUUUUUUUUU	U" 5       S9U l        [        UUUS9U l        [        UUUUUUUUUUU" 5       S9U l        [        UUUS9U l        [        UUUUUUUUUU	U" 5       S9U l        [        UUUS9U l        [        UUUUUUUUUUU" 5       S9U l        [        UUUS9U l        [)        XS9U l        [        UUUS9U l        [)        X^S9U l        Xl        Xl        U R0                  (       a  SOSnU R2                  (       a  SOSn[4        R6                  " [8        R:                  " UU5      US-  -  5      U l        [4        R6                  " [8        R:                  " UU5      US-  -  5      U l        U
=(       d    UU l         U R@                  (       a`  [4        R6                  " [8        R:                  " SU5      5      U l!        [4        R6                  " [8        R:                  " SU5      5      U l"        [4        R6                  " [8        R:                  " S	U5      5      U l#        [4        R6                  " [8        R:                  " S	U5      5      U l$        g )
Nr   )r   r   r   r   re   r   r   r   r   r   r   )r   r   r   r   r   re   r   r   r   r   r   )r   	   r|         ?r      )%rf   rg   r  r   r   r   norm1r   attn1audio_norm1audio_attn1norm2attn2audio_norm2audio_attn2audio_to_video_normaudio_to_video_attnvideo_to_audio_normvideo_to_audio_attnnorm3r   ffaudio_norm3audio_ffr   r   ri   	Parameterr#   randnscale_shift_tableaudio_scale_shift_tablecross_attn_adalnprompt_scale_shift_tableaudio_prompt_scale_shift_table&video_a2v_cross_attn_scale_shift_table&audio_a2v_cross_attn_scale_shift_table)rn   r    r   r   r   r   r   audio_attention_head_dimr   r   r   r   r   r   r   r   r  r   r   r   r  attn_processor_clsvideo_mod_param_numaudio_mod_param_numro   s                           r/   rg   "LTX2VideoTransformerBlock.__init__  s   . 	,!;!< Sc>PQ
"%(' $'"2(*

 #9#J\](+.- $'"2(*
 Sc>PQ
" 3%(''"2(*

 #9#J\]( 9+.-'"2(*
  $+3CL^#_ #0 )+.-'"2(*$
  $+9#Rd#e #0 #+.-'"2(*$
  Sc>PQ
c?"9#J\]#IK
 '=#&<##'#>#>aA#'#>#>aA!#ekk:Ms.SVY[^V^.^!_')||EKK@SU^4_bkmpbp4p'q$ !7 P:P  ,.LLQ9L,MD)24,,u{{1i?X2YD/ 79ll5;;qRUCV6W368ll5;;qR[C\6]3r1   r  tembrs   r   .c                     U R                   S   nU S   R                  UR                  5      UR                  X!R                   S   US5      -   nUR	                  SS9nU$ )Nr   NNr4   r   r   r   )r9   r'   devicer:   r"   )r  r%  rs   num_ada_params
ada_values
ada_paramss         r/   get_mod_params(LTX2VideoTransformerBlock.get_mod_paramsH  sh     +003&z255dkkBT\\

1~rF
 

  &&1&-
r1   Nr   audio_hidden_statesr   audio_encoder_hidden_states
temb_audiotemb_ca_scale_shifttemb_ca_audio_scale_shifttemb_ca_gatetemb_ca_audio_gatetemb_prompttemb_prompt_audiovideo_rotary_embaudio_rotary_embca_video_rotary_embca_audio_rotary_embencoder_attention_maskaudio_encoder_attention_maskself_attention_maskaudio_self_attention_maska2v_cross_attention_maskv2a_cross_attention_maskuse_a2v_cross_attentionuse_v2a_cross_attentionr   r   c                 	   UR                  S5      nU R                  U R                  UU5      nUS S u  nnnn n!n"U R                  (       a	  USS u  n#n$n%U R	                  U5      n&U&SU-   -  U-   n&U&S UUS.n'U R
                  (       a
  UU'S'   UU'S'   U R                  " S0 U'D6n(UU(U-  -   nU R                  U R                  UU5      n)U)S S u  n*n+n,n-n.n/U R                  (       a	  U)SS u  n0n1n2U R                  U5      n3U3SU+-   -  U*-   n3U3S UUS.n4U R
                  (       a
  UU4S'   UU4S'   U R                  " S0 U4D6n5UU5U,-  -   nU R                  (       aD  U R                  U R                  UU5      n6U6u  n7n8U R                  U R                  UU5      n9U9u  n:n;U R                  U5      n&U R                  (       a  U&SW$-   -  W#-   n&U R                  (       a  USW8-   -  W7-   nU R                  U&US US9n(U R                  (       a  U(W%-  n(UU(-   nU R!                  U5      n3U R                  (       a  U3SW1-   -  W0-   n3U R                  (       a  USW;-   -  W:-   nU R#                  U3US US9n5U R                  (       a  U5W2-  n5UU5-   nU(       d  U(       Ga  U R%                  U5      n&U R'                  U5      n3U R(                  S S	2S S 24   n<U R(                  S	S 2S S 24   n=U R                  U<UU5      n>U R                  U=U	U5      n?U>u  n@nAnBnCU?S   R+                  S
5      nDU R,                  S S	2S S 24   nEU R,                  S	S 2S S 24   nFU R                  UEUU5      nGU R                  UFU
U5      nHUGu  nInJnKnLUHS   R+                  S
5      nMU(       am  U&SW@R+                  S
5      -   -  WAR+                  S
5      -   nNU3SWIR+                  S
5      -   -  WJR+                  S
5      -   nOU R/                  UNUOUUUS9nPUWDUP-  -   nU(       am  U&SWBR+                  S
5      -   -  WCR+                  S
5      -   nNU3SWKR+                  S
5      -   -  WLR+                  S
5      -   nOU R1                  UOUNUUUS9nQUWMUQ-  -   nU R3                  U5      SU!-   -  U -   n&U R5                  U&5      nRUURU"-  -   nU R7                  U5      SU.-   -  U--   n3U R9                  U35      nSUUSU/-  -   nX4$ )Nr   r|   r  r4   )r   r   r   r   r   r   )r   r   r   r3   r   )r   r   r   r   rT   )sizer,  r  r   r  r  r  r  r   r	  r
  r  r  r  r  r  r  r  r  r  r  squeezer  r  r  r  r  r  r  )Trn   r   r.  r   r/  r%  r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  r   r   rs   video_ada_params	shift_msa	scale_msagate_msa	shift_mlp	scale_mlpgate_mlpshift_text_qscale_text_qgate_text_qnorm_hidden_statesvideo_self_attn_argsattn_hidden_statesaudio_ada_paramsaudio_shift_msaaudio_scale_msaaudio_gate_msaaudio_shift_mlpaudio_scale_mlpaudio_gate_mlpaudio_shift_text_qaudio_scale_text_qaudio_gate_text_qnorm_audio_hidden_statesaudio_self_attn_argsattn_audio_hidden_statesvideo_prompt_ada_paramsshift_text_kvscale_text_kvaudio_prompt_ada_paramsaudio_shift_text_kvaudio_scale_text_kvvideo_per_layer_ca_scale_shiftvideo_per_layer_ca_gatevideo_ca_ada_paramsvideo_ca_gate_paramvideo_a2v_ca_scalevideo_a2v_ca_shiftvideo_v2a_ca_scalevideo_v2a_ca_shifta2v_gateaudio_per_layer_ca_scale_shiftaudio_per_layer_ca_gateaudio_ca_ada_paramsaudio_ca_gate_paramaudio_a2v_ca_scaleaudio_a2v_ca_shiftaudio_v2a_ca_scaleaudio_v2a_ca_shiftv2a_gatemod_norm_hidden_statesmod_norm_audio_hidden_statesa2v_attn_hidden_statesv2a_attn_hidden_states	ff_outputaudio_ff_outputsT                                                                                       r/   rz   !LTX2VideoTransformerBlock.forwardS  sx   : #''*
  ..t/E/EtZXIYZ\[\I]F	9h	9h&&6Fq6K3L,!ZZ6/1y=AIM 0%) 01	 
 8I !454A 1!ZZ?*>?%(:X(EE  ..t/K/KZYcdRa  	k./?\j &&HXYZ[\H]E 24E#'#3#34G#H #;q??R#SVe#e  6%) 07	 
 8I !454A 1#'#3#3#K6J#K 14L~4]]   &*&9&9$:W:WYdfp&q#+B(M=&*&9&9335F
'# 8O4!4 "ZZ6&&!3q<7G!H<!W  $9Q=N$OR_$_!!ZZ"7!1	 ( 
 &&!3k!A%(:: $(#3#34G#H &&'?1GYCY'Z]o'o$  *EM`I`*adw*w'#'#3#3$"=!7	 $4 $
  &&'?BS'S$14LL #&=!%!9!9-!H'+'?'?@S'T$ .2-X-XY[Z[Y[]^Y^-_*&*&Q&QRSRTVWRW&X#"&"5"56TViku"v"&"5"56M|]g"h]pZ 24FHZ*1-55a8H .2-X-XY[Z[Y[]^Y^-_*&*&Q&QRSRTVWRW&X#"&"5"5.0I:# #'"5"56MOacm"n]pZ 24FHZ*1-55a8H ');*22155*&..q1*2& 0H*221550&..q102, *.)A)A**F%8#6#; *B *& !.;Q0Q Q ');*22155*&..q1*2& 0H*221550&..q102, *.)A)A0*@%8#6#; *B *& ':HG]<]&]# "ZZ6!i-H9TGG./	%	H(<<#'#3#34G#HAP_L_#`cr#r --(@A1On4TT11r1   )r  r  r  r
  r  r   r  r	  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  )FFFFr   gelu-approximateTTr   Fr   F)NNNNNNNNNNNNTTNN)rU   rV   rW   rX   rY   r}   r~   r   r&   rg   staticmethodr#   r   r   r,  rz   r[   r   r   s   @r/   r   r     s   8 "'',!&',./##'#(&$+W^W^ !W^  	W^
 !W^ W^ $'W^ $'W^ W^ !%W^ W^ !%W^ W^ W^  !W^" !#W^$ %W^& !'W^( )W^* +W^ W^r  <</4||IL	u||S 	! , ,015EIEIHLHL6:<@379=8<8<(,(,15%)7E2||E2 #\\E2  %||	E2
 &+\\E2 llE2 LLE2 #\\E2 $)<<E2 llE2 "LLE2 \\D(E2 !<<$.E2  ell :;dBE2  ell :;dBE2  #5<<#=>E!E2" #5<<#=>E#E2$ !&t 3%E2& ',llT&9'E2( #\\D0)E2* $)<<$#6+E2, #(,,"5-E2. #(,,"5/E20 "&1E22 "&3E24 !<<$.5E26 d{7E28 
9E2 E2r1   r   c            !         ^  \ rS rSrSr              S#S\S\S\S\S\S\S	\S
\S\\S4   S\S\S\S\	S\S\SS4 U 4S jjjr
 S$S\S\S\S\S\R                  S\S\R                  4S jjr S%S\S\S\R                  S\S\R                  4
S jjrS r S&S \R                  S\\R                  -  S-  S\\R                  \R                  4   4S! jjrS"rU =r$ )'LTX2AudioVideoRotaryPosEmbedi  a`  
Video and audio rotary positional embeddings (RoPE) for the LTX-2.0 model.

Args:
    causal_offset (`int`, *optional*, defaults to `1`):
        Offset in the temporal axis for causal VAE modeling. This is typically 1 (for causal modeling where the VAE
        treats the very first frame differently), but could also be 0 (for non-causal modeling).
r    
patch_sizepatch_size_tbase_num_framesbase_height
base_widthsampling_rate
hop_lengthscale_factors.thetacausal_offsetmodalitydouble_precisionr   r   r   Nc                   > [         TU ]  5         Xl        X l        X0l        US;  a  [        SU< S35      eXl        X@l        Xl        XPl	        X`l
        Xpl        Xl        [        U5      [        U5      -  [        U	S   5      -  U l        Xl        Xl        Xl        Xl        U R$                  S;  a  [        SU S35      eXl        g )N)r   r   z
rope_type=z9 not supported. Choose between 'interleaved' and 'split'.r   )videoaudioz	Modality z@ is not supported. Supported modalities are `video` and `audio`.)rf   rg   r    r  r  r<   r   r  r   r  r  r  r  r&   audio_latents_per_secondr  r  r  r  r  )rn   r    r  r  r  r  r  r  r  r  r  r  r  r  r   r   ro   s                   r/   rg   %LTX2AudioVideoRotaryPosEmbed.__init__%  s    $ 	$(44
	|+deff".#6  '$ +$(-m(<uZ?P(PSXYfghYiSj(j%*
* == 22y
2rstt 0r1   rs   
num_framesheightwidthr(  fpsc                    [         R                  " SX R                  [         R                  US9n[         R                  " SX0R                  [         R                  US9n[         R                  " SX@R                  [         R                  US9n	[         R
                  " XxU	SS9n
[         R                  " U
SS9n
U R                  U R                  U R                  4n[         R                  " XR                  U
R                  S9nXR                  SSSS5      -   n[         R                  " X/S	S9nUR                  SS5      nUR                  S5      R                  USSS5      n[         R                  " U R                  UR                  S
9nS/UR                  -  nS	US'   XR                  " U6 -  nUSS2SS4   U R                   -   U R                  S   -
  R#                  SS9USS2SS4'   USS2SS4   U-  USS2SS4'   U$ )am  
Create per-dimension bounds [inclusive start, exclusive end) for each patch with respect to the original pixel
space video grid (num_frames, height, width). This will ultimately have shape (batch_size, 3, num_patches, 2)
where
    - axis 1 (size 3) enumerates (frame, height, width) dimensions (e.g. idx 0 corresponds to frames)
    - axis 3 (size 2) stores `[start, end)` indices within each dimension

Args:
    batch_size (`int`):
        Batch size of the video latents.
    num_frames (`int`):
        Number of latent frames in the video latents.
    height (`int`):
        Latent height of the video latents.
    width (`int`):
        Latent width of the video latents.
    device (`torch.device`):
        Device on which to create the video grid.

Returns:
    `torch.Tensor`:
        Per-dimension patch boundaries tensor of shape [batch_size, 3, num_patches, 2].
r   startendstepr(   r(  ij)indexingr   )r(   r(  r   r4   r   r(  N.min)r#   aranger  float32r  meshgridr$   tensorr(   r(  r   r%   r=   repeatr  r8   r  clamp)rn   rs   r  r  r  r(  r  grid_fgrid_hgrid_wgridr  patch_size_delta
patch_endslatent_coordsscale_tensorbroadcast_shapepixel_coordss                     r/   prepare_video_coords1LTX2AudioVideoRotaryPosEmbed.prepare_video_coordsV  s   F A:<M<MUZUbUbkqrA6u}}eklA5emmdjk~~fftD{{4Q' ''$//J
 <<
**T[[Y11!Q1==
 T$6B?%--a3%//299*aAN ||D$6$6}?S?ST# 2 22$'8'8/'JJ $01c	#:T=O=O#ORVRdRdefRg#g"n"nst"n"uQ3Y #/q!Sy"9C"?Q3Yr1   shiftc                 (   [         R                  " XBU-   U R                  [         R                  US9nU R                  S   nXV-  nXpR
                  -   U-
  R                  SS9nXpR                  -  U R                  -  nXPR                  -   U-  n	XR
                  -   U-
  R                  SS9n	XR                  -  U R                  -  n
[         R                  " X/SS9nUR                  S5      R                  USS5      nUR                  S5      nU$ )af  
Create per-dimension bounds [inclusive start, exclusive end) of start and end timestamps for each latent frame.
This will ultimately have shape (batch_size, 3, num_patches, 2) where
    - axis 1 (size 1) represents the temporal dimension
    - axis 3 (size 2) stores `[start, end)` indices within each dimension

Args:
    batch_size (`int`):
        Batch size of the audio latents.
    num_frames (`int`):
        Number of latent frames in the audio latents.
    device (`torch.device`):
        Device on which to create the audio grid.
    shift (`int`, *optional*, defaults to `0`):
        Offset on the latent indices. Different shift values correspond to different overlapping windows with
        respect to the same underlying latent grid.

Returns:
    `torch.Tensor`:
        Per-dimension patch boundaries tensor of shape [batch_size, 1, num_patches, 2].
r  r   r  r   r   r4   )r#   r  r  r  r  r  clipr  r  r$   r=   expand)rn   rs   r  r(  r  r  audio_scale_factorgrid_start_melgrid_start_sgrid_end_mel
grid_end_saudio_coordss               r/   prepare_audio_coords1LTX2AudioVideoRotaryPosEmbed.prepare_audio_coords  s   > %/d6G6Gu}}ek

 "//24(+=+==@RRXX]^X_%7$:L:LL !2!226HH$'9'99<NNTTYZT[!OO3d6H6HH
{{L#=2F#--a077
BK#--a0r1   c                     U R                   S:X  a  U R                  " U0 UD6$ U R                   S:X  a  U R                  " U0 UD6$ g )Nr  r  )r  r  r  )rn   argsr   s      r/   prepare_coords+LTX2AudioVideoRotaryPosEmbed.prepare_coords  sK    ==G#,,d=f==]]g%,,d=f== &r1   coordsc                 V   U=(       d    UR                   nUR                  S   nUR                  S:X  a*  UR                  SSS9u  pEXE-   S-  nUR	                  S5      nU R
                  S:X  a$  U R                  U R                  U R                  4nOU R
                  S:X  a  U R                  4n[        R                  " [        U5       Vs/ s H  oqS S 2U4   WU   -  PM     snSS9R                  U5      nUS-  n	U R                  (       a  [        R                  O[        R                  n
[        R                   " U R"                  [        R$                  " S	S
U R&                  U	-  XS95      nU[        R(                  -  S-  R                  [        R                  S9nUR+                  S5      S-  S-
  U-  nUR-                  SS5      R/                  S5      nU R0                  S:X  a  UR3                  5       R5                  SSS9nUR7                  5       R5                  SSS9nU R&                  U	-  S:w  a  [        R8                  " US S 2S S 2S U R&                  U	-  24   5      n[        R:                  " US S 2S S 2S U R&                  U	-  24   5      n[        R<                  " X/SS9n[        R<                  " UU/SS9nX4$ U R0                  S:X  GaE  U R&                  S-  nUR                  S   nUU-
  nUR3                  5       nUR7                  5       nUS:w  ar  [        R8                  " US S 2S S 2S U24   5      n[        R:                  " US S 2S S 2S U24   5      n[        R>                  " UU/SS9n[        R>                  " UU/SS9nUR                  S   nUR                  S   nURA                  UUU RB                  S5      nURA                  UUU RB                  S5      n[        RD                  " USS5      n[        RD                  " USS5      nWW4$ s  snf )Nr4   r3   r   r   r   r   r  r  r   g      ?)r  r  stepsr(   r(  r7   r6   r   r   r   )axis)#r(  r9   r8   chunkrE  r  r  r  r  r#   r$   ranger'   r  float64r  powr  linspacer    pir=   	transposer%   r   r)   repeat_interleaver*   	ones_like
zeros_likecatconcatenater:   r   r;   )rn   r  r(  num_pos_dimscoords_start
coords_endmax_positionsir  num_rope_elemsfreqs_dtypepow_indicesr   	cos_freqs	sin_freqscos_paddingsin_paddingexpected_freqscurrent_freqspad_sizecos_freqsin_freqrA   rC   s                           r/   rz   $LTX2AudioVideoRotaryPosEmbed.forward  s    (6== ||A ;;!'-||A2|'>$L"/36F^^B'F ==G#!1143C3CT__UM]]g%!113M{{U<EXYEX1a4L=+;;EXY_abeeflm%) (,'<'<emm%--iiJJNN#TXX5OWbr
 uxx'#-111F #a'!+u4B'//2
 >>]*		55aR5@I		55aR5@Ixx.(A-#ooi1>Y>@Y>Y8Y.Z[#..yA?ZNAZ?Z9Z/[\!II{&>BG	!II{I&>BG	4 ##1 ^^w&!XX]N!KKOM%5Hyy{Hyy{H1}#oohq!YhY.GH#..x1ixi/HI ,,k8-D2N ,,k8-D2N q!Aq!A''1d.F.FKH''1d.F.FKHxA6IxA6I)##m Zs   P&)r  r  r  r  r  r    r  r  r  r   r  r  r   r  r  r  )r4   r4         r  >     r       r       @r4   r  Tr   r  )      8@)r   )N)rU   rV   rW   rX   rY   r}   r   r&   r   r~   rg   r#   r(  r   r  r  r  rz   r[   r   r   s   @r/   r  r    s    !")4!%&#%!/1/1 /1 	/1
 /1 /1 /1 /1 /1 S#X/1 /1 /1 /1 /1 /1  !!/1" 
#/1 /1p DD D 	D
 D D D 
DV 44 4 	4
 4 
4l> IMK$llK$,/%,,,>,EK$	u||U\\)	*K$ K$r1   r  c            V         ^  \ rS rSrSrSrS/rS/r\" SSSS	9\" SSSS	9\" SS
SS	9S.\" SSSS	9\" SSSS	9S.\	" SSS9S.r
\                                          STS\S\S-  S\S\S\S\S\S\\\\4   S\S\S\S\S\S\S\S-  S\S \S!\S"\S#\S$\S%\S&\S'\S(\S)\S*\S+\S,\S-\S.\S/\S0\S1\S2\S3\S4\S5\S6\S7\S8\S9S4TU 4S: jjj5       r\" S;5                        SUS<\R(                  S=\R(                  S>\R(                  S?\R(                  S@\R*                  SA\R*                  S-  SB\R(                  S-  SC\R(                  S-  SD\R(                  S-  SE\R(                  S-  SF\S-  SG\S-  SH\S-  SI\SJ\S-  SK\R(                  S-  SL\R(                  S-  SM\SN\\   S-  SO\R(                  S-  SP\S;\\\4   S-  SQ\S9\R(                  40SR jj5       rSSrU =r$ )VLTX2VideoTransformer3DModeli&  a  
A Transformer model for video-like data used in [LTX](https://huggingface.co/Lightricks/LTX-Video).

Args:
    in_channels (`int`, defaults to `128`):
        The number of channels in the input.
    out_channels (`int`, defaults to `128`):
        The number of channels in the output.
    patch_size (`int`, defaults to `1`):
        The size of the spatial patches to use in the patch embedding layer.
    patch_size_t (`int`, defaults to `1`):
        The size of the tmeporal patches to use in the patch embedding layer.
    num_attention_heads (`int`, defaults to `32`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`, defaults to `64`):
        The number of channels in each head.
    cross_attention_dim (`int`, defaults to `2048 `):
        The number of channels for cross attention heads.
    num_layers (`int`, defaults to `28`):
        The number of layers of Transformer blocks to use.
    activation_fn (`str`, defaults to `"gelu-approximate"`):
        Activation function to use in feed-forward.
    qk_norm (`str`, defaults to `"rms_norm_across_heads"`):
        The normalization layer to use.
Tnormr   r4   r   F)	split_dimexpected_dimssplit_outputr   )r   r   r;  )r   r4   )
gather_dimr  ) ropeproj_outin_channelsout_channelsNr  r  r   r   r   vae_scale_factorspos_embed_max_posr  r  
gated_attncross_attn_modaudio_in_channelsaudio_out_channelsaudio_patch_sizeaudio_patch_size_tr   r   r   r  audio_pos_embed_max_posaudio_sampling_rateaudio_hop_lengthr   audio_cross_attn_mod
num_layersr   r   r   r   caption_channelsr   r  
rope_thetarope_double_precisionr  timestep_scale_multiplier$cross_attn_timestep_scale_multiplierr   r  r   c+                 *  > [         T1U ]  5         U=(       d    UnU=(       d    UnXV-  n+UU-  n,[        R                  " UU+5      U l        [        R                  " UU,5      U l        U)(       a  [        U U+S9U l        [        U U,S9U l        U=(       d    UU l	        U(       a  SOSn-U(       a  SOSn.[        U+U-SS9U l        [        U,U.SS9U l        [        U+SSS9U l        [        U,SSS9U l        [        U+SSS9U l        [        U,SSS9U l        [        R"                  " [$        R&                  " SU+5      U+S	-  -  5      U l        [        R"                  " [$        R&                  " SU,5      U,S	-  -  5      U l        U R                  (       a   [        U+SSS9U l        [        U,SSS9U l        [1        U+UUU	U
UUU#U%S
U$U(US9U l        [1        U,UUUUUU/U#U%SU$U(US9U l        [7        U	U5      n/[1        UUUU/U
UU#U%S
U$U(US9U l        [1        UUUU/UUU#U%SU$U(US9U l        [        R<                  " [?        U5       V0s/ s HH  n0[A        S&0 SU+_SU_SU_SU_SU,_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU!_SU"_S U_S!U_S"U(_S#U*_6PMJ     sn05      U l!        [        RD                  " U+S$SS%9U l#        [        R                  " U+U5      U l$        [        RD                  " U,S$SS%9U l%        [        R                  " U,U5      U l&        SU l'        g s  sn0f )'N)in_featureshidden_sizer  r|   F)r`   ra   r3   r4   r   r  r  )r    r  r  r  r  r  r  r  r  r  r  r   r   r  )r    r  r  r  r  r  r  r  r  r  r  r   r   )r    r  r  r  r  r  r  r  r  r  r   r   )r    r  r  r  r  r  r  r  r  r  r   r   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r  r   r   rT   )(rf   rg   ri   rl   proj_inaudio_proj_inr   caption_projectionaudio_caption_projectionprompt_modulationr]   
time_embedaudio_time_embedav_cross_attn_video_scale_shiftav_cross_attn_audio_scale_shiftav_cross_attn_video_a2v_gateav_cross_attn_audio_v2a_gater  r#   r  r  r  prompt_adalnaudio_prompt_adalnr  r  
audio_ropemaxcross_attn_ropecross_attn_audio_roper   r  r   transformer_blocks	LayerNormnorm_outr  audio_norm_outaudio_proj_outgradient_checkpointing)2rn   r  r  r  r  r   r   r   r  r  r  r  r  r  r  r  r   r  r   r   r   r  r  r  r  r   r  r  r   r   r   r   r  r   r  r  r	  r  r
  r  r   use_prompt_embeddingsr  r   audio_inner_dimvideo_time_emb_mod_paramsaudio_time_emb_mod_paramscross_attn_pos_embed_max_posrD   ro   s2                                                    r/   rg   $LTX2VideoTransformer3DModel.__init__S  s   \ 	#2{/D3D'<	36NN yyi8YY'8/J !&?L\js&tD#,E,/-D)
 "0!G3G *8AQ!)=A1!0&?[`
 !7,Eaf!
 0Fa50
, 0FA0
,
 -Ca5-
)
 -CA-
)
 "$ekk!Y.G)UX..X!Y')||EKK?4SVegjVj4j'k$ !! 6yQRns tD&<U'D# 1!%-#!+'2 3
	 7'+3-'-.'2 9
" (++<>U'V$;)!%8#!'2 3 
 &B)'+8-''2 9&
"  #%--0 z*/. +A- * !(; (: )<	
 . /H .F /H &0 ,: &6 ,@ $ #0 $2  (:!" !#$ (?%& ('( $2), +/#
: YDUS		)\: ll?Y^_ ii9KL&+#Es   ?ALattention_kwargsr   r.  r   r/  rq   audio_timestepsigmaaudio_sigmar;  r<  r  r  r  r  audio_num_framesvideo_coordsr  isolate_modalitiesspatio_temporal_guidance_blocksr   use_cross_timestepreturn_dictc                    Ub  UOUnUb  UOUnU	bB  U	R                   S:X  a2  SU	R                  UR                  5      -
  S-  n	U	R                  S5      n	U
bB  U
R                   S:X  a2  SU
R                  UR                  5      -
  S-  n
U
R                  S5      n
UR	                  S5      nUc&  U R
                  R                  UXXR                  US9nUc&  U R                  R                  UXR                  5      nU R                  UUR                  S9nU R                  UUR                  S9nU R                  USS2SS2SS24   UR                  S9nU R                  USS2SS2SS24   UR                  S9nU R                  U5      nU R                  U5      nU R                  R                  U R                  R                   -  nU R#                  UR%                  5       UUR                  S9u  nnUR'                  US	UR	                  S	5      5      nUR'                  US	UR	                  S	5      5      nU R)                  UR%                  5       UUR                  S9u  n n!U R'                  US	U R	                  S	5      5      n U!R'                  US	U!R	                  S	5      5      n!U R*                  (       a  U R-                  UR%                  5       UUR                  S9u  n"n#U R/                  UR%                  5       UUR                  S9u  n$n#U"R'                  US	U"R	                  S	5      5      n"U$R'                  US	U$R	                  S	5      5      n$OS=n"n$U(       a  UR%                  5       OUR%                  5       n%U R1                  U%UUR                  S9u  n&n#U R3                  U%U-  UUR                  S9u  n'n#U&R'                  US	U&R4                  S	   5      n&U'R'                  US	U'R4                  S	   5      n'U(       a  UR%                  5       OUR%                  5       n(U R7                  U(UUR                  S9u  n)n#U R9                  U(U-  UUR                  S9u  n*n#U)R'                  US	U)R4                  S	   5      n)U*R'                  US	U*R4                  S	   5      n*U R                  R:                  (       af  U R=                  U5      nUR'                  US	UR	                  S	5      5      nU R?                  U5      nUR'                  US	UR	                  S	5      5      nU=(       d    / n[A        U5      S:  a  Uc  [B        RD                  " U45      nUb  UR                   S:X  a
  USS2SS4   nUb  [B        RF                  " US:H  5      OS
n+[I        U5      n,[K        U RL                  5       H  u  n-n.U-U,;   a  UOSn/U-U,;   a  U+OS
n0[B        RN                  " 5       (       aJ  U RP                  (       a9  U RS                  U.UUUUUU U&U)U'U*U"U$UUUUU	U
SSSSU(       + U(       + U/U05      u  pM~  U." S'0 SU_SU_SU_SU_SU_SU _SU&_SU)_SU'_SU*_SU"_SU$_SU_SU_SU_SU_SU	_SU
_SS_SS_SS_S S_S!U(       + _S"U(       + _S#U/_S$U0_6u  pM     U RT                  S%   USS2SS2S4   -   n1U1SS2SS2S4   U1SS2SS2S4   n3n2U RW                  U5      nUSU3-   -  U2-   nU RY                  U5      n4U RZ                  S%   U!SS2SS2S4   -   n5U5SS2SS2S4   U5SS2SS2S4   n7n6U R]                  U5      nUSU7-   -  U6-   nU R_                  U5      n8U(       d  U4U84$ [a        U4U8S&9$ )(a  
Forward pass for LTX-2.0 audiovisual video transformer.

Args:
    hidden_states (`torch.Tensor`):
        Input patchified video latents of shape `(batch_size, num_video_tokens, in_channels)`.
    audio_hidden_states (`torch.Tensor`):
        Input patchified audio latents of shape `(batch_size, num_audio_tokens, audio_in_channels)`.
    encoder_hidden_states (`torch.Tensor`):
        Input video text embeddings of shape `(batch_size, text_seq_len, self.config.caption_channels)`.
    audio_encoder_hidden_states (`torch.Tensor`):
        Input audio text embeddings of shape `(batch_size, text_seq_len, self.config.caption_channels)`.
    timestep (`torch.Tensor`):
        Input timestep of shape `(batch_size, num_video_tokens)`. These should already be scaled by
        `self.config.timestep_scale_multiplier`.
    audio_timestep (`torch.Tensor`, *optional*):
        Input timestep of shape `(batch_size,)` or `(batch_size, num_audio_tokens)` for audio modulation
        params. This is only used by certain pipelines such as the I2V pipeline.
    sigma (`torch.Tensor`, *optional*):
        Input scaled timestep of shape (batch_size,). Used for video prompt cross attention modulation in
        models such as LTX-2.3.
    audio_sigma (`torch.Tensor`, *optional*):
        Input scaled timestep of shape (batch_size,). Used for audio prompt cross attention modulation in
        models such as LTX-2.3. If `sigma` is supplied but `audio_sigma` is not, `audio_sigma` will be set to
        the provided `sigma` value.
    encoder_attention_mask (`torch.Tensor`, *optional*):
        Optional multiplicative text attention mask of shape `(batch_size, text_seq_len)`.
    audio_encoder_attention_mask (`torch.Tensor`, *optional*):
        Optional multiplicative text attention mask of shape `(batch_size, text_seq_len)` for audio modeling.
    num_frames (`int`, *optional*):
        The number of latent video frames. Used if calculating the video coordinates for RoPE.
    height (`int`, *optional*):
        The latent video height. Used if calculating the video coordinates for RoPE.
    width (`int`, *optional*):
        The latent video width. Used if calculating the video coordinates for RoPE.
    fps: (`float`, *optional*, defaults to `24.0`):
        The desired frames per second of the generated video. Used if calculating the video coordinates for
        RoPE.
    audio_num_frames: (`int`, *optional*):
        The number of latent audio frames. Used if calculating the audio coordinates for RoPE.
    video_coords (`torch.Tensor`, *optional*):
        The video coordinates to be used when calculating the rotary positional embeddings (RoPE) of shape
        `(batch_size, 3, num_video_tokens, 2)`. If not supplied, this will be calculated inside `forward`.
    audio_coords (`torch.Tensor`, *optional*):
        The audio coordinates to be used when calculating the rotary positional embeddings (RoPE) of shape
        `(batch_size, 1, num_audio_tokens, 2)`. If not supplied, this will be calculated inside `forward`.
    isolate_modalities (`bool`, *optional*, defaults to `False`):
        Whether to isolate each modality by turning off cross-modality (audio-to-video and video-to-audio)
        cross attention (for all blocks). Use for modality guidance in LTX-2.3.
    spatio_temporal_guidance_blocks (`list[int]`, *optional*, defaults to `None`):
        The transformer block indices at which to apply spatio-temporal guidance (STG), which shortcuts the
        self-attention operations by simply using the values rather than the full scaled dot-product attention
        (SDPA) operation. If `None` or empty, STG will not be applied to any block.
    perturbation_mask (`torch.Tensor`, *optional*):
        Perturbation mask for STG of shape `(batch_size,)` or `(batch_size, 1, 1)`. Should be 0 at batch
        elements where STG should be applied and 1 elsewhere. If STG is being used but `peturbation_mask` is
        not supplied, will default to applying STG (perturbing) all batch elements.
    use_cross_timestep (`bool` *optional*, defaults to `False`):
        Whether to use the cross modality (audio is the cross modality of video, and vice versa) sigma when
        calculating the cross attention modulation parameters. `True` is the newer (e.g. LTX-2.3) behavior;
        `False` is the legacy LTX-2.0 behavior.
    attention_kwargs (`dict[str, Any]`, *optional*):
        Optional dict of keyword args to be passed to the attention processor.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether to return a dict-like structured output of type `AudioVisualModelOutput` or a tuple.

Returns:
    `AudioVisualModelOutput` or `tuple`:
        If `return_dict` is `True`, returns a structured output of type `AudioVisualModelOutput`, otherwise a
        `tuple` is returned where the first element is the denoised video latent patch sequence and the second
        element is the denoised audio latent patch sequence.
Nr   r4   g     r   )r  r  rx   r   Fr   r.  r   r/  r%  r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  r   r   r'  )rR   rS   rT   )1r8   r'   r(   r=   rD  r  r  r(  r  r  r  r  r  r  configr  r
  r  r%   r   r  r  r  r  r  r  r9   r  r  r&  r  r  r   r#   zerosr   r   	enumerater   is_grad_enabledr%  _gradient_checkpointing_funcr  r"  r  r  r#  r$  rP   )9rn   r   r.  r   r/  rq   r-  r.  r/  r;  r<  r  r  r  r  r0  r1  r  r2  r3  r   r4  r,  r5  rs   r7  r8  video_cross_attn_rotary_embaudio_cross_attn_rotary_emb%timestep_cross_attn_gate_scale_factorr%  ry   r0  audio_embedded_timestepr5  rD   r6  video_ca_timestepvideo_cross_attn_scale_shiftvideo_cross_attn_a2v_gateaudio_ca_timestepaudio_cross_attn_scale_shiftaudio_cross_attn_v2a_gater   
stg_blocks	block_idxblockblock_perturbation_maskblock_all_perturbedscale_shift_valuesr  scaleoutputaudio_scale_shift_valuesaudio_shiftaudio_scaleaudio_outputs9                                                            r/   rz   #LTX2VideoTransformer3DModel.forward)  s		   H ,:+E8%0%<k% "-2H2M2MQR2R&'*@*C*CMDWDW*X&X\d%d"%;%E%Ea%H"'38T8Y8Y]^8^,-0L0O0OPcPiPi0j,jnv+v(+G+Q+QRS+T("''*
 9999J7K7KQT : L ????,.H.HL  99\-:N:N9O??<@S@Z@Z?[&*&:&:<1Q3PQ	;R[h[o[o&:&p#&*&@&@AaC#,?,F,F 'A '
#
 ]3"001DE KK<<t{{?d?dd 	. #'//!&,, #2 #

 yyR27-22:rCTCYCYZ\C]^.2.C.C""$!,22 /D /
+
+
  __ZZ__R5HI
"9">">z2OfOkOklnOo"p!!!..J]EXEX / NK $(#:#:##%*K^KdKd $; $ q &**:r;;K;KB;OPK 1 6 6z2GXG]G]^`Ga b.22K+ 6HK//1XM]M]M_*.*N*N!&,, +O +
'$a
 (,'H'H EE!&,, (I (
$!1
 (D'H'H8>>rB(
$ %>$B$B:rSlSrSrsuSv$w!/AEMMO~G]G]G_*.*N*N!,22 +O +
'$a
 (,'H'H EE!,22 (I (
$!1
 (D'H'H8>>rB(
$ %>$B$B:rSlSrSrsuSv$w! ;;,,$($;$;<Q$R!$9$>$>z2}OaOabdOe$f!*.*G*GHc*d'*E*J*JB 3 8 8 <+'
 +J*OR'./!38I8Q %ZM :(->-C-Cq-H 1!T4- @=N=Z		"3q"89`e89
 )$*A*A BIu;D
;R&7X\#3<
3J-PU$$&&4+F+F595V5V!')/00--%$$//*0****+'7622< 6; 6"/6(;6 +@6 1L	6
 6  *6 )E6 /K6 ";6 (A6 !,6 '86 &66 &66 )D6  )D!6" ,B#6$ 2N%6& )-'6( /3)6* .2+6, .2-6. 1C,B/60 1C,B162 '>364 #65622G !CB "33J?BSTUWXZ^T^B__)!Q'24Fq!Qw4Oum4%U3e;}-#'#?#?
#KNefgijlpfpNq#q #;Aq!G#DF^_`bcef_fFg["112EF1Q_ES**+>?L))%V,OOr1   )r  r#  r  r$  r  r  r  r  r  r  r  r  r  r  r  r%  r"  r  r  r  r  r  r  r  r   )*   rS  r4   r4   r  rS  i   r  r  r  r  FFrS  rS  r4   r4   r  r   r  r3   r  r  r  FF0   r  r   Fr   i   TTr  Tr4     rU  r   TF)NNNNNNNNr  NNNFNNFNT)rU   rV   rW   rX   rY    _supports_gradient_checkpointing _skip_layerwise_casting_patterns_repeated_blocksr   r   _cp_planr   r}   r   r~   r   r&   rg   r   r#   r   
LongTensorlistr   r   rz   r[   r   r   s   @r/   r  r  &  s   4 (,$(.x$34 2AQ]bc%9AUVej%k&:QVWfk&l
 $aqtT#aqtT
 *QaHH  #&#%"%#'2=!# $!$), !"#)+(*)-"#')#( #!&%*/.(- $##'#&*)-48&"$WS,S, DjS, 	S,
 S, !S,  S, !S, !c3/S, S, S, S, S, S, S,   $J!S," #S,$  %S,& $''S,( #&)S,* $'+S,,  -S,. "%/S,0 !1S,2 3S,4 5S,6 #7S,8 9S,: ;S,< =S,> "&?S,@ AS,B CS,D ES,F !GS,H IS,J  $KS,L MS,N $'OS,P /2QS,R SS,V WS,X 
YS, S,j () 37%)+/6:<@!%! '+,0,0#(<@15#(26 1rP||rP #\\rP  %||	rP
 &+\\rP ""rP ((4/rP ||d"rP \\D(rP !&t 3rP ',llT&9rP $JrP d
rP TzrP rP  *!rP" llT)#rP$ llT)%rP& !'rP( *.cT)9)rP* !<<$.+rP, !-rP. sCx.4//rP0 1rP2 
3rP *rPr1   r  )5r   dataclassesr   typingr   r#   torch.nnri   configuration_utilsr   r   loadersr   r	   utilsr
   r   r   r   _modeling_parallelr   r   	attentionr   r   r   attention_dispatchr   cache_utilsr   
embeddingsr   r   modeling_utilsr   normalizationr   
get_loggerrU   r   r   r   r0   rN   rP   Moduler]   r   r   r   r   r  r  rT   r1   r/   <module>rk     sa     !    B ? L L L I I 6 $ ] ' # 
		H	%ELL u||U\\?Y9Z _d_k_k &ell &5u||9S3T &Y^YeYe &R !Z ! ! &LRYY &LRS Sl` `FOEHHOO%9 Od|2		 |2~H$299 H$VvP^-CEUWavPr1   