
    
3jU                        S SK r S SKrS SKJr  S SKJs  Jr  SSKJrJ	r	  SSK
Jr  SSKJr  SSKJr  SSKJrJr     SS\R&                  S	\R&                  S
\\R*                  -  S\S\S\4S jjrSS\R&                  S\S\R&                  4S jjr " S S\R4                  5      r " S S\R4                  5      r " S S\R4                  5      r " S S\\\5      rg)    N   )ConfigMixinregister_to_config)PeftAdapterMixin)FeedForward)
ModelMixin)LTX2AttentionLTX2AudioVideoAttnProcessortext_hidden_statessequence_lengthsdevicepadding_sidescale_factorepsc                     U R                   u  pgpU R                  n
[        R                  " XrS9R	                  S5      nUS:X  a  XSS2S4   :  nO$US:X  a  XqSS2S4   -
  nX:  nO[        SU 35      eUSS2SS2SS4   nU R                  U) S5      nX-  R                  USSS5      nUR                  S	S
S9X-   -  nU R                  U) [        S5      5      R                  S	S
S9nU R                  U) [        S5      5      R                  S	S
S9nU U-
  UU-
  U-   -  nUU-  nUR                  S5      nUR                  S5      R                  SSX-  5      nUR                  U) S5      nUR                  U
S9nU$ )a}  
Performs per-batch per-layer normalization using a masked mean and range on per-layer text encoder hidden_states.
Respects the padding of the hidden states.

Args:
    text_hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, hidden_dim, num_layers)`):
        Per-layer hidden_states from a text encoder (e.g. `Gemma3ForConditionalGeneration`).
    sequence_lengths (`torch.Tensor of shape `(batch_size,)`):
        The number of valid (non-padded) tokens for each batch instance.
    device: (`str` or `torch.device`, *optional*):
        torch device to place the resulting embeddings on
    padding_side: (`str`, *optional*, defaults to `"left"`):
        Whether the text tokenizer performs padding on the `"left"` or `"right"`.
    scale_factor (`int`, *optional*, defaults to `8`):
        Scaling factor to multiply the normalized hidden states by.
    eps (`float`, *optional*, defaults to `1e-6`):
        A small positive value for numerical stability when performing normalization.

Returns:
    `torch.Tensor` of shape `(batch_size, seq_len, hidden_dim * num_layers)`:
        Normed and flattened text encoder hidden states.
r   r   rightNleftz,padding_side must be 'left' or 'right', got            )r      Tdimkeepdiminfz-infr   dtype)shaper   torcharange	unsqueeze
ValueErrormasked_fillviewsumfloataminamaxflattensqueezeexpandto)r   r   r   r   r   r   
batch_sizeseq_len
hidden_dim
num_layersoriginal_dtypetoken_indicesmaskstart_indicesmasked_text_hidden_statesnum_valid_positionsmasked_meanx_minx_maxnormalized_hidden_states	mask_flats                        ]/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/pipelines/ltx2/connectors.pyper_layer_masked_mean_normr>      s   < 3E2J2J/J'--N LL8BB1EMw488		1d7";;-G~VWW1dD !D !3 > >uc J+8>>z1aQRS+//FD/IM`MfgK **D5%,?DDY]D^E**D5%-@EE&Z^E_E !3[ @UU]UXEXY7,F  8??BR ''B
0GHI7CCYJPST7:::P##    text_encoder_hidden_statesreturnc                 n    [         R                  " U S-  SSS9nU [         R                  " X!-   5      -  nU$ )Nr   Tr   )r    meanrsqrt)r@   r   variancenorm_text_encoder_hidden_statess       r=   per_token_rms_normrG   Q   s6    zz4a7QMH&@5;;x~C^&^#**r?   c                      ^  \ rS rSrSr     SS\S\S\S\S\S\4U 4S	 jjjr	S
\S\S\\
R                  -  S\\
R                  \
R                  4   4S jrSrU =r$ )LTX2RotaryPosEmbed1dW   zQ
1D rotary positional embeddings (RoPE) for the LTX 2.0 text encoder connectors.
r   base_seq_lenthetadouble_precision	rope_typenum_attention_headsc                    > [         TU ]  5         US;  a  [        SU< S35      eXl        X l        X0l        X@l        XPl        X`l        g )N)interleavedsplitz
rope_type=z9 not supported. Choose between 'interleaved' and 'split'.)	super__init__r#   r   rK   rL   rM   rN   rO   )selfr   rK   rL   rM   rN   rO   	__class__s          r=   rT   LTX2RotaryPosEmbed1d.__init__\   sO     	44
	|+deff(
 0"#6 r?   r.   posr   rA   c                    [         R                  " U[         R                  US9nX@R                  -  nUR	                  S5      R                  US5      nSnU R                  (       a  [         R                  O[         R                  n[         R                  " U R                  [         R                  " SSU R                  U-  XsS95      nU[         R                  -  S-  R                  [         R                  S	9n	UR	                  S
5      S-  S-
  U	-  n	U R                  S:X  a  U	R                  5       R!                  SS
S9n
U	R#                  5       R!                  SS
S9nU R                  U-  S:w  a  [         R$                  " U
S S 2S S 2S U R                  U-  24   5      n[         R&                  " US S 2S S 2S U R                  U-  24   5      n[         R(                  " X/S
S9n
[         R(                  " X/S
S9nX4$ U R                  S:X  GaD  U R                  S-  nU	R*                  S
   nX-
  nU	R                  5       nU	R#                  5       nUS:w  ar  [         R$                  " US S 2S S 2S U24   5      n[         R&                  " US S 2S S 2S U24   5      n[         R,                  " UU/S
S9n[         R,                  " UU/S
S9nUR*                  S   nUR*                  S   nUR/                  UUU R0                  S
5      nUR/                  UUU R0                  S
5      n[         R2                  " USS5      n
[         R2                  " USS5      nW
W4$ )N)r   r   r   r   r   r         ?)startendstepsr   r          @r   r   rQ   r   rR   )axis)r    r!   float32rK   r"   repeatrM   float64powrL   linspacer   pir-   rN   cosrepeat_interleavesin	ones_like
zeros_likecatr   concatenatereshaperO   swapaxes)rU   r.   rX   r   grid_1dgridnum_rope_elemsfreqs_dtypepow_indicesfreqs	cos_freqs	sin_freqscos_paddingsin_paddingexpected_freqscurrent_freqspad_sizecos_freqsin_freqbts                        r=   forwardLTX2RotaryPosEmbed1d.forwardp   s    ,,s%--G---  #**:q9 '+'<'<emm%--iiJJNN#TXX5OWbr
 uxx'#-111F #a'!+u4 >>]*		55aR5@I		55aR5@Ixx.(A-#ooi1>Y>@Y>Y8Y.Z[#..yA?ZNAZ?Z9Z/[\!II{&>BG	!II{&>BG	4 ##1 ^^w&!XX]N!KKOM%5Hyy{Hyy{H1}#oohq!YhY.GH#..x1ixi/HI ,,k8-D2N ,,k8-D2N q!Aq!A''1d.F.FKH''1d.F.FKHxA6IxA6I)##r?   )rK   r   rM   rO   rN   rL   )        @TrQ       )__name__
__module____qualname____firstlineno____doc__intr'   boolstrrT   r    r   tupleTensorr   __static_attributes____classcell__rV   s   @r=   rI   rI   W   s     !!%&#%77 7 	7
 7 7 !7 7(<$<$ <$ ell"	<$
 
u||U\\)	*<$ <$r?   rI   c                      ^  \ rS rSr    SS\S\S\S\S\S\S\4U 4S	 jjjr  SS\	R                  S\	R                  S
-  S\	R                  S
-  S\	R                  4S jjrSrU =r$ )LTX2TransformerBlock1d   r   rO   attention_head_dimactivation_fnr   rN   apply_gated_attentionc           
        > [         TU ]  5         [        R                  R	                  XSS9U l        [        UUUUUU[        5       S9U l        [        R                  R	                  XSS9U l	        [        XS9U l        g )NFr   elementwise_affine)	query_dimheadskv_headsdim_headrN   r   	processor)r   )rS   rT   r    nnRMSNormnorm1r	   r
   attn1norm2r   ff)	rU   r   rO   r   r   r   rN   r   rV   s	           r=   rT   LTX2TransformerBlock1d.__init__   sx     	XX%%cu%M
"%('"713

 XX%%cu%M
c?r?   Nhidden_statesattention_mask
rotary_embrA   c                     U R                  U5      nU R                  XBUS9nX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r   query_rotary_emb)r   r   r   r   )rU   r   r   r   norm_hidden_statesattn_hidden_statesff_hidden_statess          r=   r   LTX2TransformerBlock1d.forward   s]     "ZZ6!ZZ(:lvZw%:!ZZ677#56%8r?   )r   r   r   r   )zgelu-approximateư>rQ   F)NN)r   r   r   r   r   r   r'   r   rT   r    r   r   r   r   r   s   @r=   r   r      s     0&&+@@ !@  	@
 @ @ @  $@ @: /3*.	|| t+ LL4'	
 
 r?   r   c                     ^  \ rS rSrSrSr           SS\S\S\S\S-  S	\S
\S\S\S\S\	S\4U 4S jjjr
  SS\R                  S\R                  S-  S\S\\R                  \R                  4   4S jjrSrU =r$ )LTX2ConnectorTransformer1d   z
A 1D sequence transformer for modalities such as text.

In LTX 2.0, this is used to process the text encoder hidden states for each of the video and audio streams.
TrO   r   r1   num_learnable_registersNrope_base_seq_len
rope_thetarope_double_precisionr   causal_temporal_positioningrN   gated_attentionc                 T  > [         TU ]  5         Xl        X-  U l        Xl        X@l        S U l        UbJ  [        R                  " X@R                  5      S-  S-
  n[        R                  R                  U5      U l        [        U R                  UUUU
US9U l        [        R                  R                  [        U5       Vs/ s H  n[        U R                  UUU
US9PM     sn5      U l        [        R                  R#                  U R                  USS9U l        SU l        g s  snf )Nr^   rZ   )rK   rL   rM   rN   rO   )r   rO   r   rN   r   Fr   )rS   rT   rO   	inner_dimr   r   learnable_registersr    randr   	ParameterrI   rope
ModuleListranger   transformer_blocksr   norm_outgradient_checkpointing)rU   rO   r   r1   r   r   r   r   r   r   rN   r   init_registers_rV   s                 r=   rT   #LTX2ConnectorTransformer1d.__init__   s    	#6 ,A+F('>$#' "."ZZ(?PSVVY\\N',xx'9'9.'ID$(NN*2 3
	 #((("5"5 z*	 +A '(;'9'*9 +	#
 ((SUZ([&+#	s   !D%r   r   attn_mask_binarize_thresholdrA   c                    UR                   u  pEnU R                  Gb  XPR                  -  S:w  a(  [        SUR                   S    SU R                   35      eXPR                  -  n[        R
                  " U R                  US45      nX#:  R                  5       n	U	R                  S:X  a   U	R                  S5      R                  S5      n	[        U5       V
s/ s H  oXU
   R                  5       S S 24   PM     nn
U Vs/ s H  oR                   S   PM     nnU Vs/ s H  oU-
  PM	     nn[        X5       VVs/ s H   u  nn[        R                  " USSSU4SS9PM"     nnn[        R                  " U Vs/ s H  oR                  S5      PM     snSS9n[        R                   " U	S/S9R                  S	5      nUU-  SU-
  U-  -   n[        R"                  " U5      nU R%                  XEUR&                  S
9nU R(                   HJ  n[        R*                  " 5       (       a&  U R,                  (       a  U R/                  UXU5      nMC  U" XUS9nML     U R1                  U5      nX4$ s  sn
f s  snf s  snf s  snnf s  snf )Nr   z$The `hidden_states` sequence length r   z: should be divisible by the number of learnable registers    )padvaluer_   )dimsr   r   )r   r   )r   r   r   r#   r    tiler   ndimr+   r   r   zipFr   rl   r"   fliprk   r   r   r   is_grad_enabledr   _gradient_checkpointing_funcr   )rU   r   r   r   r.   r/   r   num_register_repeats	registersbinary_attn_maskihidden_states_non_paddedxvalid_seq_lensvalid_seq_lenpad_lengthsppadded_hidden_statesflipped_maskr   blocks                        r=   r   "LTX2ConnectorTransformer1d.forward  s    "/!4!4
Q ##/555: :=;N;Nq;Q:R S//3/K/K.LN 
 $+.J.J#J 

4#;#;>RTU=VWI . NSSU$$)#3#;#;A#>#F#Fq#I afgqar'sar\]a!9L9Q9Q9SUV6V(War$'s2JK2JQggaj2JNKHVW}]2KW@CD\@j$@j1aaAq\3@j ! $ $)99FZ-[FZkk!nFZ-[ab#c  ::&6aSAKKBOL(+??1|CSW`B``M #--n=N YYz=;O;OYP
 ,,E$$&&4+F+F $ A A%hr s %m_i j	 - m4,,5 (tKW$ .\s   #I,I1I6='I;;J)	r   r   r   r   r   rO   r   r   r   )      r   r   r   r   Tr   FrQ   F)Ng     )r   r   r   r   r    _supports_gradient_checkpointingr   r'   r   r   rT   r    r   r   r   r   r   r   s   @r=   r   r      s    (,$ $&"%.1!%#&*,1& %1, 1,  1, 	1,
 "%t1, 1, 1,  $1, 1, &*1, 1, 1, 1,l /3.5	3-||3- t+3- ',	3-
 
u||U\\)	*3- 3-r?   r   c            ,       ^  ^  \ rS rSrSr\                     S!S\S\S\S\S\S\S	-  S
\S\S\S\S\S	-  S\S\S\S\S\S\	S\S\S\S\4*U 4S jjj5       r
  S"S\R                  S\R                  S\	S\S\\R                  \R                  \R                  4   4
S jjrS rU =r$ )#LTX2TextConnectorsiM  z}
Text connector stack used by LTX 2.0 to process the packed text encoder hidden states for both the video and audio
streams.
caption_channelstext_proj_in_factor#video_connector_num_attention_heads"video_connector_attention_head_dimvideo_connector_num_layers'video_connector_num_learnable_registersNvideo_gated_attn#audio_connector_num_attention_heads"audio_connector_attention_head_dimaudio_connector_num_layers'audio_connector_num_learnable_registersaudio_gated_attnconnector_rope_base_seq_lenr   r   r   rN   per_modality_projectionsvideo_hidden_dimaudio_hidden_dim	proj_biasc                 8  > [         TU ]  5         X-  nU(       a7  [        R                  " UUUS9U l        [        R                  " UUUS9U l        O[        R                  " UUUS9U l        [        UUUUUUUUUUS9
U l        [        UU	U
UUUUUUUS9
U l	        g )N)bias)
rO   r   r1   r   r   r   r   r   rN   r   )
rS   rT   r   Linearvideo_text_proj_inaudio_text_proj_intext_proj_inr   video_connectoraudio_connector)rU   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rN   r   r   r   r   text_encoder_dimrV   s                          r=   rT   LTX2TextConnectors.__init__S  s    2 	+A#&(ii0@BRYb&cD#&(ii0@BRYb&cD# "		*:<LS\ ]D9 CA1$K9!"7(C, 
  : CA1$K9!"7(C, 
r?   r@   r   r   r   rA   c                    UR                   S:X  a(  UR                  SU R                  R                  S45      nU R                  R                  (       Ga  [        U5      nUR                  SS5      nUR                  5       R                  S5      n[        R                  " Xe[        R                  " U5      5      n[        R                  " U R                  R                  U R                  R                  -  5      nXW-  n[        R                  " U R                  R                  U R                  R                  -  5      n	XY-  n
U R!                  U5      nU R#                  U
5      nO;UR%                  SS9n['        UUUR(                  UUS9nU R+                  U5      nUnUnUR,                  nUR/                  [        R0                  5      S-
  R/                  U5      nUR3                  UR4                  S   SSUR4                  S   5      nU[        R6                  " U5      R8                  -  nU R;                  UU5      u  nnUS:  R/                  [        R0                  5      nUR3                  UR4                  S   UR4                  S   S5      nUU-  nU R=                  UU5      u  nnUUUR?                  S5      4$ )	aK  
Given per-layer text encoder hidden_states, extracts features and runs per-modality connectors to get text
embeddings for the LTX-2.X DiT models.

Args:
    text_encoder_hidden_states (`torch.Tensor`)):
        Per-layer text encoder hidden_states. Can either be 4D with shape `(batch_size, seq_len,
        caption_channels, text_proj_in_factor) or 3D with the last two dimensions flattened.
    attention_mask (`torch.Tensor` of shape `(batch_size, seq_len)`):
        Multiplicative binary attention mask where 1s indicate unmasked positions and 0s indicate masked
        positions.
    padding_side (`str`, *optional*, defaults to `"left"`):
        The padding side used by the text encoder's text encoder (either `"left"` or `"right"`). Defaults to
        `"left"` as this is what the default Gemma3-12B text encoder uses. Only used if
        `per_modality_projections` is `False` (LTX-2.0 models).
    scale_factor (`int`, *optional*, defaults to `8`):
        Scale factor for masked mean/range normalization. Only used if `per_modality_projections` is `False`
        (LTX-2.0 models).
r   r   r   r_   )r   r   r   r   r   r   r   r   ) r   	unflattenconfigr   r   rG   r*   r   r"   r    whererk   mathsqrtr   r   r   r   r&   r>   r   r   r   r-   int64rn   r   finfomaxr   r  r+   )rU   r@   r   r   r   rF   	bool_maskvideo_scale_factorvideo_norm_text_embaudio_scale_factoraudio_norm_text_embvideo_text_emb_projaudio_text_emb_projr   text_emb_proj
text_dtypeadd_attn_maskvideo_text_embeddingvideo_attn_maskr   audio_text_embeddingr   s                         r=   r   LTX2TextConnectors.forward  s   4 &**a/)C)M)MaRVR]R]RnRnprQs)t&;;///.@A[.\+.M.U.UVWYZ.[+&++-77;I.3kkE<L<LMl<m/+
 "&4;;+G+G$++JfJf+f!g"A"V!%4;;+G+G$++JfJf+f!g"A"V #'"9"9:M"N"&"9"9:M"N  .11b19.H#=!1188))/+ !--.MNM"/"/ )..
(++EKK81<@@L'//0D0DQ0GBP^PdPdegPhi&Z)@)D)DD040D0DEXZg0h-o ,d266u{{C+334H4N4Nq4QSgSmSmnoSprst36FF"&"6"67JM"Za#%9;K;S;STV;WWWr?   )r  r   r   r   r   )i   1   r   r   r   r   Fr   r   r   r   Fr   r   TFrQ   Fr   i   F)r      )r   r   r   r   r   r   r   r   r'   r   rT   r    r   r   r   r   r   r   s   @r=   r   r   M  s   
  !%#%3525*+>A!&3525*+>A!&+/#&*,1&). $ $-7
7
 !7
 .1	7

 -07
 %(7
 25t7
 7
 .17
 -07
 %(7
 25t7
 7
 &)7
 7
   $!7
" &*#7
$ %7
& #''7
( )7
* +7
, -7
 7
z #OX$)LLOX OX 	OX
 OX 
u||U\\5<<7	8OX OXr?   r   )r   r  r   )r   )r  r    torch.nnr   torch.nn.functional
functionalr   configuration_utilsr   r   loadersr   models.attentionr   models.modeling_utilsr   $models.transformers.transformer_ltx2r	   r
   r   r   r   r   r'   r>   rG   ModulerI   r   r   r    r?   r=   <module>r'     s         B ' + / ^ @$@$ll@$ %,,@$ 	@$
 @$ 
@$F+5<< +e +W\WcWc +U$299 U$p)RYY )Xo- o-dOX%5{ OXr?   