
    
3j6              	       x   S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJ	r	  SSKJ
r
  SSKJrJrJr  S	rS
rS rS\R$                  S\S\S\R$                  4S jrS\S\S\4S jrS\S\S\
R0                  4S jr S'S\R$                  S\R$                  S\S\R$                  4S jjrS\R$                  S\R$                  S\R$                  4S jr " S S\
R8                  5      r " S S\
R8                  5      r " S  S!\
R8                  5      r " S" S#\
R8                  5      r S$ r!S% r"\RF                  S4S& jr$g)(a?  Text Embeddings Pipeline: Feature Extractors + Embeddings Processors.

Provides the 3-block Gemma text encoder pipeline:
  1. Gemma Model (external) -- runs the LLM, gets hidden states
  2. Feature Extractor (V1/V2) -- normalization + projection
  3. Embeddings Processor (Video / AV) -- wraps Embeddings1DConnector(s)
    N)Path)load_torch_file)	rearrange)nn   )Embeddings1DConnectorload_audio_embeddings_connectorload_video_embeddings_connectorzmodel.diffusion_model.ztext_embedding_projection.c                     US:  R                  [        R                  5      nUR                  U R                  S   U R                  S   S5      nX-  U4$ )zAConvert additive mask to binary [B, seq, 1] and zero out padding.ư>r   r   )totorchint64reshapeshape)encodedencoded_maskbinary_masks      X/home/wildlama/comfy/ComfyUI/custom_nodes/ComfyUI-LTXVideo/text_embeddings_connectors.py_to_binary_maskr   !   sO    (*..u{{;K%%gmmA&6a8H!LK +--    x
target_dim
source_dimreturnc                 8    U [         R                  " X-  5      -  $ )z9Rescale normalization: x * sqrt(target_dim / source_dim).)mathsqrt)r   r   r   s      r   _rescale_normr   (   s    tyy0111r   sdprefixc                     U R                  5        VVs0 s H,  u  p#UR                  U5      (       d  M  U[        U5      S U_M.     snn$ s  snnf )z0Extract keys with *prefix* and strip the prefix.N)items
startswithlen)r    r!   kvs       r   
_filter_sdr(   -   s@    ,.HHJOJDA!,,v:NAc&kmaJOOOs
   AAmodalityc                     U S3n[          U S3n[          U S3nX   nUR                  u  px[        R                  " XXP;   S9n	U	R	                  [        U [          U S35      5        U	R                  US9$ )zLoad an aggregate_embed Linear from the state dict.

Args:
    sd: Full checkpoint state dict.
    modality: ``"video"`` or ``"audio"``.
    dtype: Target dtype.

Returns ``linear`` module.
_aggregate_embedz.weightz.biasbias.dtype_PREFIX_TEXT_PROJr   r   Linearload_state_dictr(   r   )
r    r)   r0   name
weight_keybias_keyweightout_featuresembedding_dimlinears
             r   _load_aggregate_embedr<   2   s     Z'(D%&tfG4J#$TF%0H^F"(,,LYY}HF
:b->,?vQ*GHI9959!!r   encoded_textsequence_lengthspadding_sidec                    U R                   u  p4pVU R                  n[        R                  " XGS9SSS24   nUS:X  a  XSS2S4   :  n	O$US:X  a  XASS2S4   -
  n
X:  n	O[	        SU 35      e[        U	S5      n	U R                  U	) S5      nX-  R                  USSS5      nUR                  S	S
S9US-   -  nU R                  U	) [        S5      5      R                  S	S
S9nU R                  U	) [        S5      5      R                  S	S
S9nX-
  nSX-
  -  US-   -  nUR                  X4S5      n[        U	S5      R                  SSXV-  5      nUR                  U) S5      nU$ )aU  
Normalize a 4D tensor [B, T, D, L] per sample and per layer, using sequence_lengths to mask.
Returns [B, T,  D * L] tensor with original padding preserved.

Args:
    encoded_text: 4D tensor [B, T, D, L]
    sequence_lengths: 1D tensor [B] with actual sequence lengths
    padding_side: "left" or "right" to indicate which side has padding
)deviceNrightleftz,padding_side must be 'left' or 'right', got zb t -> b t 1 1g        r   )r      Tdimkeepdimr   infz-inf   zb t 1 1 -> b t 1)r   rA   r   arange
ValueErrorr   masked_fillviewsumfloataminamaxr   expand)r=   r>   r?   BTDLrA   token_indicesmaskstart_indicesmaskeddenommeanx_minx_maxrange_normedmask_flatteneds                      r   norm_and_concat_padded_batchrc   K   s    ##JA!  F LL247;Mw488		QW55-G~VWWT+,D %%teS1F!''1a3E::&$:/54<@D $$dUE%L9>>6SW>XE$$dUE&M:??D @ E ]F ,%&&4-8F ^^A"%F t%78??BNN5FMr   attention_maskc                 J   U R                   u  p#pE[        R                  " U S-  SSS9nU [        R                  " US-   5      -  nUR	                  X#XE-  5      nUR                  5       R                  S5      n[        R                  " X[        R                  " U5      5      nU$ )a  Per-token RMSNorm normalization.

Each token is normalized independently using RMSNorm over its D
dimension.  This is naturally causal and supports packing since each
token's normalization is self-contained.

Args:
    encoded_text: 4D tensor [B, T, D, L]
    attention_mask: [B, T] binary mask (1=real, 0=pad)

Returns:
    [B, T, D*L] normalized and flattened tensor with padding zeroed out.
rD   TrE   r   rJ   )	r   r   r]   rsqrtr   bool	unsqueezewhere
zeros_like)	r=   rd   rT   rU   rV   rW   variancera   mask_3ds	            r   norm_and_concat_per_token_rmsrm      s    " ##JA!zz,/q$?HEKK488F^^A!%(F!!#--b1G[[%*:*:6*BCFMr   c                   Z   ^  \ rS rSrSrSS\R                  S\4U 4S jjjrS	S jr	Sr
U =r$ )
FeatureExtractorV1   z019B: per-segment norm -> aggregate_embed -> 3840aggregate_embedis_avc                 :   > [         TU ]  5         Xl        X l        g N)super__init__rq   rr   )selfrq   rr   	__class__s      r   rv   FeatureExtractorV1.__init__   s    .
r   c                     UR                  SS9n[        XU5      nUR                  UR                  5      nU R	                  U5      nU R
                  (       a  XfS.$ SU0$ )NrJ   rF   )videoaudior|   )rO   rc   r   r0   rq   rr   )rw   all_layer_hiddensrd   r?   r>   ra   featuress          r   forwardFeatureExtractorV1.forward   sg    )--"-5-
 ,223''/::%99""r   )rq   rr   )FrC   )__name__
__module____qualname____firstlineno____doc__r   Modulerg   rv   r   __static_attributes____classcell__rx   s   @r   ro   ro      s,    :		 $  
	# 	#r   ro   c                   z   ^  \ rS rSrSr S
S\R                  S\S\R                  S-  4U 4S jjjrSS jr	S	r
U =r$ )FeatureExtractorV2   z822B: per-token RMS norm -> rescale -> aggregate_embed(s)Nvideo_aggregate_embedr:   audio_aggregate_embedc                 F   > [         TU ]  5         Xl        X0l        X l        g rt   )ru   rv   r   r   r:   )rw   r   r:   r   rx   s       r   rv   FeatureExtractorV2.__init__   s"     	%:"%:"*r   c                 b   [        X5      nUR                  UR                  5      nU R                  R                  nSU R                  [        XEU R                  5      5      0nU R                  b>  U R                  R                  nU R                  [        XGU R                  5      5      US'   U$ )Nr|   r}   )rm   r   r0   r   r9   r   r:   r   )rw   r~   rd   r?   ra   v_dimresulta_dims           r   r   FeatureExtractorV2.forward   s    ./@Q,223**77T//fT-?-?@

 %%1..;;E"88fT-?-?@F7O r   )r   r:   r   rt   r   )r   r   r   r   r   r   r3   intrv   r   r   r   r   s   @r   r   r      sH    B 37		+!yy	+ 	+  "yy4/		+ 	+ r   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )VideoEmbeddingsProcessor   z2Video-only embeddings processor: single connector.video_connectorc                 .   > [         TU ]  5         Xl        g rt   )ru   rv   r   )rw   r   rx   s     r   rv   !VideoEmbeddingsProcessor.__init__   s    .r   c                 n    U R                  US   U5      u  p4[        X45      u  p5X5R                  S5      4$ )Nr|   rJ   )r   r   squeeze)rw   r   rd   r   rY   r   s         r   create_embeddings*VideoEmbeddingsProcessor.create_embeddings   s<    ,,Xg->O.w=++B///r   )r   
r   r   r   r   r   r   rv   r   r   r   r   s   @r   r   r      s    </(= /0 0r   r   c                   >   ^  \ rS rSrSrS\S\4U 4S jjrS rSrU =r	$ )AVEmbeddingsProcessor   z;Audio-video embeddings processor: dual connectors + concat.r   audio_connectorc                 :   > [         TU ]  5         Xl        X l        g rt   )ru   rv   r   r   )rw   r   r   rx   s      r   rv   AVEmbeddingsProcessor.__init__   s    
 	..r   c                     U R                  US   U5      u  p4[        X45      u  p5U R                  US   U5      u  pg[        R                  " X6/SS9nX5R                  S5      4$ )Nr|   r}   rJ   r{   )r   r   r   r   catr   )rw   r   rd   r   rY   r   audio_encoded_s           r   r   'AVEmbeddingsProcessor.create_embeddings   si    ,,Xg->O.w=//0A>R))W4"=++B///r   )r   r   r   r   s   @r   r   r      s'    E/./ //0 0r   r   c                    [          S3nX ;  a  gX   nUR                  S   UR                  S   pT[          S3nX`;   n[        R                  " XEUS9nUR	                  [        U [          S35      5        UR                  US9$ )	z_Load the single aggregate_embed (V1 models).

Returns:
    nn.Linear or None if key not found.
aggregate_embed.weightNr   r   aggregate_embed.biasr,   aggregate_embed.r/   r1   )	r    r0   keyr8   in_featuresr9   r7   has_biasr;   s	            r   _load_single_aggregate_embedr     s     5
6C
}WF &Qa#$$89H~HYY{x@F
:b->,??O*PQR9959!!r   c           	         [        U 5      n U R                  5       (       d  g[        [        U 5      SS9nSU;  a  [	        SU  S35      eUS   nUR
                  S   UR
                  S   pTS	nXb;   n[        R                  " XEUS
9nUR                  UR                  5        V	V
s0 s H/  u  pU	R                  S5      (       d  M  U	R                  S5      U
_M1     sn
n	5        UR                  US9$ s  sn
n	f )zLoad aggregate_embed from a standalone checkpoint file (legacy fallback).

Args:
    path: Path to safetensors file containing aggregate_embed weights.
    dtype: Target dtype.

Returns:
    nn.Linear or None if file does not exist.
NFreturn_metadatar   zCheckpoint z+ does not contain 'aggregate_embed.weight'.r   r   r   r,   r   r/   )r   existsr   strrL   r   r   r3   r4   r#   r$   removeprefixr   )pathr0   	loaded_sdr8   r   r9   r7   r   r;   r&   r'   s              r   &_load_single_aggregate_embed_from_filer     s     :D;;==D	5AIy0$JK
 	
 /0F &Qa%H$HYY{x@F
 ")	
)||./ 2ANN-.1)	
 9959!!	
s   %C2
C2
c                    [        [        U 5      SS9u  p4[        R                  " UR	                  SS5      5      nUR	                  S0 5      n[
         S3U;   n[         S3U;   n[        X6U5      n	U(       a  [        X6U5      n
[        X5      nO[        U	5      nU(       a  SSSSS	S
.nUR                  5        H-  u  pUR	                  U5      nX:X  a  M   SU SU< SU< 35       e   [        USU5      nU(       a  [        USU5      OSnUR	                  SS5      n[        UUU5      U4$ [        X15      nUc  Ub  [        X!5      n[!        UUS9U4$ )a  Load feature extractor + embeddings processor from LTX-V checkpoint.

Auto-detects model variant (19B/22B, video-only/AV).

Args:
    ltxv_path: Path to the LTX-V checkpoint.
    dtype: Target dtype for loaded modules.
    fallback_proj_path: Optional path to a standalone ``proj_linear.safetensors``
        file.  Used as a legacy fallback when the aggregate_embed is not
        stored inside the LTX-V checkpoint (V1 models).

Returns:
    (feature_extractor, embeddings_processor)
Tr   configz{}transformerz audio_adaln_single.linear.weightzvideo_aggregate_embed.weightFper_token_rms)caption_projection_first_linearcaption_proj_input_norm caption_projection_second_linearcaption_proj_before_connectortext_encoder_norm_typez,Unexpected config for dual-aggregate model: =z, expected r|   r}   Nprompt_embedding_dimi   )rr   )r   r   jsonloadsget_PREFIX_BASEr2   r
   r	   r   r   r#   r<   r   r   r   ro   )	ltxv_pathr0   fallback_proj_pathr    metadatar   transformer_configrr   has_dual_aggregater   r   	processor	_expectedr   expected_valactual	video_agg	audio_aggr:   rq   s                       r   load_text_embeddings_pipeliner   <  s   " #3y>4HLBZZXt45FM26n<=CE-..JKrQ 6beTO 9"RWX)/K	,_=	  05',05-1&5
	 "+!2C'++C0F) >%q
+l-=?) "3 *"gu=	AF)"gu=D	*../EtL"9mYGSS22=O#5#A@
 e<iHHr   )rB   )%r   r   r   pathlibr   r   comfy.utilsr   einopsr   r   embeddings_connectorr   r	   r
   r   r2   r   Tensorr   r   dictr   r(   r3   r<   rc   rm   r   ro   r   r   r   r   r   bfloat16r    r   r   <module>r      sf       '    (0 .2U\\ 2s 2 2 2
P4 P P P
"d "c "RYY "8  7,,7ll7 7 \\	7t,,LL \\@# #( H
0ryy 
00BII 02"&"N ^^<Ir   