
    
3j)             
          S SK r S SKrS SKJrJr  S SKrS SKrS SKJ	r	J
r
JrJr  SSKJrJr  SSKJrJr  SSKJrJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSK J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)J*r*  \" 5       (       a  S SK+J,s  J-r.  Sr/OSr/\R`                  " \15      r2Sr3    S#S\4S\4S\5S\54S jjr6    S$S\4S-  S\7\Rp                  -  S-  S\9\4   S-  S\9\5   S-  4S jjr:S%S  jr; " S! S"\#\\5      r<g)&    N)AnyCallable)Gemma3ForConditionalGenerationGemma3ProcessorGemmaTokenizerGemmaTokenizerFast   )MultiPipelineCallbacksPipelineCallback)FromSingleFileMixinLTX2LoraLoaderMixin)AutoencoderKLLTX2AudioAutoencoderKLLTX2Video)LTX2VideoTransformer3DModel)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )DiffusionPipeline   )LTX2TextConnectors)LTX2PipelineOutput)LTX2VocoderLTX2VocoderWithBWETFa$  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTX2Pipeline
        >>> from diffusers.pipelines.ltx2.export_utils import encode_video

        >>> pipe = LTX2Pipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
        >>> pipe.enable_model_cpu_offload()

        >>> prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

        >>> frame_rate = 24.0
        >>> video, audio = pipe(
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=768,
        ...     height=512,
        ...     num_frames=121,
        ...     frame_rate=frame_rate,
        ...     num_inference_steps=40,
        ...     guidance_scale=4.0,
        ...     output_type="np",
        ...     return_dict=False,
        ... )

        >>> encode_video(
        ...     video[0],
        ...     fps=frame_rate,
        ...     audio=audio[0].float().cpu(),
        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
        ...     output_path="video.mp4",
        ... )
        ```
base_seq_lenmax_seq_len
base_shift	max_shiftc                 4    XC-
  X!-
  -  nX5U-  -
  nX-  U-   nU$ N )image_seq_lenr   r   r    r!   mbmus           `/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/pipelines/ltx2/pipeline_ltx2.pycalculate_shiftr*   U   s3     
	K$>?A%%A		Q	BI    num_inference_stepsdevice	timestepssigmasc                    Ub  Ub  [        S5      eUb  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
X2S.UD6  U R                  n[        U5      nX14$ Ub  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
XBS.UD6  U R                  n[        U5      nX14$ U R                  " U4S	U0UD6  U R                  nX14$ )a  
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

Args:
    scheduler (`SchedulerMixin`):
        The scheduler to get timesteps from.
    num_inference_steps (`int`):
        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
        must be `None`.
    device (`str` or `torch.device`, *optional*):
        The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
    timesteps (`list[int]`, *optional*):
        Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
        `num_inference_steps` and `sigmas` must be `None`.
    sigmas (`list[float]`, *optional*):
        Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
        `num_inference_steps` and `timesteps` must be `None`.

Returns:
    `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
    second element is the number of inference steps.
zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr.   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r.   r-   r/   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r/   r-   r-   r$   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r.   len)	schedulerr,   r-   r.   r/   kwargsaccepts_timestepsaccept_sigmass           r)   retrieve_timestepsr>   c   s}   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	M)MfM''	!)n )) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	GvGG''	!)n )) 	 3MFMfM''	))r+   c                     UR                  [        [        SUR                  5      5      SS9nU R                  [        [        SU R                  5      5      SS9nXU-  -  nX%-  SU-
  U -  -   n U $ )a  
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891).

Args:
    noise_cfg (`torch.Tensor`):
        The predicted noise tensor for the guided diffusion process.
    noise_pred_text (`torch.Tensor`):
        The predicted noise tensor for the text-guided diffusion process.
    guidance_rescale (`float`, *optional*, defaults to 0.0):
        A rescale factor applied to the noise predictions.

Returns:
    noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
r   T)dimkeepdim)stdlistrangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaleds         r)   rescale_noise_cfgrL      s{    " ""tE!_5I5I,J'KUY"ZHmmU1inn%= >mMG#''9: 6!>N:NR[9[[Ir+   c            S       b  ^  \ rS rSrSrSrS/r/ SQr S}S\S\	S	\
S
\S\\-  S\S\S\\-  S\S-  4U 4S jjjr     S~S\\\   -  S\S\S\S\R0                  S-  S\R2                  S-  4S jjr           SS\\\   -  S\\\   -  S-  S\S\S\R8                  S-  S\R8                  S-  S\R8                  S-  S\R8                  S-  S\S\S\R0                  S-  S\R2                  S-  4S  jjr\R<                  " 5            SS\S#\S$\S%\S&\R>                  S-  S'\ \\!4   S-  S\\R0                  -  S-  4S( jj5       r"        SS) jr#\$SS*\R8                  S+\S,\S-\R8                  4S. jj5       r%\$ SS*\R8                  S/\S0\S1\S+\S,\S-\R8                  4S2 jj5       r&\$ SS*\R8                  S4\R8                  S5\R8                  S6\'S-\R8                  4
S7 jj5       r(\$ SS*\R8                  S4\R8                  S5\R8                  S6\'S-\R8                  4
S8 jj5       r)\$S*\R8                  S4\R8                  S5\R8                  4S9 j5       r*\$S*\R8                  S4\R8                  S5\R8                  4S: j5       r+\$ S}S*\R8                  S;\'\R8                  -  S&\R>                  S-  4S< jj5       r,\$ SS*\R8                  S+\S-  S,\S-  S-\R8                  4S= jj5       r-\$  SS*\R8                  S>\S?\S+\S-  S,\S-  S-\R8                  4S@ jj5       r.          SSD\SE\S0\S1\S/\S;\'S\R2                  S-  S\R0                  S-  S&\R>                  S-  S*\R8                  S-  S-\R8                  4SF jjr/         SSD\SE\SG\S?\S;\'S\R2                  S-  S\R0                  S-  S&\R>                  S-  S*\R8                  S-  S-\R8                  4SH jjr0 S}SI\R8                  SJ\R8                  SK\S\!S-  S-\R8                  4
SL jjr1 S}SI\R8                  SJ\R8                  SK\S\!S-  S-\R8                  4
SM jjr2\3SN 5       r4\3SO 5       r5\3SP 5       r6\3SQ 5       r7\3SR 5       r8\3SS 5       r9\3ST 5       r:\3SU 5       r;\3SV 5       r<\3SW 5       r=\3SX 5       r>\3SY 5       r?\3SZ 5       r@\3S[ 5       rA\3S\ 5       rB\R<                  " 5       \C" \D5      SSS!SASBS]S^SSS_SCS3SCSSSSSSCSSSSSSSSSCSS`SS!SS"SaSSSS*/S4(S\\\   -  S\\\   -  S-  S0\S1\S/\Sb\'Sc\Sd\\'   S-  Se\\   Sf\'Sg\'Sh\'Si\'Sj\'S-  Sk\'S-  Sl\'S-  Sm\'S-  Sn\\   S-  S;\'S\S&\R>                  \\R>                     -  S-  S*\R8                  S-  So\R8                  S-  S\R8                  S-  S\R8                  S-  S\R8                  S-  S\R8                  S-  Sp\'\\'   -  Sq\'\\'   -  S-  Sr\S#\S-  Ss\St\ \\!4   S-  Su\Sv\Sw\Sx\ \\!4   S-  Sy\E\\/S4   S-  Sz\\   S\4PS{ jj5       5       rFS|rGU =rH$ )LTX2Pipeline   a  
Pipeline for text-to-video generation.

Reference: https://github.com/Lightricks/LTX-Video

Args:
    transformer ([`LTXVideoTransformer3DModel`]):
        Conditional Transformer architecture to denoise the encoded video latents.
    scheduler ([`FlowMatchEulerDiscreteScheduler`]):
        A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    vae ([`AutoencoderKLLTXVideo`]):
        Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
    text_encoder ([`T5EncoderModel`]):
        [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
        the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
    tokenizer (`CLIPTokenizer`):
        Tokenizer of class
        [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
    tokenizer (`T5TokenizerFast`):
        Second Tokenizer of class
        [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
    connectors ([`LTX2TextConnectors`]):
        Text connector stack used to adapt text encoder hidden states for the video and audio branches.
z>text_encoder->connectors->transformer->vae->audio_vae->vocoder	processor)latentsprompt_embedsnegative_prompt_embedsNr:   vae	audio_vaetext_encoder	tokenizer
connectorstransformervocoderc
                   > [         T
U ]  5         U R                  UUUUUUUUU	S9	  [        U SS 5      b  U R                  R
                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U SS 5      b   U R                  R                  R                  OSU l        [        U S5      b   U R                  R                  R"                  OSU l        [        U SS 5      b   U R                  R                  R&                  OS	U l        [        U SS 5      b   U R                  R                  R*                  OS
U l        [/        U R                  S9U l        [        U SS 5      b  U R2                  R4                  U l        g SU l        g )N)	rT   rU   rV   rW   rX   rY   rZ   r:   rP   rT          rU      rY   r   i>     )vae_scale_factorrW      )super__init__register_modulesgetattrrT   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratiorU   mel_compression_ratioaudio_vae_mel_compression_ratio$audio_vae_temporal_compression_ratiorY   config
patch_sizetransformer_spatial_patch_sizepatch_size_ttransformer_temporal_patch_sizesample_rateaudio_sampling_ratemel_hop_lengthaudio_hop_lengthr   video_processorrW   model_max_lengthtokenizer_max_length)selfr:   rT   rU   rV   rW   rX   rY   rZ   rP   r8   s             r)   rc   LTX2Pipeline.__init__   s    	%!# 	 
	
 3:$t2L2XDHH..^` 	* 4;43M3YDHH//_` 	+
 5<D+t4T4`DNN00fg 	, :A{TX9Y9eDNN55kl 	1 3:$t2T2`D##..fg 	+ 5<D-4P4\D##00bc 	,
 29{D1Q1]DNN!!--ch 	  5<D+t4T4`DNN!!00fi 	  .t?a?ab/6t[$/O/[DNN++ 	!ae 	!r+   r   ra   promptnum_videos_per_promptmax_sequence_lengthscale_factorr-   dtypec           	         U=(       d    U R                   nU=(       d    U R                  R                  n[        U[        5      (       a  U/OUn[        U5      n[        U SS5      bM  SU R                  l        U R                  R                  c%  U R                  R                  U R                  l	        U Vs/ s H  oR                  5       PM     nnU R                  USUSSSS9n	U	R                  n
U	R                  nU
R                  U5      n
UR                  U5      nU R                  XSS9nUR                  n[         R"                  " US	S
9nUR%                  SS5      R                  US9nUR&                  u  nnnUR)                  SUS5      nUR+                  Xr-  US	5      nUR+                  US	5      nUR)                  US5      nX4$ s  snf )a  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `list[str]`, *optional*):
        prompt to be encoded
    device: (`str` or `torch.device`):
        torch device to place the resulting embeddings on
    dtype: (`torch.dtype`):
        torch dtype to cast the prompt embeds to
    max_sequence_length (`int`, defaults to 1024): Maximum sequence length to use for the prompt.
rW   Nleft
max_lengthTpt)paddingr   
truncationadd_special_tokensreturn_tensors)	input_idsattention_maskoutput_hidden_statesr@   r   r	   r   r   )_execution_devicerV   r   
isinstancestrr9   re   rW   padding_side	pad_token	eos_tokenstripr   r   tohidden_statestorchstackflattenshaperepeatview)ry   r{   r|   r}   r~   r-   r   
batch_sizeptext_inputstext_input_idsprompt_attention_masktext_encoder_outputstext_encoder_hidden_statesrR   _seq_lens                    r)   _get_gemma_prompt_embeds%LTX2Pipeline._get_gemma_prompt_embeds  s   * 14110**00'44&&[
4d+7*0DNN'~~''/+/>>+C+C(%+,V'')V,nn *# % 
 %.. + : :'**62 5 8 8 @#00$ae  1  
 &:%G%G"%*[[1KQS%T"2::1a@CC%CP &++7A%,,Q0EqI%**:+MwXZ[ 5 : ::r J 5 < <=RTU V33; -s   9GTnegative_promptdo_classifier_free_guidancerR   rS   r   negative_prompt_attention_maskc                 @   U=(       d    U R                   n[        U[        5      (       a  U/OUnUb  [        U5      nOUR                  S   nUc  U R                  UUU	U
UUS9u  pWU(       a  Uc  U=(       d    Sn[        U[        5      (       a  X/-  OUnUb;  [        U5      [        U5      La$  [        S[        U5       S[        U5       S35      eU[        U5      :w  a!  [        SU S[        U5       S	U SU S
3	5      eU R                  UUU	U
UUS9u  phXWXh4$ )ab  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `list[str]`, *optional*):
        prompt to be encoded
    negative_prompt (`str` or `list[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
        Whether to use classifier free guidance or not.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    device: (`torch.device`, *optional*):
        torch device
    dtype: (`torch.dtype`, *optional*):
        torch dtype
r   )r{   r|   r}   r~   r-   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	r   r   r   r9   r   r   type	TypeErrorr1   )ry   r{   r   r   r|   rR   rS   r   r   r}   r~   r-   r   r   s                 r)   encode_promptLTX2Pipeline.encode_promptQ  s   R 1411'44&&VJ&,,Q/J 373P3P&;$7) 4Q 40M '+A+I-3O@J?\_@`@`j+<<fuO!d6l$:O&OUVZ[jVkUl mV~Q(  s?33 )/)::J3K_J` ax/
| <33  FJEbEb&&;$7) Fc FB" 5Kkkr+      
   system_promptmax_new_tokensseed	generatorgeneration_kwargsc           	      t   U=(       d    U R                   nUc  SSS.nSUS.SSU 3S./nU R                  R                  R                  US	SS
9n	U R                  U	SSS9R	                  U5      n
U R
                  R	                  U5        Ub  UR                  5       n[        R                  " U5        U R
                  R                  " S0 U
DSU0DUD6n[        U5       VVs/ s H   u  pU[        U
R                  U   5      S PM"     nnnU R                  R                  R                  USS9nU$ s  snnf )z
Enhances the supplied `prompt` by generating a new prompt using the current text encoder (default is a
`transformers.Gemma3ForConditionalGeneration` model) from it and a system prompt.
NTgffffff?)	do_sampletemperaturesystem)rolecontentuserzuser prompt: F)tokenizeadd_generation_promptr   )textimagesr   r   )skip_special_tokensr$   )r   rP   rW   apply_chat_templater   rV   initial_seedr   manual_seedgenerate	enumerater9   r   batch_decode)ry   r{   r   r   r   r   r   r-   messagestemplatemodel_inputsgenerated_sequencesiseqgenerated_idsenhanced_prompts                   r)   enhance_promptLTX2Pipeline.enhance_prompt  sZ    1411$.23 G -8-x(@A
 >>++??SXpt?u~~8DQU~VYYZ`aV$  ))+D$"//88 

)
  
 NWWjMklMk61S!7!7!:;=>Mkl..22??cg?h ms   $'D4c           
      t  ^  US-  S:w  d	  US-  S:w  a  [        SU SU S35      eUbW  [        U 4S jU 5       5      (       d=  [        ST R                   SU Vs/ s H  oT R                  ;  d  M  UPM     sn 35      eUb  Ub  [        S	U S
U S35      eUc  Uc  [        S5      eUbA  [        U[        5      (       d,  [        U[
        5      (       d  [        S[        U5       35      eUb  Uc  [        S5      eUb  Uc  [        S5      eUb  Ub  UR                  UR                  :w  a&  [        SUR                   SUR                   S35      eUR                  UR                  :w  a&  [        SUR                   SUR                   S35      eU
S:  d  US:  a  U	(       d  [        S5      eg g s  snf )Nr\   r   z8`height` and `width` have to be divisible by 32 but are z and r   c              3   @   >#    U  H  oTR                   ;   v   M     g 7fr#   )_callback_tensor_inputs).0kry   s     r)   	<genexpr>,LTX2Pipeline.check_inputs.<locals>.<genexpr>  s      F
7Y!---7Ys   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is zEMust provide `prompt_attention_mask` when specifying `prompt_embeds`.zWMust provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but got: `prompt_attention_mask` z% != `negative_prompt_attention_mask`         zSpatio-Temporal Guidance (STG) is specified but no STG blocks are supplied. Please supply a list ofblock indices at which to apply STG in `spatio_temporal_guidance_blocks`)r1   allr   r   r   rC   r   r   )ry   r{   heightwidth"callback_on_step_end_tensor_inputsrR   rS   r   r   spatio_temporal_guidance_blocks	stg_scaleaudio_stg_scaler   s   `            r)   check_inputsLTX2Pipeline.check_inputs  sm    B;!urzQWX^W__dejdkklmnn-9# F
7YF
 C
 C
 DTEaEaDbbn  |^  pH  |^vw  ko  kG  kG  bGpq  |^  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@)@TZ\`IaIaQRVW]R^Q_`aa$)>)Fdee!-2P2Xvww$)?)K""&<&B&BB --:-@-@,A B.445Q8 
 %**.L.R.RR 55J5P5P4Q R6<<=Q@  _/C"7Ba[  Cb"7G pHs   F51F5rQ   rn   rp   returnc           
          U R                   u  p4pVnXR-  nXa-  n	Xq-  n
U R                  USUUU	UU
U5      n U R                  SSSSSSSS	5      R                  SS	5      R                  SS5      n U $ )
Nr   r   r   r^      r   r	         )r   reshapepermuter   )rQ   rn   rp   r   num_channels
num_framesr   r   post_patch_num_framespost_patch_heightpost_patch_widths              r)   _pack_latentsLTX2Pipeline._pack_latents  s     ?Fmm;
*e * :"0 .//!	
 //!Q1aAq9AA!QGOOPQSTUr+   r   r   r   c           
          U R                  S5      nU R                  XaX#SXTU5      n U R                  SSSSSSSS	5      R                  SS	5      R                  SS5      R                  SS5      n U $ )
Nr   r   r^   r   r   r   r   r	   r   )sizer   r   r   )rQ   r   r   r   rn   rp   r   s          r)   _unpack_latentsLTX2Pipeline._unpack_latents(  st     \\!_
//*&\gqr//!Q1aAq9AA!QGOOPQSTU]]^_abcr+         ?latents_meanlatents_stdscaling_factorc                     UR                  SSSSS5      R                  U R                  U R                  5      nUR                  SSSSS5      R                  U R                  U R                  5      nX-
  U-  U-  n U $ Nr   r   r   r   r-   r   rQ   r   r   r   s       r)   _normalize_latentsLTX2Pipeline._normalize_latents4  su     $((B1a8;;GNNGMMZ!&&q"aA699'..'--X)^;kIr+   c                     UR                  SSSSS5      R                  U R                  U R                  5      nUR                  SSSSS5      R                  U R                  U R                  5      nX-  U-  U-   n U $ r   r   r   s       r)   _denormalize_latents!LTX2Pipeline._denormalize_latents?  su    
 $((B1a8;;GNNGMMZ!&&q"aA699'..'--X'.8<Gr+   c                     UR                  U R                  U R                  5      nUR                  U R                  U R                  5      nX-
  U-  $ r#   r   r-   r   rQ   r   r   s      r)   _normalize_audio_latents%LTX2Pipeline._normalize_audio_latentsI  s@    #w~~w}}E!nnW^^W]]C&+55r+   c                     UR                  U R                  U R                  5      nUR                  U R                  U R                  5      nX-  U-   $ r#   r  r  s      r)   _denormalize_audio_latents'LTX2Pipeline._denormalize_audio_latentsO  s@    #w~~w}}E!nnW^^W]]C%55r+   noise_scalec                 r    [        U R                  X R                  U R                  S9nX-  SU-
  U -  -   nU$ )Nr   r-   r   r   )r   r   r-   r   )rQ   r  r   noisenoised_latentss        r)   _create_noised_state!LTX2Pipeline._create_noised_stateU  s=     W]]i^e^k^kl$,K7/JJr+   c                    Ubf  Ubc  U R                   u  p4pVXR-  nXa-  nU R                  USXrX5      n U R                  SSSSSS5      R                  SS5      R                  SS5      n U $ U R	                  SS5      R                  SS5      n U $ )Nr   r   r   r^   r   r	   r   )r   r   r   r   	transpose)	rQ   rn   rp   r   r   latent_lengthlatent_mel_binspost_patch_latent_lengthpost_patch_mel_binss	            r)   _pack_audio_latents LTX2Pipeline._pack_audio_latents]  s    
 !l&> HO}}DJm'4'C$"1">ooB 8H[G ooaAq!Q7??1EMMaQRSG
  ''1-55a;Gr+   r  num_mel_binsc                    Ub`  Ub]  U R                  S5      nU R                  XQUSXC5      n U R                  SSSSSS5      R                  SS5      R                  SS5      n U $ U R	                  SSU45      R                  SS5      n U $ )Nr   r   r	   r   r^   r   r   )r   r   r   r   	unflattenr  )rQ   r  r  rn   rp   r   s         r)   _unpack_audio_latents"LTX2Pipeline._unpack_audio_latentsr  s     !l&> aJoojrS_lGooaAq!Q7??1EMMaQRSG  ''B+=>HHANGr+      y   r   r   num_channels_latentsc                    U
b  U
R                   S:X  a  U R                  XR                  R                  U R                  R                  U R                  R
                  R                  5      n
U R                  XR                  U R                  5      n
U
R                   S:w  a  [        SU
R                   S35      eU R                  XU	5      n
U
R                  XS9$ X0R                  -  nX@R                  -  nUS-
  U R                  -  S-   nXXSU4n[!        U	["        5      (       a*  [%        U	5      U:w  a  [        S[%        U	5       SU S	35      e['        XXS
9n
U R                  XR                  U R                  5      n
U
$ )Nr   r	   $Provided `latents` tensor has shape @, but the expected shape is [batch_size, num_seq, num_features].r-   r   r   /You have passed a list of generators of length +, but requested an effective batch size of @. Make sure the batch size matches the length of the generators.r  )rE   r   rT   r   r   rm   r   r   ro   rq   r1   r   r  r   rg   ri   r   rC   r9   r   )ry   r   r#  r   r   r   r  r   r-   r   rQ   r   s               r)   prepare_latentsLTX2Pipeline.prepare_latents  s    ||q 11XX22DHH4H4H$((//JhJh ,,@@$BfBf ||q  :7==/  JJ  K  //iPG::V:99===;;; 1n)L)LLqP
:uMi&&3y>Z+GA#i.AQ R&<'gi 
 u&V$$88$:^:^
 r+   audio_latent_lengthc
                 .   U	b  U	R                   S:X  a  U R                  U	5      n	U	R                   S:w  a  [        SU	R                   S35      eU R	                  XR
                  R                  U R
                  R                  5      n	U R                  XU5      n	U	R                  XvS9$ X@R                  -  n
XX:4n[        U[        5      (       a*  [        U5      U:w  a  [        S[        U5       SU S35      e[        XXvS	9n	U R                  U	5      n	U	$ )
Nr^   r	   r%  r&  r'  r(  r)  r*  r  )rE   r  r1   r   r  rU   r   r   r  r   rk   r   rC   r9   r   )ry   r   r#  r-  r  r  r   r-   r   rQ   r  r   s               r)   prepare_audio_latents"LTX2Pipeline.prepare_audio_latents  s&    ||q 227;||q  :7==/  JJ  K  33G^^=X=XZ^ZhZhZtZtuG//iPG::V:99&*N*NN3FXi&&3y>Z+GA#i.AQ R&<'gi 
 u&V**73r+   sampledenoised_outputstep_idxc                 L    Uc  U R                   nXUR                  U   -  -
  nU$ r#   r:   r/   )ry   r1  r2  r3  r:   	sample_x0s         r)   convert_velocity_to_x0#LTX2Pipeline.convert_velocity_to_x0  s1     Iy/?/?/III	r+   c                 L    Uc  U R                   nX-
  UR                  U   -  nU$ r#   r5  )ry   r1  r2  r3  r:   sample_vs         r)   convert_x0_to_velocity#LTX2Pipeline.convert_x0_to_velocity  s0     I,	0@0@0JJr+   c                     U R                   $ r#   )_guidance_scalery   s    r)   guidance_scaleLTX2Pipeline.guidance_scale      ###r+   c                     U R                   $ r#   )_guidance_rescaler?  s    r)   rH   LTX2Pipeline.guidance_rescale      %%%r+   c                     U R                   $ r#   )
_stg_scaler?  s    r)   r   LTX2Pipeline.stg_scale      r+   c                     U R                   $ r#   )_modality_scaler?  s    r)   modality_scaleLTX2Pipeline.modality_scale  rB  r+   c                     U R                   $ r#   )_audio_guidance_scaler?  s    r)   audio_guidance_scale!LTX2Pipeline.audio_guidance_scale      )))r+   c                     U R                   $ r#   )_audio_guidance_rescaler?  s    r)   audio_guidance_rescale#LTX2Pipeline.audio_guidance_rescale   s    +++r+   c                     U R                   $ r#   )_audio_stg_scaler?  s    r)   r   LTX2Pipeline.audio_stg_scale  s    $$$r+   c                     U R                   $ r#   )_audio_modality_scaler?  s    r)   audio_modality_scale!LTX2Pipeline.audio_modality_scale  rS  r+   c                 L    U R                   S:  =(       d    U R                  S:  $ Nr   )r>  rP  r?  s    r)   r   (LTX2Pipeline.do_classifier_free_guidance  #    $$s*Q0J0JS0PQr+   c                 L    U R                   S:  =(       d    U R                  S:  $ )Nr   )rH  rY  r?  s    r)   do_spatio_temporal_guidance(LTX2Pipeline.do_spatio_temporal_guidance  s!    #%G4+@+@3+FGr+   c                 L    U R                   S:  =(       d    U R                  S:  $ r`  )rL  r\  r?  s    r)   do_modality_isolation_guidance+LTX2Pipeline.do_modality_isolation_guidance  rb  r+   c                     U R                   $ r#   )_num_timestepsr?  s    r)   num_timestepsLTX2Pipeline.num_timesteps  s    """r+   c                     U R                   $ r#   )_current_timestepr?  s    r)   current_timestepLTX2Pipeline.current_timestep  rF  r+   c                     U R                   $ r#   )_attention_kwargsr?  s    r)   attention_kwargsLTX2Pipeline.attention_kwargs   rF  r+   c                     U R                   $ r#   )
_interruptr?  s    r)   	interruptLTX2Pipeline.interrupt$  rJ  r+   g      8@(   g      @Fpil
frame_rater,   r/   r.   r@  r   rM  rH   rQ  r   r]  rV  r   audio_latentsdecode_timestepdecode_noise_scaleuse_cross_timestepprompt_max_new_tokensprompt_enhancement_kwargsprompt_enhancement_seedoutput_typereturn_dictrs  callback_on_step_endr   c)                     [        U&[        [        45      (       a  U&R                  n'U=(       d    U
nU=(       d    UnU=(       d    UnU=(       d    UnU R	                  UUUU'UUUUUUUS9  Xl        Xl        Xl        Xl        Xl	        Xl
        UU l        UU l        U%U l        SU l        SU l        Ub  [        U[         5      (       a  Sn)O3Ub!  [        U["        5      (       a  [%        U5      n)OUR&                  S   n)U R(                  n*Ub  Ub  U R+                  UUU U"UU!U*S9nU R-                  UUU R.                  UUUUUU(U*S9
u  nnnnU R.                  (       a.  [0        R2                  " UU/SS9n[0        R2                  " UU/SS9nS	n+[5        U S
S5      b  [5        U R6                  SS	5      n+U R9                  UUU+S9u  n,n-n.US-
  U R:                  -  S-   n/X0R<                  -  n0X@R<                  -  n1Ub  UR>                  S:X  a(  [@        RC                  S5        UR&                  u    n2n/n0n1OMUR>                  S:X  a$  [@        RE                  SUR&                   S35        O[G        SUR&                   S35      eU RH                  RJ                  RL                  n3U RO                  U)U-  U3UUUU[0        RP                  U*UU5
      nXV-  n4U RR                  U RT                  -  [W        U RX                  5      -  n5[[        U4U5-  5      n6Ub  UR>                  S:X  a'  [@        RC                  S5        UR&                  u    n2n6n2OMUR>                  S:X  a$  [@        RE                  SUR&                   S35        O[G        SUR&                   S35      e[5        U SS5      b   U R\                  RJ                  R^                  OSn7U7U R`                  -  n8[5        U SS5      b   U R\                  RJ                  Rb                  OSn9U Re                  U)U-  U9U6U7U[0        RP                  U*UUS9	nUc  [f        Rh                  " SSU-  U5      OUn[k        U Rl                  RJ                  Ro                  SS 5      U Rl                  RJ                  Ro                  S!S"5      U Rl                  RJ                  Ro                  SS 5      U Rl                  RJ                  Ro                  S#S$5      U Rl                  RJ                  Ro                  S%S&5      5      n:[p        Rr                  " U Rl                  5      n;[u        U;UU*U	UU:S'9u    n2[u        U Rl                  UU*U	UU:S'9u  p[w        [%        U	5      XpRl                  Rx                  -  -
  S5      n<[%        U	5      U l=        U RH                  R|                  R                  UR&                  S   U/U0U1UR                  US(9n=U RH                  R                  R                  UR&                  S   U6UR                  5      n>U R.                  (       aH  U=R                  S)S*U=R>                  S-
  -  -   5      n=U>R                  S)S*U>R>                  S-
  -  -   5      n>U R                  US+9 n?[        U	5       GHq  u  n@nAU R                  (       a  M  WAU l        U R.                  (       a  [0        R2                  " U/S,-  5      OUnBUBR                  UR                  5      nBU R.                  (       a  [0        R2                  " U/S,-  5      OUnCUCR                  UR                  5      nCWAR                  WBR&                  S   5      nDU RH                  R                  S-5         U RI                  UBUCU,U-UDUDU.U.U/U0U1UU6U=U>SSSUU%SS.9u  nEnFSSS5        WERW                  5       nEWFRW                  5       nFU R.                  (       GaQ  WER                  S,5      u  nGnEU R                  UUEW@U Rl                  5      nEU R                  UUGU@U Rl                  5      nGU R                  S-
  UEUG-
  -  nHWFR                  S,5      u  nInFU R                  UUFU@U;5      nFU R                  UUIU@U;5      nIU R                  S-
  UFUI-
  -  nJU R                  (       d  U R                  (       ax  W@S:X  a_  U,R                  S,SS9S   nKU-R                  S,SS9S   nLU.R                  S,SS9S   nMU=R                  S,SS9S   nNU>R                  S,SS9S   nOWDR                  S,SS9S   nDO@S=nHnJU,nKU-nLU.nMU=nNU>nOU R                  UWEW@U Rl                  5      nEU R                  UWFU@U;5      nFU R                  (       a  U RH                  R                  S/5         U RI                  UR                  UR                  S09UR                  UR                  S09WKWLWDUDWMUMU/U0U1UU6WNWOSUSUU%SS.9u  nPnQSSS5        WPRW                  5       nPWQRW                  5       nQU R                  UUPW@U Rl                  5      nPU R                  UUQU@U;5      nQU R                  WEUP-
  -  nRU R                  WFUQ-
  -  nSOS=nRnSU R                  (       a  U RH                  R                  S15         U RI                  UR                  UR                  S09UR                  UR                  S09WKWLWDUDWMUMU/U0U1UU6WNWOS2SSUU%SS.9u  nTnUSSS5        WTRW                  5       nTWURW                  5       nUU R                  UUTW@U Rl                  5      nTU R                  UUUU@U;5      nUU R                  S-
  WEUT-
  -  nVU R                  S-
  WFUU-
  -  nWOS=nVnWWEWH-   WR-   WV-   nXWFWJ-   WS-   WW-   nYU R                  S:  a  [        WXWEU R                  S39nEOWXnEU R                  S:  a  [        WYWFU R                  S39nFOWYnFU R                  UWEW@U Rl                  5      nEU R                  UWFU@U;5      nFU Rl                  R                  UEWAUSS49S   nU;R                  UFUAUSS49S   nU&bJ  0 nZU' H  n[[        5       U[   WZU['   M     U&" U W@WAWZ5      n\U\R                  S5U5      nU\R                  S6U5      nW@[%        U	5      S-
  :X  d)  W@S-   U<:  a0  W@S-   U Rl                  Rx                  -  S:X  a  U?R                  5         [        (       d  GM\  [        R                  " 5         GMt     SSS5        U R                  UU/U0U1U R                  U R                  5      nU R                  UU R\                  R                  U R\                  R                  5      nU R                  UU6U8S79nU#S8:X  a`  U R                  UU R                  R                  U R                  R                  U R                  RJ                  R                  5      nUn]Un^GOUR                  UR                  5      nU R                  RJ                  R                  (       d  SnDO[        UR&                  UU*UR                  S99n_[        U["        5      (       d  U/U)-  nUc  UnO[        U["        5      (       d  U/U)-  n[0        R                  " UU*UR                  S:9nD[0        R                  " UU*UR                  S:9SS2SSSS4   nSU-
  U-  UW_-  -   nU R                  UU R                  R                  U R                  R                  U R                  RJ                  R                  5      nUR                  U R                  R                  5      nU R                  R                  UWDSS49S   n]U R                  R                  U]U#S;9n]UR                  U R\                  R                  5      nU R\                  R                  USS49S   n`U R                  U`5      n^U R                  5         U$(       d  W]W^4$ [        W]W^S<9$ ! , (       d  f       GNv= f! , (       d  f       GN<= f! , (       d  f       GNJ= f! , (       d  f       GNG= f)=uN+  
Function invoked when calling the pipeline for generation.

Args:
    prompt (`str` or `list[str]`, *optional*):
        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
        instead.
    height (`int`, *optional*, defaults to `512`):
        The height in pixels of the generated image. This is set to 480 by default for the best results.
    width (`int`, *optional*, defaults to `768`):
        The width in pixels of the generated image. This is set to 848 by default for the best results.
    num_frames (`int`, *optional*, defaults to `121`):
        The number of video frames to generate
    frame_rate (`float`, *optional*, defaults to `24.0`):
        The frames per second (FPS) of the generated video.
    num_inference_steps (`int`, *optional*, defaults to 40):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    sigmas (`List[float]`, *optional*):
        Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
        their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
        will be used.
    timesteps (`list[int]`, *optional*):
        Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
        in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
        passed will be used. Must be in descending order.
    guidance_scale (`float`, *optional*, defaults to `4.0`):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality. Used for the video modality (there is
        a separate value `audio_guidance_scale` for the audio modality).
    stg_scale (`float`, *optional*, defaults to `0.0`):
        Video guidance scale for Spatio-Temporal Guidance (STG), proposed in [Spatiotemporal Skip Guidance for
        Enhanced Video Diffusion Sampling](https://arxiv.org/abs/2411.18664). STG uses a CFG-like estimate
        where we move the sample away from a weak sample from a perturbed version of the denoising model.
        Enabling STG will result in an additional denoising model forward pass; the default value of `0.0`
        means that STG is disabled.
    modality_scale (`float`, *optional*, defaults to `1.0`):
        Video guidance scale for LTX-2.X modality isolation guidance, where we move the sample away from a
        weaker sample generated by the denoising model withy cross-modality (audio-to-video and video-to-audio)
        cross attention disabled using a CFG-like estimate. Enabling modality guidance will result in an
        additional denoising model forward pass; the default value of `1.0` means that modality guidance is
        disabled.
    guidance_rescale (`float`, *optional*, defaults to 0.0):
        Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
        Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
        [Common Diffusion Noise Schedules and Sample Steps are
        Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
        using zero terminal SNR. Used for the video modality.
    audio_guidance_scale (`float`, *optional* defaults to `None`):
        Audio guidance scale for CFG with respect to the negative prompt. The CFG update rule is the same for
        video and audio, but they can use different values for the guidance scale. The LTX-2.X authors suggest
        that the `audio_guidance_scale` should be higher relative to the video `guidance_scale` (e.g. for
        LTX-2.3 they suggest 3.0 for video and 7.0 for audio). If `None`, defaults to the video value
        `guidance_scale`.
    audio_stg_scale (`float`, *optional*, defaults to `None`):
        Audio guidance scale for STG. As with CFG, the STG update rule is otherwise the same for video and
        audio. For LTX-2.3, a value of 1.0 is suggested for both video and audio. If `None`, defaults to the
        video value `stg_scale`.
    audio_modality_scale (`float`, *optional*, defaults to `None`):
        Audio guidance scale for LTX-2.X modality isolation guidance. As with CFG, the modality guidance rule
        is otherwise the same for video and audio. For LTX-2.3, a value of 3.0 is suggested for both video and
        audio. If `None`, defaults to the video value `modality_scale`.
    audio_guidance_rescale (`float`, *optional*, defaults to `None`):
        A separate guidance rescale factor for the audio modality. If `None`, defaults to the video value
        `guidance_rescale`.
    spatio_temporal_guidance_blocks (`list[int]`, *optional*, defaults to `None`):
        The zero-indexed transformer block indices at which to apply STG. Must be supplied if STG is used
        (`stg_scale` or `audio_stg_scale` is greater than `0`). A value of `[29]` is recommended for LTX-2.0
        and `[28]` is recommended for LTX-2.3.
    noise_scale (`float`, *optional*, defaults to `0.0`):
        The interpolation factor between random noise and denoised latents at each timestep. Applying noise to
        the `latents` and `audio_latents` before continue denoising.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        The number of videos to generate per prompt.
    generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will be generated by sampling using the supplied random `generator`.
    audio_latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will be generated by sampling using the supplied random `generator`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    prompt_attention_mask (`torch.Tensor`, *optional*):
        Pre-generated attention mask for text embeddings.
    negative_prompt_embeds (`torch.FloatTensor`, *optional*):
        Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
        provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
    negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
        Pre-generated attention mask for negative text embeddings.
    decode_timestep (`float`, defaults to `0.0`):
        The timestep at which generated video is decoded.
    decode_noise_scale (`float`, defaults to `None`):
        The interpolation factor between random noise and denoised latents at the decode timestep.
    use_cross_timestep (`bool` *optional*, defaults to `False`):
        Whether to use the cross modality (audio is the cross modality of video, and vice versa) sigma when
        calculating the cross attention modulation parameters. `True` is the newer (e.g. LTX-2.3) behavior;
        `False` is the legacy LTX-2.0 behavior.
    system_prompt (`str`, *optional*, defaults to `None`):
        Optional system prompt to use for prompt enhancement. The system prompt will be used by the current
        text encoder (by default, a `Gemma3ForConditionalGeneration` model) to generate an enhanced prompt from
        the original `prompt` to condition generation. If not supplied, prompt enhancement will not be
        performed.
    prompt_max_new_tokens (`int`, *optional*, defaults to `512`):
        The maximum number of new tokens to generate when performing prompt enhancement.
    prompt_enhancement_kwargs (`dict[str, Any]`, *optional*, defaults to `None`):
        Keyword arguments for `self.text_encoder.generate`. If not supplied, default arguments of
        `do_sample=True` and `temperature=0.7` will be used. See
        https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate
        for more details.
    prompt_enhancement_seed (`int`, *optional*, default to `10`):
        Random seed for any random operations during prompt enhancement.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generate image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ltx.LTX2PipelineOutput`] instead of a plain tuple.
    attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    callback_on_step_end (`Callable`, *optional*):
        A function that calls at the end of each denoising steps during the inference. The function is called
        with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
        callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
        `callback_on_step_end_tensor_inputs`.
    callback_on_step_end_tensor_inputs (`List`, *optional*, defaults to `["latents"]`):
        The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
        will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
        `._callback_tensor_inputs` attribute of your pipeline class.
    max_sequence_length (`int`, *optional*, defaults to `1024`):
        Maximum sequence length to use with the `prompt`.

Examples:

Returns:
    [`~pipelines.ltx.LTX2PipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.ltx.LTX2PipelineOutput`] is returned, otherwise a `tuple` is
        returned where the first element is a list with the generated images.
)r{   r   r   r   rR   rS   r   r   r   r   r   FNr   r   )r{   r   r   r   r   r   r-   )
r{   r   r   r|   rR   rS   r   r   r}   r-   r   r   rW   r   )r   r   zGot latents of shape [batch_size, latent_dim, latent_frames, latent_height, latent_width], `latent_num_frames`, `latent_height`, `latent_width` will be inferred.r	   z,You have supplied packed `latents` of shape zp, so the latent dims cannot be inferred. Make sure the supplied `height`, `width`, and `num_frames` are correct.r%  z, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, latent_dim, latent_frames, latent_height, latent_width].r^   zsGot audio_latents of shape [batch_size, num_channels, audio_length, mel_bins], `audio_num_frames` will be inferred.z2You have supplied packed `audio_latents` of shape zj, so the latent dims cannot be inferred. Make sure the supplied `num_frames` and `frame_rate` are correct.z*Provided `audio_latents` tensor has shape z}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, num_channels, audio_length, mel_bins].rU   @   r]   )r#  r-  r  r  r   r-   r   rQ   r   max_image_seq_len   base_image_seq_lenra   r    gffffff?r!   gffffff @)r/   r(   )fps)r   )r   )totalr   cond_uncond)r   audio_hidden_statesencoder_hidden_statesaudio_encoder_hidden_statestimestepsigmaencoder_attention_maskaudio_encoder_attention_maskr   r   r   r  audio_num_framesvideo_coordsaudio_coordsisolate_modalitiesr   perturbation_maskr  rs  r  
uncond_stgr   uncond_modalityT)rH   )r  rQ   rR   )r  latentr  r'  )r  )framesaudio)sr   r   r
   tensor_inputsr   r>  rH  rL  rD  rP  rY  r\  rU  rr  rv  rn  r   rC   r9   r   r   r   r   r   r   catre   rW   rX   ri   rg   rE   loggerinfowarningr1   rY   rm   in_channelsr+  float32rs   ru   floatrl   roundrU   mel_binsrk   latent_channelsr/  nplinspacer*   r:   getcopydeepcopyr>   maxorderrj  ropeprepare_video_coordsr-   
audio_ropeprepare_audio_coordsr   progress_barr   rw  r   r   expandcache_contextchunkr7  r@  rQ  rd  rg  r   r   rM  r]  rH   rL   rV  r;  steplocalspopupdateXLA_AVAILABLExm	mark_stepr   ro   rq   r  r   r   r  r  rT   r   timestep_conditioningr   tensordecoderv   postprocess_videorZ   maybe_free_model_hooksr   )ary   r{   r   r   r   r   r{  r,   r/   r.   r@  r   rM  rH   rQ  r   r]  rV  r   r  r|   r   rQ   r|  rR   r   rS   r   r}  r~  r  r   r  r  r  r  r  rs  r  r   r}   r   r-   tokenizer_padding_sideconnector_prompt_embedsconnector_audio_prompt_embedsconnector_attention_masklatent_num_frameslatent_heightlatent_widthr   r#  
duration_saudio_latents_per_secondr  r  r  num_channels_latents_audior(   audio_schedulernum_warmup_stepsr  r  r  r   tlatent_model_inputaudio_latent_model_inputr  noise_pred_videonoise_pred_audionoise_pred_video_uncond_textvideo_cfg_deltanoise_pred_audio_uncond_textaudio_cfg_deltavideo_prompt_embedsaudio_prompt_embedsprompt_attn_maskvideo_pos_idsaudio_pos_idsnoise_pred_video_uncond_stgnoise_pred_audio_uncond_stgvideo_stg_deltaaudio_stg_delta noise_pred_video_uncond_modality noise_pred_audio_uncond_modalityvideo_modality_deltaaudio_modality_deltanoise_pred_video_gnoise_pred_audio_gcallback_kwargsr   callback_outputsvideor  r  generated_mel_spectrogramssa                                                                                                    r)   __call__LTX2Pipeline.__call__(  s   D *-=?U,VWW1E1S1S.3E~)6Y3E~!7!K;K 	/Q'#9"7+I,K+ 	 	
  .#-!1%9" /%9"'=$!1!% *VS"9"9JJvt$<$<VJ&,,Q/J'' $);((+4,#"; ) F +(,(H(H"7'#9"7+I 3  
	
!"* ++!II'=}&MSTUM$)II/MOd.ekl$m!!'4d+7%,T^^^V%T"[_[j[j0?U \k \
X!>@X
 (!^0S0SSVWW"D"DD B BB||q  x HO}}D1'"B7==/ Ri j
 !:7==/  JY  Z 
  $//66BB&&.. MM
  ,
$$t'<'<<uTEnEn?oo 	! !.F!FG$!!Q& J -:,?,?)1&##q(HI\I\H] ^m n
 !@ATAT@U  VS  T  :A{TX9Y9et~~,,55km&$*N*NN5<T;PT5U5aDNN!!11gh 	# 22..!; 0%#--! 3 

 TZSaS!&9"9;NOgmNN!!%%&94@NN!!%%&:DANN!!%%&94@NN!!%%lD9NN!!%%k48
 --7!
1 *<NN*
&	 s9~0CnnFZFZ0ZZ\]^!)n '',,AAMM!/gnnbl B 
 ''22GG"$4m6J6J
 ++'..tdl>O>ORS>S6T/TUL'..tdl>O>ORS>S6T/TUL %89\!),1>>)*&AEAaAaUYYy1}%=gn"%7%:%:=;N;N%O"6:6V6VEII}o12\i ) ,D+F+F}GZGZ+[( 88$6$<$<Q$?@%%33MB9=9I9I&8,D.E4Q!)&/G5M#4,*&)9%1%1+08<*.+=)9$)+ :J :6$&6 C0 $4#9#9#; #3#9#9#; 333EUE[E[\]E^B02B'+'B'B7L\^_aeaoao'p$373N3N!=q$..40 (,':':Q'>CSVrCr&sOEUE[E[\]E^B02B'+'B'B=Rbdegv'w$373N3N%'CQ40 (,'@'@1'D(+GG'O
 774;^;^62I2O2OPQWX2O2YZ[2\/2O2U2UVW]^2U2_`a2b//G/M/MaUV/M/WXY/Z,,8,>,>qa,>,H,KM,8,>,>qa,>,H,KM $,>>!>#;A#>899Oo*A'*G''?$$0M$0M'+'B'B7L\^_aeaoao'p$'+'B'B=Rbdegv'w$33))77ESWScSc*1**=;N;N**O0=0@0@}GZGZ0@0[2E8K%-"*3C9I'8#0". *-=)6)6/4<[.2/A-=(-- Td TP35P F2 3N2S2S2U/2M2S2S2U/262M2M!<a3/ 372M2M%'BA3/ '+nn8HKf8f&gO&*&:&:>NQl>l&mO899Oo66))778IJ]a]m]m*1**=;N;N**O0=0@0@}GZGZ0@0[2E8K%-"*3C9I'8#0". *-=)6)6/3<@.2/A-=(-- ^n ^Z8:Z K2 8X7]7]7_47W7]7]7_47;7R7R!A1dnn84 8<7R7R%'GO84 -1,?,?!,C(+KK,( -1,E,E,I(+KK,( CDC(+? &6%G/%Y\p%p"%5%G/%Y\p%p" ((1,'8*,<tOdOd($ (:$..2'8*,<tOjOj($ (:$ $(#>#>wHXZ[]a]k]k#l #'#>#>}N^`acr#s  ..--.>7X]-^_`a !0 4 45Eq-ej 4 klm n'3&(O?-3Xa[* @';D!Q'X$.229gFG$4$8$8-$XM I**A9I/IqSTuX\XfXfXlXlNlpqNq '') =LLNm - :r &&//00
 774>>668R8R
 22=BRap2q("//..0D0DdhhooFdFdG E!Ejj!4!45G88??88$W]]iPV^e^k^kl!/488'6&7*&DO%-)8&#$6==*<)=
)J& <<gmm\%*\\2DV[b[h[h%itT4-&" 11W<?QTY?YY//..0D0DdhhooFdFdG jj0GHHOOGX5OI!LE((::5k:ZE),,T^^-A-ABM)-)>)>}Z_)>)`ab)c&LL!;<E 	##%5>!!e<<E CBP FEP KJC :9sv   ?C6AA5'A@G7AAAA@(B/AAAA@/,G6AA'AA@
A@@AA@
A@,@'AA@/
A@>@9AAA
AA)rr  rU  rP  r\  rY  rn  rD  r>  rv  rL  rj  rH  ru   rs   rk   rl   rx   ro   rq   rg   ri   rv   r#   )r   ra   r]   NN)NTr   NNNNra   r]   NN)r   r   NNN)NNNNNNNN)r   r   )r   )NN)
r      r   r!  r"  r   NNNN)	r   r]   r   r  r   NNNN)I__name__
__module____qualname____firstlineno____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   r   r   r   rc   r   rC   intr   r-   r   r   boolTensorr   no_grad	Generatordictr   r   r   staticmethodr   r   r  r   r  r  r  r  r  r  r+  r/  r7  r;  propertyr@  rH   r   rM  rQ  rV  r   r]  r   rd  rg  rk  ro  rs  rw  r   EXAMPLE_DOC_STRINGr   r  __static_attributes____classcell__)r8   s   @r)   rN   rN      s   2 ]'=T -18
28
 $8
 *	8

 58
 "$668
 '8
 18
 118
 #T)8
 8
z &'#'&*$(>4d3i>4  #>4 !	>4
 >4 t#>4 {{T!>4F 37,0%&-16:59>B#'&*$(Tld3iTl tCy4/Tl &*	Tl
  #Tl ||d*Tl !&t 3Tl  %||d2Tl ).t(;Tl !Tl Tl t#Tl {{T!Tll ]]_
 ",037,0)) ) 	)
 ) ??T))  S>D0) ell"T)) )` ,0#"'+(,<| u||  PS \a\h\h  , rs		+.	8;	DG	UX	lo			 	  or-2\\HMfk	  nq-2\\HMfk	  6%,, 6ell 6afamam 6 6
 6ELL 6 6chcoco 6 6
 fj,1ELL,@MR___cMc  Y]+.:LORVJ	 ( 
 "&#'  $J	
 Dj 
 ( $' $(&*,0'+-- "- 	-
 - - - {{T!- t#- ??T)- $- 
-b $%#$ $(&*,0'+$$ "$ !	$
 $ $ {{T!$ t#$ ??T)$ $$ 
$N koll5:\\MP]`cg]g	 koll5:\\MP]`cg]g	 $ $ & &   $ $ * * , , % % * * R R H H R R # # & & & &   ]]_12 #'26 #%%)# # #"%-1(,-1/3<@ %&DH'+-1-1596:>B/29=#($(%(;?')  26BF9B#'S_
=d3i_
= tCy4/_
= 	_
=
 _
= _
= _
= !_
= Ud"_
= 9_
= _
= _
= _
=  _
= $dl_
=  !_
=" $dl#_
=$ !&%_
=& *.cT)9'_
=( )_
=*  #+_
=, ??T%//%::TA-_
=. $/_
=0 ||d*1_
=2 ||d*3_
=4  %||d25_
=6 !&t 37_
=8 ).t(;9_
=: e,;_
=< "DK/$6=_
=> !?_
=@ TzA_
=B  #C_
=D $(S>D#8E_
=F "%G_
=H I_
=J K_
=L sCx.4/M_
=N 'Sz4'784?O_
=P -1IQ_
=R !S_
= 3 _
=r+   rN   )   r  g      ?gffffff?)NNNN)r   )=r  r3   typingr   r   numpyr  r   transformersr   r   r   r   	callbacksr
   r   loadersr   r   models.autoencodersr   r   models.transformersr   
schedulersr   utilsr   r   r   utils.torch_utilsr   rv   r   pipeline_utilsr   rX   r   pipeline_outputr   rZ   r   r   torch_xla.core.xla_modelcore	xla_modelr  r  
get_loggerr  r  r  r  r  r*   r   r-   rC   r>   rL   rN   r$   r+   r)   <module>r     s(         l l A ? Q > 9 O O - - . * / 4 ))MM			H	%# R 

 
 	

 
  '+(,"&!%8*t8* %,,%8* Cy4	8*
 K$8*x4P=$&9;N P=r+   