
    
3j{M             
       v   S SK r S SKrS SKJr  S SKJrJr  S SKrS SK	r
S SKrS SKJrJrJr  SSKJrJr  SSKJrJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,J-r-  \" 5       (       a  S SK.J/s  J0r1  Sr2OSr2\Rf                  " \45      r5Sr6\ " S S5      5       r7 S*S\Rp                  S\Rr                  S-  S\:4S jjr;    S+S\<S\<S\=S \=4S! jjr>    S,S"\<S-  S#\:\R~                  -  S-  S$\@\<   S-  S%\@\=   S-  4S& jjrAS-S' jrB " S( S)\&\\5      rCg).    N)	dataclass)AnyCallable)Gemma3ForConditionalGenerationGemmaTokenizerGemmaTokenizerFast   )MultiPipelineCallbacksPipelineCallback)FromSingleFileMixinLTX2LoraLoaderMixin)AutoencoderKLLTX2AudioAutoencoderKLLTX2Video)LTX2VideoTransformer3DModel)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )DiffusionPipeline   )LTX2TextConnectors)LTX2PipelineOutput)LTX2VocoderLTX2VocoderWithBWETFaZ  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTX2ConditionPipeline
        >>> from diffusers.pipelines.ltx2.export_utils import encode_video
        >>> from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
        >>> from diffusers.utils import load_image

        >>> pipe = LTX2ConditionPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
        >>> pipe.enable_model_cpu_offload()

        >>> first_image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
        ... )
        >>> last_image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
        ... )
        >>> first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
        >>> last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
        >>> conditions = [first_cond, last_cond]
        >>> prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings."
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, static"

        >>> frame_rate = 24.0
        >>> video = pipe(
        ...     conditions=conditions,
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=768,
        ...     height=512,
        ...     num_frames=121,
        ...     frame_rate=frame_rate,
        ...     num_inference_steps=40,
        ...     guidance_scale=4.0,
        ...     output_type="np",
        ...     return_dict=False,
        ... )
        >>> video = (video * 255).round().astype("uint8")
        >>> video = torch.from_numpy(video)

        >>> encode_video(
        ...     video[0],
        ...     fps=frame_rate,
        ...     audio=audio[0].float().cpu(),
        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
        ...     output_path="video.mp4",
        ... )
        ```
c                       \ rS rSr% Sr\R                  R                  \\R                  R                     -  \R                  -  \
R                  -  \S'   Sr\\S'   Sr\\S'   Srg	)
LTX2VideoConditiond   av  
Defines a single frame-conditioning item for LTX-2 Video - a single frame or a sequence of frames.

Attributes:
    frames (`PIL.Image.Image` or `List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
        The image (or video) to condition the video on. Accepts any type that can be handled by
        VideoProcessor.preprocess_video.
    index (`int`, defaults to `0`):
        The index at which the image or video will conditionally affect the video generation.
    strength (`float`, defaults to `1.0`):
        The strength of the conditioning effect. A value of `1.0` means the conditioning effect is fully applied.
framesr   index      ?strength N)__name__
__module____qualname____firstlineno____doc__PILImagelistnpndarraytorchTensor__annotations__r"   intr$   float__static_attributes__r%       j/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/pipelines/ltx2/pipeline_ltx2_condition.pyr   r   d   sN     IIOOd399??33bjj@5<<OOE3NHer6   r   encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrr<   r=   moder?   AttributeError)r8   r9   r:   s      r7   retrieve_latentsrC   y   s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSSr6   base_seq_lenmax_seq_len
base_shift	max_shiftc                 4    XC-
  X!-
  -  nX5U-  -
  nX-  U-   nU$ Nr%   )image_seq_lenrD   rE   rF   rG   mbmus           r7   calculate_shiftrN      s3     
	K$>?A%%A		Q	BIr6   num_inference_stepsdevice	timestepssigmasc                    Ub  Ub  [        S5      eUb  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
X2S.UD6  U R                  n[        U5      nX14$ Ub  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
XBS.UD6  U R                  n[        U5      nX14$ U R                  " U4S	U0UD6  U R                  nX14$ )a  
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

Args:
    scheduler (`SchedulerMixin`):
        The scheduler to get timesteps from.
    num_inference_steps (`int`):
        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
        must be `None`.
    device (`str` or `torch.device`, *optional*):
        The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
    timesteps (`list[int]`, *optional*):
        Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
        `num_inference_steps` and `sigmas` must be `None`.
    sigmas (`list[float]`, *optional*):
        Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
        `num_inference_steps` and `timesteps` must be `None`.

Returns:
    `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
    second element is the number of inference steps.
zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesrQ   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)rQ   rP   rR   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)rR   rP   rP   r%   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__rQ   len)	schedulerrO   rP   rQ   rR   kwargsaccepts_timestepsaccept_sigmass           r7   retrieve_timestepsra      s}   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	M)MfM''	!)n )) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	GvGG''	!)n )) 	 3MFMfM''	))r6   c                     UR                  [        [        SUR                  5      5      SS9nU R                  [        [        SU R                  5      5      SS9nXU-  -  nX%-  SU-
  U -  -   n U $ )a  
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891).

Args:
    noise_cfg (`torch.Tensor`):
        The predicted noise tensor for the guided diffusion process.
    noise_pred_text (`torch.Tensor`):
        The predicted noise tensor for the text-guided diffusion process.
    guidance_rescale (`float`, *optional*, defaults to 0.0):
        A rescale factor applied to the noise predictions.

Returns:
    noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
r   T)dimkeepdim)stdr-   rangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaleds         r7   rescale_noise_cfgrn      s{    " ""tE!_5I5I,J'KUY"ZHmmU1inn%= >mMG#''9: 6!>N:NR[9[[Ir6   c            M       ^  ^  \ rS rSrSrSr/ r/ SQrS\S\	S\
S\S	\\-  S
\S\S\\-  4U 4S jjr     SS\\\   -  S\S\S\S\R.                  S-  S\R0                  S-  4S jjr           SS\\\   -  S\\\   -  S-  S\S\S\R6                  S-  S\R6                  S-  S\R6                  S-  S\R6                  S-  S\S\S\R.                  S-  S\R0                  S-  4S jjr          SS  jr\SS!\R6                  S"\S#\S$\R6                  4S% jj5       r\ SS!\R6                  S&\S'\S(\S"\S#\S$\R6                  4S) jj5       r \ SS!\R6                  S+\R6                  S,\R6                  S-\!S$\R6                  4
S. jj5       r"\ SS!\R6                  S+\R6                  S,\R6                  S-\!S$\R6                  4
S/ jj5       r#\ SS!\R6                  S0\!\R6                  -  S1\RH                  S-  4S2 jj5       r%\ SS!\R6                  S"\S-  S#\S-  S$\R6                  4S3 jj5       r&\  SS!\R6                  S4\S5\S"\S-  S#\S-  S$\R6                  4S6 jj5       r'\S!\R6                  S+\R6                  S,\R6                  4S7 j5       r(\S!\R6                  S+\R6                  S,\R6                  4S8 j5       r)S9\S:\S;\S$\4S< jr*     SS@\+\\+   -  S-  S'\S(\S&\S\R.                  S-  S$\,\\R6                     \\!   \\   4   4SA jjr-S!\R6                  SB\R6                  SC\\R6                     SD\\!   SE\\   SF\SG\S$\,\R6                  \R6                  \R6                  4   4SH jr.           SS@\+\\+   -  S-  SI\SJ\S'\S(\S&\S0\!S\R0                  S-  S\R.                  S-  S1\RH                  S-  S!\R6                  S-  S$\,\R6                  \R6                  \R6                  4   4SK jjr/         SSI\SJ\SM\S5\S0\!S\R0                  S-  S\R.                  S-  S1\RH                  S-  S!\R6                  S-  S$\R6                  4SN jjr0 SSO\R6                  SP\R6                  SQ\S\1S-  S$\R6                  4
SR jjr2 SSO\R6                  SP\R6                  SQ\S\1S-  S$\R6                  4
SS jjr3\4ST 5       r5\4SU 5       r6\4SV 5       r7\4SW 5       r8\4SX 5       r9\4SY 5       r:\4SZ 5       r;\4S[ 5       r<\4S\ 5       r=\4S] 5       r>\4S^ 5       r?\4S_ 5       r@\4S` 5       rA\4Sa 5       rB\4Sb 5       rC\R                  " 5       \E" \F5      SSSS=S>S?ScSdSSSeSLS*SLSSSSSSSSSSSSSSSLSSfSgSSSS!/S4%S@\+\\+   -  S-  S\\\   -  S\\\   -  S-  S'\S(\S&\Sh\!Si\Sj\\!   S-  Sk\\!   S-  Sl\!Sm\!Sn\!So\!Sp\!S-  Sq\!S-  Sr\!S-  Ss\!S-  St\\   S-  S0\!S-  S\S-  S1\RH                  \\RH                     -  S-  S!\R6                  S-  Su\R6                  S-  S\R6                  S-  S\R6                  S-  S\R6                  S-  S\R6                  S-  Sv\!\\!   -  Sw\!\\!   -  S-  Sx\Sy\Sz\S{\G\\14   S-  S|\H\\/S4   S-  S}\\   S\4JS~ jj5       5       rISrJU =rK$ )LTX2ConditionPipeline   z
Pipeline for video generation which allows image conditions to be inserted at arbitary parts of the video.

Reference: https://github.com/Lightricks/LTX-Video

TODO
z>text_encoder->connectors->transformer->vae->audio_vae->vocoder)r?   prompt_embedsnegative_prompt_embedsr]   vae	audio_vaetext_encoder	tokenizer
connectorstransformervocoderc	                   > [         T	U ]  5         U R                  UUUUUUUUS9  [        U SS 5      b  U R                  R
                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U SS 5      b   U R                  R                  R                  OSU l        [        U S5      b   U R                  R                  R"                  OSU l        [        U SS 5      b   U R                  R                  R&                  OS	U l        [        U SS 5      b   U R                  R                  R*                  OS
U l        [/        U R                  SS9U l        [        U SS 5      b  U R2                  R4                  U l        g SU l        g )N)rt   ru   rv   rw   rx   ry   rz   r]   rt          ru      ry   r   i>     bilinear)vae_scale_factorresamplerw      )super__init__register_modulesgetattrrt   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratioru   mel_compression_ratioaudio_vae_mel_compression_ratio$audio_vae_temporal_compression_ratiory   config
patch_sizetransformer_spatial_patch_sizepatch_size_ttransformer_temporal_patch_sizesample_rateaudio_sampling_ratemel_hop_lengthaudio_hop_lengthr   video_processorrw   model_max_lengthtokenizer_max_length)
selfr]   rt   ru   rv   rw   rx   ry   rz   r[   s
            r7   r   LTX2ConditionPipeline.__init__   s    	%!# 	 		
 3:$t2L2XDHH..^` 	* 4;43M3YDHH//_` 	+ 5<D+t4T4`DNN00fg 	, :A{TX9Y9eDNN55kl 	1 3:$t2T2`D##..fg 	+ 5<D-4P4\D##00bc 	,
 29{D1Q1]DNN!!--ch 	  5<D+t4T4`DNN!!00fi 	  .t?a?alvw/6t[$/O/[DNN++ 	!ae 	!r6   r   r   Npromptnum_videos_per_promptmax_sequence_lengthscale_factorrP   dtypec           	         U=(       d    U R                   nU=(       d    U R                  R                  n[        U[        5      (       a  U/OUn[        U5      n[        U SS5      bM  SU R                  l        U R                  R                  c%  U R                  R                  U R                  l	        U Vs/ s H  oR                  5       PM     nnU R                  USUSSSS9n	U	R                  n
U	R                  nU
R                  U5      n
UR                  U5      nU R                  XSS9nUR                  n[         R"                  " US	S
9nUR%                  SS5      R                  US9nUR&                  u  nnnUR)                  SUS5      nUR+                  Xr-  US	5      nUR+                  US	5      nUR)                  US5      nX4$ s  snf )a  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `list[str]`, *optional*):
        prompt to be encoded
    device: (`str` or `torch.device`):
        torch device to place the resulting embeddings on
    dtype: (`torch.dtype`):
        torch dtype to cast the prompt embeds to
    max_sequence_length (`int`, defaults to 1024): Maximum sequence length to use for the prompt.
rw   Nleft
max_lengthTpt)paddingr   
truncationadd_special_tokensreturn_tensors)	input_idsattention_maskoutput_hidden_statesrc   r   r	   r   r   )_execution_devicerv   r   
isinstancestrr\   r   rw   padding_side	pad_token	eos_tokenstripr   r   tohidden_statesr0   stackflattenshaperepeatview)r   r   r   r   r   rP   r   
batch_sizeptext_inputstext_input_idsprompt_attention_masktext_encoder_outputstext_encoder_hidden_statesrr   _seq_lens                    r7   _get_gemma_prompt_embeds.LTX2ConditionPipeline._get_gemma_prompt_embeds0  s   * 14110**00'44&&[
4d+7*0DNN'~~''/+/>>+C+C(%+,V'')V,nn *# % 
 %.. + : :'**62 5 8 8 @#00$ae  1  
 &:%G%G"%*[[1KQS%T"2::1a@CC%CP &++7A%,,Q0EqI%**:+MwXZ[ 5 : ::r J 5 < <=RTU V33; -s   9GTnegative_promptdo_classifier_free_guidancerr   rs   r   negative_prompt_attention_maskc                 @   U=(       d    U R                   n[        U[        5      (       a  U/OUnUb  [        U5      nOUR                  S   nUc  U R                  UUU	U
UUS9u  pWU(       a  Uc  U=(       d    Sn[        U[        5      (       a  X/-  OUnUb;  [        U5      [        U5      La$  [        S[        U5       S[        U5       S35      eU[        U5      :w  a!  [        SU S[        U5       S	U SU S
3	5      eU R                  UUU	U
UUS9u  phXWXh4$ )ab  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `list[str]`, *optional*):
        prompt to be encoded
    negative_prompt (`str` or `list[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
        Whether to use classifier free guidance or not.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    device: (`torch.device`, *optional*):
        torch device
    dtype: (`torch.dtype`, *optional*):
        torch dtype
r   )r   r   r   r   rP   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	r   r   r   r\   r   r   type	TypeErrorrT   )r   r   r   r   r   rr   rs   r   r   r   r   rP   r   r   s                 r7   encode_prompt#LTX2ConditionPipeline.encode_promptq  s   R 1411'44&&VJ&,,Q/J 373P3P&;$7) 4Q 40M '+A+I-3O@J?\_@`@`j+<<fuO!d6l$:O&OUVZ[jVkUl mV~Q(  s?33 )/)::J3K_J` ax/
| <33  FJEbEb&&;$7) Fc FB" 5Kkkr6   c           
      $  ^  US-  S:w  d	  US-  S:w  a  [        SU SU S35      eUbW  [        U 4S jU 5       5      (       d=  [        ST R                   SU Vs/ s H  oT R                  ;  d  M  UPM     sn 35      eUb  Ub  [        S	U S
U S35      eUc  Uc  [        S5      eUbA  [        U[        5      (       d,  [        U[
        5      (       d  [        S[        U5       35      eUb  Uc  [        S5      eUb  Uc  [        S5      eUb  Ub  UR                  UR                  :w  a&  [        SUR                   SUR                   S35      eUR                  UR                  :w  a&  [        SUR                   SUR                   S35      eU	b)  U	R                  S:w  a  [        SU	R                   S35      eU
b)  U
R                  S:w  a  [        SU	R                   S35      eUS:  d  US:  a  U(       d  [        S5      eg g s  snf )Nr|   r   z8`height` and `width` have to be divisible by 32 but are z and r   c              3   @   >#    U  H  oTR                   ;   v   M     g 7frI   )_callback_tensor_inputs).0kr   s     r7   	<genexpr>5LTX2ConditionPipeline.check_inputs.<locals>.<genexpr>  s      F
7Y!---7Ys   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is zEMust provide `prompt_attention_mask` when specifying `prompt_embeds`.zWMust provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but got: `prompt_attention_mask` z% != `negative_prompt_attention_mask`    zOnly unpacked (5D) video latents of shape `[batch_size, latent_channels, latent_frames, latent_height, latent_width] are supported, but got ze dims. If you have packed (3D) latents, please unpack them (e.g. using the `_unpack_latents` method).r~   zuOnly unpacked (4D) audio latents of shape `[batch_size, num_channels, audio_length, mel_bins] are supported, but got zk dims. If you have packed (3D) latents, please unpack them (e.g. using the `_unpack_audio_latents` method).        zSpatio-Temporal Guidance (STG) is specified but no STG blocks are supplied. Please supply a list ofblock indices at which to apply STG in `spatio_temporal_guidance_blocks`)	rT   allr   r   r   r-   r   r   rg   )r   r   heightwidth"callback_on_step_end_tensor_inputsrr   rs   r   r   r?   audio_latentsspatio_temporal_guidance_blocks	stg_scaleaudio_stg_scaler   s   `              r7   check_inputs"LTX2ConditionPipeline.check_inputs  s     B;!urzQWX^W__dejdkklmnn-9# F
7YF
 C
 C
 DTEaEaDbbn  |^  pH  |^vw  ko  kG  kG  bGpq  |^  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@)@TZ\`IaIaQRVW]R^Q_`aa$)>)Fdee!-2P2Xvww$)?)K""&<&B&BB --:-@-@,A B.445Q8 
 %**.L.R.RR 55J5P5P4Q R6<<=Q@  7<<1#4HHO~ VZ[ 
 $););q)@''.||n 5>?  _/C"7Ba[  Cb"7a pHs   H1Hr?   r   r   returnc           
          U R                   u  p4pVnXR-  nXa-  n	Xq-  n
U R                  USUUU	UU
U5      n U R                  SSSSSSSS	5      R                  SS	5      R                  SS5      n U $ )
Nr   r   r   r~      r   r	   r      )r   reshapepermuter   )r?   r   r   r   num_channels
num_framesr   r   post_patch_num_framespost_patch_heightpost_patch_widths              r7   _pack_latents#LTX2ConditionPipeline._pack_latents  s     ?Fmm;
*e * :"0 .//!	
 //!Q1aAq9AA!QGOOPQSTUr6   r   r   r   c           
          U R                  S5      nU R                  XaX#SXTU5      n U R                  SSSSSSSS	5      R                  SS	5      R                  SS5      R                  SS5      n U $ )
Nr   r   r~   r   r   r   r   r	   r   )sizer   r   r   )r?   r   r   r   r   r   r   s          r7   _unpack_latents%LTX2ConditionPipeline._unpack_latents,  st     \\!_
//*&\gqr//!Q1aAq9AA!QGOOPQSTU]]^_abcr6   r#   latents_meanlatents_stdscaling_factorc                     UR                  SSSSS5      R                  U R                  U R                  5      nUR                  SSSSS5      R                  U R                  U R                  5      nX-
  U-  U-  n U $ Nr   r   r   r   rP   r   r?   r   r   r   s       r7   _normalize_latents(LTX2ConditionPipeline._normalize_latents9  su     $((B1a8;;GNNGMMZ!&&q"aA699'..'--X)^;kIr6   c                     UR                  SSSSS5      R                  U R                  U R                  5      nUR                  SSSSS5      R                  U R                  U R                  5      nX-  U-  U-   n U $ r   r   r   s       r7   _denormalize_latents*LTX2ConditionPipeline._denormalize_latentsD  su     $((B1a8;;GNNGMMZ!&&q"aA699'..'--X'.8<Gr6   noise_scaler9   c                 r    [        U R                  X R                  U R                  S9nX-  SU-
  U -  -   nU$ )Nr9   rP   r   r   )r   r   rP   r   )r?   r  r9   noisenoised_latentss        r7   _create_noised_state*LTX2ConditionPipeline._create_noised_stateO  s=    
 W]]i^e^k^kl$,K7/JJr6   c                    Ubf  Ubc  U R                   u  p4pVXR-  nXa-  nU R                  USXrX5      n U R                  SSSSSS5      R                  SS5      R                  SS5      n U $ U R	                  SS5      R                  SS5      n U $ )Nr   r   r   r~   r   r	   r   )r   r   r   r   	transpose)	r?   r   r   r   r   latent_lengthlatent_mel_binspost_patch_latent_lengthpost_patch_mel_binss	            r7   _pack_audio_latents)LTX2ConditionPipeline._pack_audio_latentsX  s     !l&> HO}}DJm'4'C$"1">ooB 8H[G ooaAq!Q7??1EMMaQRSG
  ''1-55a;Gr6   r  num_mel_binsc                    Ub`  Ub]  U R                  S5      nU R                  XQUSXC5      n U R                  SSSSSS5      R                  SS5      R                  SS5      n U $ U R	                  SSU45      R                  SS5      n U $ )Nr   r   r	   r   r~   r   r   )r   r   r   r   	unflattenr  )r?   r  r  r   r   r   s         r7   _unpack_audio_latents+LTX2ConditionPipeline._unpack_audio_latentsn  s     !l&> aJoojrS_lGooaAq!Q7??1EMMaQRSG  ''B+=>HHANGr6   c                     UR                  U R                  U R                  5      nUR                  U R                  U R                  5      nX-
  U-  $ rI   r   rP   r   r?   r   r   s      r7   _normalize_audio_latents.LTX2ConditionPipeline._normalize_audio_latents  sB     $w~~w}}E!nnW^^W]]C&+55r6   c                     UR                  U R                  U R                  5      nUR                  U R                  U R                  5      nX-  U-   $ rI   r  r  s      r7   _denormalize_audio_latents0LTX2ConditionPipeline._denormalize_audio_latents  sB     $w~~w}}E!nnW^^W]]C%55r6   start_framesequence_num_framestarget_num_framesc                 V    U R                   n[        X#U-
  5      nUS-
  U-  U-  S-   nU$ )aY  
Trim a conditioning sequence to the allowed number of frames.

Args:
    start_frame (int): The target frame number of the first frame in the sequence.
    sequence_num_frames (int): The number of frames in the sequence.
    target_num_frames (int): The target number of frames in the generated video.
Returns:
    int: updated sequence length
r   )r   min)r   r  r   r!  r   r   s         r7   trim_conditioning_sequence0LTX2ConditionPipeline.trim_conditioning_sequence  s=     ::,+.MN
 1n5DqH
r6         y   
conditionsc           
         / / / pnUc  / n[        U[        5      (       a  U/nU R                  n	US-
  U	-  S-   n
[        U5       GH  u  p[        UR                  [
        R                  R                  5      (       a  UR                  /nO[        UR                  [        R                  5      (       a:  UR                  R                  S:X  a   [        R                  " UR                  SS9nOk[        UR                  [        R                  5      (       a6  UR                  R                  S:X  a  UR                  R                  S5      nOUR                  nU R                  R                  XUSS9nUR                   nUS:  a  X-  nX:  a"  ["        R%                  SU S	U S
U
 S35        GMm  UR'                  S5      n[)        US-
  U	-  S-   S5      nU R+                  UUU5      nUSS2SS2SU24   nUR-                  UR/                  U R0                  R2                  US95        UR-                  UR4                  5        UR-                  U5        GM     XgU4$ )aX  
Preprocesses the condition images/videos to torch tensors.

Args:
    conditions (`LTX2VideoCondition` or `List[LTX2VideoCondition]`, *optional*, defaults to `None`):
        A list of image/video condition instances.
    height (`int`, *optional*, defaults to `512`):
        The desired height in pixels.
    width (`int`, *optional*, defaults to `768`):
        The desired width in pixels.
    num_frames (`int`, *optional*, defaults to `121`):
        The desired number of frames in the generated video.
    device (`torch.device`, *optional*, defaults to `None`):
        The device on which to put the preprocessed image/video tensors.

Returns:
    `Tuple[List[torch.Tensor], List[float], List[int]]`:
        Returns a 3-tuple of lists of length `len(conditions)` as follows:
            1. The first list is a list of preprocessed video tensors of shape [batch_size=1, num_channels,
               num_frames, height, width].
            2. The second list is a list of conditioning strengths.
            3. The third list is a list of indices in latent space to insert the corresponding condition.
Nr   r	   r   )axiscrop)resize_modezThe starting latent index z of condition z6 is too big for the specified number of latent frames z!. This condition will be skipped.r   )r   rP   )r   r   r   	enumerater!   r+   r,   r.   r/   rg   expand_dimsr0   r1   	unsqueezer   preprocess_videor"   loggerwarningr   maxr$  appendr   rt   r   r$   )r   r)  r   r   r   rP   conditioning_framesconditioning_strengthsconditioning_indicesframe_scale_factorlatent_num_framesi	conditionvideo_like_condcondition_pixelslatent_start_idxcond_num_frames	start_idxtruncated_cond_framess                      r7   preprocess_conditions+LTX2ConditionPipeline.preprocess_conditions  sT   > MOPRTV5IJj"455$J!@@'!^0BBQF%j1LA)**CIIOO<<#,#3#3"4I,,bjj99i>N>N>S>SWX>X"$..1A1A"JI,,ell;;	@P@P@U@UYZ@Z"+"2"2"<"<Q"? #,"2"2#33DDF  E  
  )!##3#G 401A0B.QRPS T)):(;;\^ .33A6O-15GG!KQOI$($C$CI`j$k!/16L7L6L0LM&&'7':':X^':'_`")))*<*<= ''(89M 2P #<PPPr6   conditioning_maskcondition_latentscondition_strengthscondition_indiceslatent_heightlatent_widthc                     [         R                  " U5      n[        X4U5       H=  u  pnU	R                  S5      nX-  U-  nX-   nXSS2X24'   XSS2X24'   XSS2X24'   M?     XU4$ )a  
Applies visual conditioning frames to an initial latent.

Args:
    latents (`torch.Tensor`):
        Initial packed (patchified) latents of shape [batch_size, patch_seq_len, hidden_dim].
    conditioning_mask (`torch.Tensor`, *optional*):
        Initial packed (patchified) conditioning mask of shape [batch_size, patch_seq_len, 1] with values in
        [0, 1] where 0 means that the denoising model output will be fully used and 1 means that the condition
        will be fully used (with intermediate values specifying a blend of the denoised and latent values).

Returns:
    `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]`:
        Returns a 3-tuple of tensors where:
            1. The first element is the packed video latents (with unchanged shape [batch_size, patch_seq_len,
               hidden_dim]) with the conditions applied
            2. The second element is the packed conditioning mask with conditioning strengths applied
            3. The third element holds the clean conditioning latents.
r   N)r0   
zeros_likezipr   )r   r?   rE  rF  rG  rH  rI  rJ  clean_latentscondr$   
latent_idxnum_cond_tokensstart_token_idxend_token_idxs                  r7   apply_visual_conditioning/LTX2ConditionPipeline.apply_visual_conditioning  s    < ((1*-.?Vg*h&DJ"iilO(8<GO+=M 9=A445BJa!>>?>B!_::; +i =88r6   r   num_channels_latentsc           
         X@R                   -  nXPR                   -  nUS-
  U R                  -  S-   nX#XU4nUSXU4nUbZ  U R                  XR                  R                  U R                  R
                  U R                  R                  R                  5      nO[        R                  " XUS9nUR                  U5      nU R                  XR                  U R                  5      nU R                  UU R                  U R                  5      nUR                  S:w  d   UR                  S S UR                  S S :w  a-  [!        SUR                   SUR                  S S U4-    S35      e[#        U
[$        5      (       a1  [&        R)                  U R*                  R,                   S35        U
S	   n
U R/                  XXVU	S
9u  nnn/ nU H  n[1        U R                  R3                  U5      U
SS9nU R                  UU R                  R                  U R                  R
                  5      R5                  XS9nU R                  UU R                  U R                  5      nUR7                  U5        M     U R9                  UUUUUUUS9u  nnn[;        UR                  XR<                  UR>                  S9nSU-
  U-  nUU-  USU-
  -  -   nUUU4$ )Nr   rP   r   r	   r   z$Provided `latents` tensor has shape z, but the expected shape is r   z~ does not support using a list of generators. The first generator in the list will be used for all (pseudo-)random operations.r   )rP   r>   )r9   r:   )rI  rJ  r  r#   ) r   r   r   rt   r   r   r   r   r0   zeros	new_zerosr   r   r   rg   r   rT   r   r-   r2  r3  r[   r&   rC  rC   encoder   r5  rT  r   rP   r   )r   r)  r   rV  r   r   r   r  r   rP   r9   r?   rI  rJ  r:  r   
mask_shaperE  condition_framesrG  rH  rF  condition_tensorcondition_latentrN  r  scaled_masks                              r7   prepare_latents%LTX2ConditionPipeline.prepare_latents  s!    "D"DD B BB'!^0S0SSVWW3DUab !%6|T
--..0D0DdhhooFdFdG kk%eDG#--j9$$88$:^:^
 !..tBBDDhDh
 <<1bq 15F5L5LRa5P P6w}}oEabsbybyz|{|b}  BV  AX  cX  bY  YZ  [  i&&NN>>**+ ,I J "!ICGC]C]& D^ D
@-/@  0/ 01YT\   $66 $(("7"79M9M bb,   $11 $"E"EtGkGk  $$%56 !1 594R4R'% 5S 5
1"M W]]i^e^k^kl..+=+%1{?(CC)=88r6   r   audio_latent_lengthc
                    U	bl  U R                  U	5      n	U R                  XR                  R                  U R                  R                  5      n	U R                  XU5      n	U	R                  XvS9$ X@R                  -  n
XX:4n[        U[        5      (       a*  [        U5      U:w  a  [        S[        U5       SU S35      e[        XXvS9n	U R                  U	5      n	U	$ )NrX  z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.r  )r  r  ru   r   r   r  r   r   r   r-   r\   rT   r   )r   r   rV  rc  r  r  r   rP   r9   r?   r  r   s               r7   prepare_audio_latents+LTX2ConditionPipeline.prepare_audio_latentsv  s     ..w7G33G^^=X=XZ^ZhZhZtZtuG//iPG::V:99&*N*NN3FXi&&3y>Z+GA#i.AQ R&<'gi 
 u&V**73r6   r=   denoised_outputstep_idxc                 L    Uc  U R                   nXUR                  U   -  -
  nU$ rI   r]   rR   )r   r=   rg  rh  r]   	sample_x0s         r7   convert_velocity_to_x0,LTX2ConditionPipeline.convert_velocity_to_x0  s1     Iy/?/?/III	r6   c                 L    Uc  U R                   nX-
  UR                  U   -  nU$ rI   rj  )r   r=   rg  rh  r]   sample_vs         r7   convert_x0_to_velocity,LTX2ConditionPipeline.convert_x0_to_velocity  s0     I,	0@0@0JJr6   c                     U R                   $ rI   )_guidance_scaler   s    r7   guidance_scale$LTX2ConditionPipeline.guidance_scale      ###r6   c                     U R                   $ rI   )_guidance_rescalert  s    r7   rj   &LTX2ConditionPipeline.guidance_rescale      %%%r6   c                     U R                   $ rI   )
_stg_scalert  s    r7   r   LTX2ConditionPipeline.stg_scale      r6   c                     U R                   $ rI   )_modality_scalert  s    r7   modality_scale$LTX2ConditionPipeline.modality_scale  rw  r6   c                     U R                   $ rI   )_audio_guidance_scalert  s    r7   audio_guidance_scale*LTX2ConditionPipeline.audio_guidance_scale      )))r6   c                     U R                   $ rI   )_audio_guidance_rescalert  s    r7   audio_guidance_rescale,LTX2ConditionPipeline.audio_guidance_rescale  s    +++r6   c                     U R                   $ rI   )_audio_stg_scalert  s    r7   r   %LTX2ConditionPipeline.audio_stg_scale  s    $$$r6   c                     U R                   $ rI   )_audio_modality_scalert  s    r7   audio_modality_scale*LTX2ConditionPipeline.audio_modality_scale  r  r6   c                 L    U R                   S:  =(       d    U R                  S:  $ Nr#   )rs  r  rt  s    r7   r   1LTX2ConditionPipeline.do_classifier_free_guidance  #    $$s*Q0J0JS0PQr6   c                 L    U R                   S:  =(       d    U R                  S:  $ )Nr   )r}  r  rt  s    r7   do_spatio_temporal_guidance1LTX2ConditionPipeline.do_spatio_temporal_guidance  s!    #%G4+@+@3+FGr6   c                 L    U R                   S:  =(       d    U R                  S:  $ r  )r  r  rt  s    r7   do_modality_isolation_guidance4LTX2ConditionPipeline.do_modality_isolation_guidance  r  r6   c                     U R                   $ rI   )_num_timestepsrt  s    r7   num_timesteps#LTX2ConditionPipeline.num_timesteps  s    """r6   c                     U R                   $ rI   )_current_timesteprt  s    r7   current_timestep&LTX2ConditionPipeline.current_timestep  r{  r6   c                     U R                   $ rI   )_attention_kwargsrt  s    r7   attention_kwargs&LTX2ConditionPipeline.attention_kwargs  r{  r6   c                     U R                   $ rI   )
_interruptrt  s    r7   	interruptLTX2ConditionPipeline.interrupt  r  r6   g      8@(   g      @Fpil
frame_raterO   rR   rQ   ru  r   r  rj   r  r   r  r  r   r   decode_timestepdecode_noise_scaleuse_cross_timestepoutput_typereturn_dictr  callback_on_step_endr   c&                     [        U#[        [        45      (       a  U#R                  n$U=(       d    UnU=(       d    UnU=(       d    UnU=(       d    UnU R	                  UUUU$UUUUUUUUUS9  Xl        Xl        Xl        Xl        Xl	        UU l
        UU l        UU l        U"U l        SU l        SU l        Ub  [        U[         5      (       a  Sn&O3Ub!  [        U["        5      (       a  [%        U5      n&OUR&                  S   n&Ub  [        U["        5      (       d  U/nUc
  U	b  U	S   OSnU R(                  n'U R+                  UUU R,                  UUUUUU%U'S9
u  nnnnU R,                  (       a.  [.        R0                  " UU/SS9n[.        R0                  " UU/SS9nS	n([3        U S
S5      b  [3        U R4                  SS	5      n(U R7                  UUU(S9u  n)n*n+US-
  U R8                  -  S-   n,X@R:                  -  n-XPR:                  -  n.Ub'  [<        R?                  S5        UR&                  u    n/n,n-n.U R@                  RB                  RD                  n0U RG                  UU&U-  U0UUUU[.        RH                  U'UU5      u  nn1n2U R,                  (       a  [.        R0                  " U1U1/5      n1Xg-  n3U RJ                  U RL                  -  [O        U RP                  5      -  n4[S        U3U4-  5      n5Ub&  [<        R?                  S5        UR&                  u    n/n5n/[3        U SS5      b   U RT                  RB                  RV                  OSn6U6U RX                  -  n7[3        U SS5      b   U RT                  RB                  RZ                  OSn8U R]                  U&U-  U8U5U6U[.        RH                  U'UUS9	nU	c  [^        R`                  " SSU-  U5      OU	n	[c        U Rd                  RB                  Rg                  SS5      U Rd                  RB                  Rg                  SS5      U Rd                  RB                  Rg                  SS5      U Rd                  RB                  Rg                  SS5      U Rd                  RB                  Rg                  SS5      5      n9[h        Rj                  " U Rd                  5      n:[m        U:UU'U
U	U9S9u    n/[m        U Rd                  UU'U
U	U9S9u  p[o        [%        U
5      XRd                  Rp                  -  -
  S5      n;[%        U
5      U l9        U R@                  Rt                  Rw                  UR&                  S   U,U-U.URx                  US9n<U R@                  Rz                  R}                  UR&                  S   U5URx                  5      n=U R,                  (       aH  U<R                  SSU<R                  S-
  -  -   5      n<U=R                  SSU=R                  S-
  -  -   5      n=U R                  US9 n>[        U
5       GH  u  n?n@U R                  (       a  M  W@U l        U R,                  (       a  [.        R0                  " U/S -  5      OUnAUAR                  UR                  5      nAU R,                  (       a  [.        R0                  " U/S -  5      OUnBUBR                  UR                  5      nBW@R                  WAR&                  S   5      nCUCR                  S!5      SU1R                  S!5      -
  -  nDU R@                  R                  S"5         U RA                  UAUBU)U*UDUCUCU+U+U,U-U.UU5U<U=SSSUU"SS#9u  nEnFSSS5        WERO                  5       nEWFRO                  5       nFU R,                  (       Gad  WER                  S 5      u  nGnEU R                  UUEU?U Rd                  5      nEU R                  UUGU?U Rd                  5      nGU R                  S-
  UEUG-
  -  nHWFR                  S 5      u  nInFU R                  UUFU?U:5      nFU R                  UUIU?U:5      nIU R                  S-
  UFUI-
  -  nJU R                  (       d  U R                  (       a  U?S:X  a_  U)R                  S SS9S   nKU*R                  S SS9S   nLU+R                  S SS9S   nMU<R                  S SS9S   nNU=R                  S SS9S   nOWCR                  S SS9S   nCWDR                  S SS9S   nDO@S=nHnJU)nKU*nLU+nMU<nNU=nOU R                  UWEU?U Rd                  5      nEU R                  UWFU?U:5      nFU R                  (       a  U R@                  R                  S$5         U RA                  UR                  UR                  S%9UR                  UR                  S%9WKWLWDWCUCWMUMU,U-U.UU5WNWOSUSUU"SS#9u  nPnQSSS5        WPRO                  5       nPWQRO                  5       nQU R                  UUPU?U Rd                  5      nPU R                  UUQU?U:5      nQU R                  WEUP-
  -  nRU R                  WFUQ-
  -  nSOS=nRnSU R                  (       a  U R@                  R                  S&5         U RA                  UR                  UR                  S%9UR                  UR                  S%9WKWLWDWCUCWMUMU,U-U.UU5WNWOS'SSUU"SS#9u  nTnUSSS5        WTRO                  5       nTWURO                  5       nUU R                  UUTU?U Rd                  5      nTU R                  UUUU?U:5      nUU R                  S-
  WEUT-
  -  nVU R                  S-
  WFUU-
  -  nWOS=nVnWWEWH-   WR-   WV-   nXWFWJ-   WS-   WW-   nYU R                  S:  a  [        WXWEU R                  S(9nEOWXnEU R                  S:  a  [        WYWFU R                  S(9nFOWYnFWER                  S5      nZUESU1SUZ -
  -  U2RO                  5       U1SUZ -  -   R                  UER                  5      n[U R                  UU[U?U Rd                  5      nEU R                  UWFU?U:5      nFU Rd                  R                  UEW@USS)9S   nU:R                  UFU@USS)9S   nU#bJ  0 n\U$ H  n][        5       U]   W\U]'   M     U#" U U?W@W\5      n^U^R                  S*U5      nU^R                  S+U5      nU?[%        U
5      S-
  :X  d)  U?S-   U;:  a0  U?S-   U Rd                  Rp                  -  S:X  a  U>R                  5         [        (       d  GM  [        R                  " 5         GM     SSS5        U R                  UU,U-U.U R                  U R                  5      nU R                  UU RT                  R                  U RT                  R                  5      nU R                  UU5U7S,9nU S-:X  a`  U R                  UU R                  R                  U R                  R                  U R                  RB                  R                  5      nUn_Un`GOUR                  UR                  5      nU R                  RB                  R                  (       d  SnCO[        UR&                  UU'UR                  S.9na[        U["        5      (       d  U/U&-  nUc  UnO[        U["        5      (       d  U/U&-  n[.        R                  " UU'UR                  S/9nC[.        R                  " UU'UR                  S/9SS2SSSS4   nSU-
  U-  UWa-  -   nU R                  UU R                  R                  U R                  R                  U R                  RB                  R                  5      nUR                  U R                  R                  5      nU R                  R                  UWCSS)9S   n_U R                  R                  U_U S09n_UR                  U RT                  R                  5      nU RT                  R                  USS)9S   nbU R                  Ub5      n`U R                  5         U!(       d  W_W`4$ [        W_W`S19$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GNG= f)2u'  
Function invoked when calling the pipeline for generation.

Args:
    conditions (`List[LTXVideoCondition], *optional*`):
        The list of frame-conditioning items for the video generation.
    prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
        instead.
    height (`int`, *optional*, defaults to `512`):
        The height in pixels of the generated image. This is set to 480 by default for the best results.
    width (`int`, *optional*, defaults to `768`):
        The width in pixels of the generated image. This is set to 848 by default for the best results.
    num_frames (`int`, *optional*, defaults to `121`):
        The number of video frames to generate
    frame_rate (`float`, *optional*, defaults to `24.0`):
        The frames per second (FPS) of the generated video.
    num_inference_steps (`int`, *optional*, defaults to 40):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    sigmas (`List[float]`, *optional*):
        Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
        their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
        will be used.
    timesteps (`List[int]`, *optional*):
        Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
        in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
        passed will be used. Must be in descending order.
    guidance_scale (`float`, *optional*, defaults to `4.0`):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality. Used for the video modality (there is
        a separate value `audio_guidance_scale` for the audio modality).
    stg_scale (`float`, *optional*, defaults to `0.0`):
        Video guidance scale for Spatio-Temporal Guidance (STG), proposed in [Spatiotemporal Skip Guidance for
        Enhanced Video Diffusion Sampling](https://arxiv.org/abs/2411.18664). STG uses a CFG-like estimate
        where we move the sample away from a weak sample from a perturbed version of the denoising model.
        Enabling STG will result in an additional denoising model forward pass; the default value of `0.0`
        means that STG is disabled.
    modality_scale (`float`, *optional*, defaults to `1.0`):
        Video guidance scale for LTX-2.X modality isolation guidance, where we move the sample away from a
        weaker sample generated by the denoising model withy cross-modality (audio-to-video and video-to-audio)
        cross attention disabled using a CFG-like estimate. Enabling modality guidance will result in an
        additional denoising model forward pass; the default value of `1.0` means that modality guidance is
        disabled.
    guidance_rescale (`float`, *optional*, defaults to 0.0):
        Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
        Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
        [Common Diffusion Noise Schedules and Sample Steps are
        Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
        using zero terminal SNR. Used for the video modality.
    audio_guidance_scale (`float`, *optional* defaults to `None`):
        Audio guidance scale for CFG with respect to the negative prompt. The CFG update rule is the same for
        video and audio, but they can use different values for the guidance scale. The LTX-2.X authors suggest
        that the `audio_guidance_scale` should be higher relative to the video `guidance_scale` (e.g. for
        LTX-2.3 they suggest 3.0 for video and 7.0 for audio). If `None`, defaults to the video value
        `guidance_scale`.
    audio_stg_scale (`float`, *optional*, defaults to `None`):
        Audio guidance scale for STG. As with CFG, the STG update rule is otherwise the same for video and
        audio. For LTX-2.3, a value of 1.0 is suggested for both video and audio. If `None`, defaults to the
        video value `stg_scale`.
    audio_modality_scale (`float`, *optional*, defaults to `None`):
        Audio guidance scale for LTX-2.X modality isolation guidance. As with CFG, the modality guidance rule
        is otherwise the same for video and audio. For LTX-2.3, a value of 3.0 is suggested for both video and
        audio. If `None`, defaults to the video value `modality_scale`.
    audio_guidance_rescale (`float`, *optional*, defaults to `None`):
        A separate guidance rescale factor for the audio modality. If `None`, defaults to the video value
        `guidance_rescale`.
    spatio_temporal_guidance_blocks (`list[int]`, *optional*, defaults to `None`):
        The zero-indexed transformer block indices at which to apply STG. Must be supplied if STG is used
        (`stg_scale` or `audio_stg_scale` is greater than `0`). A value of `[29]` is recommended for LTX-2.0
        and `[28]` is recommended for LTX-2.3.
    noise_scale (`float`, *optional*, defaults to `None`):
        The interpolation factor between random noise and denoised latents at each timestep. Applying noise to
        the `latents` and `audio_latents` before continue denoising. If not set, will be inferred from the
        sigma schedule.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        The number of videos to generate per prompt.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will be generated by sampling using the supplied random `generator`.
    audio_latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will be generated by sampling using the supplied random `generator`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    prompt_attention_mask (`torch.Tensor`, *optional*):
        Pre-generated attention mask for text embeddings.
    negative_prompt_embeds (`torch.FloatTensor`, *optional*):
        Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
        provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
    negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
        Pre-generated attention mask for negative text embeddings.
    decode_timestep (`float`, defaults to `0.0`):
        The timestep at which generated video is decoded.
    decode_noise_scale (`float`, defaults to `None`):
        The interpolation factor between random noise and denoised latents at the decode timestep.
    use_cross_timestep (`bool` *optional*, defaults to `False`):
        Whether to use the cross modality (audio is the cross modality of video, and vice versa) sigma when
        calculating the cross attention modulation parameters. `True` is the newer (e.g. LTX-2.3) behavior;
        `False` is the legacy LTX-2.0 behavior.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generate image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ltx.LTX2PipelineOutput`] instead of a plain tuple.
    attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    callback_on_step_end (`Callable`, *optional*):
        A function that calls at the end of each denoising steps during the inference. The function is called
        with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
        callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
        `callback_on_step_end_tensor_inputs`.
    callback_on_step_end_tensor_inputs (`List`, *optional*):
        The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
        will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
        `._callback_tensor_inputs` attribute of your pipeline class.
    max_sequence_length (`int`, *optional*, defaults to `1024`):
        Maximum sequence length to use with the `prompt`.

Examples:

Returns:
    [`~pipelines.ltx.LTX2PipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.ltx.LTX2PipelineOutput`] is returned, otherwise a `tuple` is
        returned where the first element is a list with the generated images.
)r   r   r   r   rr   rs   r   r   r?   r   r   r   r   FNr   r   r#   )
r   r   r   r   rr   rs   r   r   r   rP   r   r   rw   r   )r   zGot latents of shape [batch_size, latent_dim, latent_frames, latent_height, latent_width], `latent_num_frames`, `latent_height`, `latent_width` will be inferred.zwGot audio_latents of shape [batch_size, num_channels, audio_num_frames, mel_bins], `audio_num_frames` will be inferred.ru   @   r}   )rV  rc  r  r  r   rP   r9   r?   max_image_seq_len   base_image_seq_lenr   rF   gffffff?rG   gffffff @)rR   rM   )fps)r   )r   )totalr   r   cond_uncond)r   audio_hidden_statesencoder_hidden_statesaudio_encoder_hidden_statestimestepaudio_timestepsigmaencoder_attention_maskaudio_encoder_attention_maskr   r   r   r  audio_num_framesvideo_coordsaudio_coordsisolate_modalitiesr   perturbation_maskr  r  r  
uncond_stgr   uncond_modalityT)rj   )r  r?   rr   )r  latentr  rX  )r  )r!   audio)sr   r   r
   tensor_inputsr   rs  r}  r  ry  r  r  r  r  r  r  r  r   r-   r\   r   r   r   r   r0   catr   rw   rx   r   r   r2  infory   r   in_channelsra  float32r   r   r4   r   roundru   mel_binsr   latent_channelsre  r.   linspacerN   r]   getcopydeepcopyra   r4  orderr  ropeprepare_video_coordsrP   
audio_ropeprepare_audio_coordsr   rg   progress_barr.  r  r   r   expandr0  squeezecache_contextchunkrl  ru  r  r  r  r   r   r  r  rj   rn   r  r   rp  steplocalspopupdateXLA_AVAILABLExm	mark_stepr   r   r   r  r   r   r  r  rt   r   timestep_conditioningr   tensordecoder   postprocess_videorz   maybe_free_model_hooksr   )cr   r)  r   r   r   r   r   r  rO   rR   rQ   ru  r   r  rj   r  r   r  r  r   r  r   r9   r?   r   rr   r   rs   r   r  r  r  r  r  r  r  r   r   r   rP   tokenizer_padding_sideconnector_prompt_embedsconnector_audio_prompt_embedsconnector_attention_maskr:  rI  rJ  r   rV  rE  rN  
duration_saudio_latents_per_secondr  r  r  num_channels_latents_audiorM   audio_schedulernum_warmup_stepsr  r  r  r;  tlatent_model_inputaudio_latent_model_inputr  video_timestepnoise_pred_videonoise_pred_audionoise_pred_video_uncond_textvideo_cfg_deltanoise_pred_audio_uncond_textaudio_cfg_deltavideo_prompt_embedsaudio_prompt_embedsprompt_attn_maskvideo_pos_idsaudio_pos_idsnoise_pred_video_uncond_stgnoise_pred_audio_uncond_stgvideo_stg_deltaaudio_stg_delta noise_pred_video_uncond_modality noise_pred_audio_uncond_modalityvideo_modality_deltaaudio_modality_deltanoise_pred_video_gnoise_pred_audio_gbszdenoised_sample_condcallback_kwargsr   callback_outputsvideor  r  generated_mel_spectrogramssc                                                                                                      r7   __call__LTX2ConditionPipeline.__call__  s   h *-=?U,VWW1E1S1S.3E~)6Y3E~!7!K;K 	/Q'#9"7+I',K+ 	 	
"  .#-!1%9" /%9"'=$!1!% *VS"9"9JJvt$<$<VJ&,,Q/J!*Z*F*F$J '-'9&)sK'' +(,(H(H"7'#9"7+I 3  
	
!"* ++!II'=}&MSTUM$)II/MOd.ekl$m!!'4d+7%,T^^^V%T"[_[j[j0?U \k \
X!>@X
 (!^0S0SSVWW"D"DD B BBKK t DK==@Aq#]L  $//66BB484H4H.. MM5
1"M ++ %		+<>O*P Q,
$$t'<'<<uTEnEn?oo 	! !.F!FG$KK J )6(;(;%Aq"A9@{TX9Y9et~~,,55km&$*N*NN5<T;PT5U5aDNN!!11gh 	# 22..!; 0%#--! 3 

 TZSaS!&9"9;NOgmNN!!%%&94@NN!!%%&:DANN!!%%&94@NN!!%%lD9NN!!%%k48
 --7!
1 *<NN*
&	 s9~0CnnFZFZ0ZZ\]^!)n '',,AAMM!/gnnbl B 
 ''22GG"$4m6J6J
 ++'..tdl>O>ORS>S6T/TUL'..tdl>O>ORS>S6T/TUL %89\!),1>>)*&AEAaAaUYYy1}%=gn"%7%:%:=;N;N%O"6:6V6VEII}o12\i ) ,D+F+F}GZGZ+[(88$6$<$<Q$?@!)!3!3B!71?P?X?XY[?\;\!]%%33MB9=9I9I&8,D.E4Q!/'/&/G5M#4,*&)9%1%1+08<*.+=)9$)- :J :6$&6 C2 $4#9#9#; #3#9#9#; 333EUE[E[\]E^B02B'+'B'B7L\^_aeaoao'p$373N3N!=q$..40 (,':':Q'>CSVrCr&sOEUE[E[\]E^B02B'+'B'B=Rbdegv'w$373N3N%'CQ40 (,'@'@1'D(+GG'O
 774;^;^62I2O2OPQWX2O2YZ[2\/2O2U2UVW]^2U2_`a2b//G/M/MaUV/M/WXY/Z,,8,>,>qa,>,H,KM,8,>,>qa,>,H,KM $,>>!>#;A#>)7)=)=aQ)=)G)J899Oo*A'*G''?$$0M$0M'+'B'B7L\^_aeaoao'p$'+'B'B=Rbdegv'w$33))77ESWScSc*1**=;N;N**O0=0@0@}GZGZ0@0[2E8K%3+3"*3C9I'8#0". *-=)6)6/4<[.2/A-=(-/ Td TP35P F4 3N2S2S2U/2M2S2S2U/262M2M!<a3/ 372M2M%'BA3/ '+nn8HKf8f&gO&*&:&:>NQl>l&mO899Oo66))778IJ]a]m]m*1**=;N;N**O0=0@0@}GZGZ0@0[2E8K%3+3"*3C9I'8#0". *-=)6)6/3<@.2/A-=(-/ ^n ^Z8:Z K4 8X7]7]7_47W7]7]7_47;7R7R!A1dnn84 8<7R7R%'GO84 -1,?,?!,C(+KK,( -1,E,E,I(+KK,( CDC(+? &6%G/%Y\p%p"%5%G/%Y\p%p" ((1,'8*,<tOdOd($ (:$..2'8*,<tOjOj($ (:$ '++A. %,=ds,C(CD}GZGZG\_pquru_vGvv"%++, %
 $(#>#>wH\^_aeaoao#p #'#>#>}N^`acr#s  ..--.>7X]-^_`a !0 4 45Eq-ej 4 klm n'3&(O?-3Xa[* @';D!Q'X$.229gFG$4$8$8-$XM I**A9I/IqSTuX\XfXfXlXlNlpqNq '') =LLNM - :R &&//00
 774>>668R8R
 22=BRap2q("//..0D0DdhhooFdFdG E!Ejj!4!45G88??88$W]]iPV^e^k^kl!/488'6&7*&DO%-)8&#$6==*<)=
)J& <<gmm\%*\\2DV[b[h[h%itT4-&" 11W<?QTY?YY//..0D0DdhhooFdFdG jj0GHHOOGX5OI!LE((::5k:ZE),,T^^-A-ABM)-)>)>}Z_)>)`ab)c&LL!;<E 	##%5>!!e<<e CBT FER KJI :9sv   DAA(A@H
AAAA@'%B/AAAA@9*IAA1AA@
A@$@AA@'
A@6@1AA@9
AAAAAA
AA)r  r  r  r  r  r  ry  rs  r  r  r  r}  r   r   r   r   r   r   r   r   r   r   )r   r   r}   NN)NTr   NNNNr   r}   NN)
NNNNNNNNNN)r   r   )r#   rI   )NN)Nr&  r'  r(  N)Nr      r&  r'  r(  r#   NNNN)	r   r}   r   r  r   NNNN)Lr&   r'   r(   r)   r*   model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   r   r   r   r   r-   r3   r0   rP   r   r   boolr1   r   r   staticmethodr   r   r4   r   r  	Generatorr  r  r  r  r  r$  r   tuplerC  rT  ra  re  r   rl  rp  propertyru  rj   r   r  r  r  r   r  r   r  r  r  r  r  r  no_gradr   EXAMPLE_DOC_STRINGdictr   r  r5   __classcell__)r[   s   @r7   rp   rp      s    ]T5
25
 $5
 *	5

 55
 "$665
 '5
 15
 115
v &'#'&*$(>4d3i>4  #>4 !	>4
 >4 t#>4 {{T!>4H 37,0%&-16:59>B#'&*$(Tld3iTl tCy4/Tl &*	Tl
  #Tl ||d*Tl !&t 3Tl  %||d2Tl ).t(;Tl !Tl Tl t#Tl {{T!Tlv ,0#"'+(,KZ u||  PS \a\h\h  ,  st		+.	8;	DG	UX	lo			 	  or-2\\HMfk	   or-2\\HMfk	   gk,1ELL,@MR___cMc   Z^+.:LORVJ	 (  "&#'  $J	
 Dj 
 $ 6%,, 6ell 6afamam 6 6
 6ELL 6 6chcoco 6 6c PS hk ps & LP&*PQ&.@)AADHPQ PQ 	PQ
 PQ t#PQ 
tELL!4;S	9	:PQd)9)9 !<<)9  -	)9
 "%[)9  9)9 )9 )9 
u||U\\5<<7	8)9Z LP$' $(&*,0'+U9&.@)AADHU9 U9 "	U9
 U9 U9 U9 U9 {{T!U9 t#U9 ??T)U9 $U9 
u||U\\5<<7	8U9r $%#$ $(&*,0'+ " !	
   {{T! t# ??T) $ 
D koll5:\\MP]`cg]g	 koll5:\\MP]`cg]g	 $ $ & &   $ $ * * , , % % * * R R H H R R # # & & & &   ]]_12 LP"&26 #%%)(, # #"%-1(,-1/3<@$(,-DH'+-1-1596:>B/29=#(  26BF9B#'MO
=&.@)AADHO
= d3iO
= tCy4/	O
=
 O
= O
= O
= O
= !O
= Ud"O
= ;%O
= O
= O
= O
=  O
=  $dl!O
=" #O
=$ $dl%O
=& !&'O
=( *.cT)9)O
=* T\+O
=,  #Tz-O
=. ??T%//%::TA/O
=0 $1O
=2 ||d*3O
=4 ||d*5O
=6  %||d27O
=8 !&t 39O
=: ).t(;;O
=< e,=O
=> "DK/$6?O
=@ !AO
=B CO
=D EO
=F sCx.4/GO
=H 'Sz4'784?IO
=J -1IKO
=L !MO
= 3 O
=r6   rp   )Nr=   )   r  g      ?gffffff?)NNNN)r   )Dr  rV   dataclassesr   typingr   r   numpyr.   	PIL.Imager+   r0   transformersr   r   r   	callbacksr
   r   loadersr   r   models.autoencodersr   r   models.transformersr   
schedulersr   utilsr   r   r   utils.torch_utilsr   r   r   pipeline_utilsr   rx   r   pipeline_outputr   rz   r   r   torch_xla.core.xla_modelcore	xla_modelr  r  
get_loggerr&   r2  r%  r   r1   r!  r   rC   r3   r4   rN   rP   r-   ra   rn   rp   r%   r6   r7   <module>r;     s|     !      [ [ A ? Q > 9 O O - - . * / 4 ))MM			H	%1 h   * `h
TLL
T-2__t-C
TY\
T  

 
 	

 
  '+(,"&!%8*t8* %,,%8* Cy4	8*
 K$8*x4K=-/BDW K=r6   