
    3j(C                        S SK Jr  S SKJr  S SKJr  S SKJrJr  SSK	J
r  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJrJr  SSKJr  SSKJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(  \" 5       (       a  S SKr\" SS9\ " S S\5      5       5       r)\ " S S\$5      5       r*S r+S r, " S S\(5      r- " S S\"5      r.\ " S  S!\!5      5       r/ " S" S#\ 5      r0\" S$S%9 " S& S'\5      5       r1/ S(Qr2g))    )	dataclass)pi)strict)Tensorbroadcast_tensors   )initialization)Cache)PreTrainedConfig)BaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_availabletorch_compilable_check   )AudioFlamingo3Config)&AudioFlamingo3ForConditionalGenerationAudioFlamingo3Model!AudioFlamingo3ModelOutputWithPastAudioFlamingo3PreTrainedModel)AudioFlamingo3Processor)CONFIG_MAPPING)MoonshineRotaryEmbeddingNznvidia/music-flamingo-2601-hf)
checkpointc                   ^    \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\
\S'   S	r\S	-  \S
'   S rSrg	)MusicFlamingoConfig-   a5  
audio_bos_token_id (`int`, *optional*, defaults to 151670):
    The beginning-of-audio token index used to mark the start of audio spans.
audio_eos_token_id (`int`, *optional*, defaults to 151671):
    The end-of-audio token index used to mark the end of audio spans.
audio_frame_step (`float`, *optional*, defaults to 0.01):
    Duration in seconds of one input mel frame (trained with hop_length 160 at sampling_rate 16000).

Example:

```python
>>> from transformers import MusicFlamingoForConditionalGeneration, MusicFlamingoConfig, AudioFlamingo3EncoderConfig, Qwen2Config

>>> # Initializing an MusicFlamingoEncoder config
>>> audio_config = AudioFlamingo3EncoderConfig()

>>> # Initializing a Qwen2 config
>>> text_config = Qwen2Config()

>>> # Initializing an MusicFlamingo configuration
>>> configuration = MusicFlamingoConfig(audio_config, text_config)

>>> # Initializing a model from the musicflamingo style configuration
>>> model = MusicFlamingoForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```ivP audio_bos_token_idiwP audio_eos_token_idg{Gz?audio_frame_stepNrope_parametersc                    U R                   c  SSSS.U l         [        U R                  [        5      (       aN  U R                  S   S;   a  SU R                  S'   [        U R                  S      " S
0 U R                  D6U l        O U R                  c  [        S   " 5       U l        [        U R
                  [        5      (       aU  U R
                  R                  SS5      U R
                  S'   [        U R
                  S      " S
0 U R
                  D6U l        O U R
                  c  [        S   " 5       U l        U R                   S	   U l        U R                  R                  U l	        [        R                  " U 40 UD6  g )Ndefaultg     @g?)	rope_type
rope_thetapartial_rotary_factor
model_type)Nmusicflamingo_encoderaudioflamingo3_encoderqwen2r(    )r$   
isinstanceaudio_configdictr   text_configgetmax_position_embeddingshidden_sizehead_dimr   __post_init__)selfkwargss     q/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/musicflamingo/modular_musicflamingo.pyr7   !MusicFlamingoConfig.__post_init__R   sQ   '&$),$D 
 d''..  .2QQ2J!!,/ .t/@/@/N O dRVRcRc dD& ./G H JDd&&---1-=-=-A-A,PW-XD\*-d.>.>|.LMaPTP`P`aD%-g68D'+';';L'I$))55&&t6v6    )r0   r6   r4   r$   r2   )__name__
__module____qualname____firstlineno____doc__r!   int__annotations__r"   r#   floatr$   r1   r7   __static_attributes__r.   r<   r:   r   r   -   s=    : %$$$"e"#'OTD['7r<   r   c                   t   ^  \ rS rSr     SU 4S jjrS\S\S\4S jr\	S 5       r
S rS	 rS
 rS rSrU =r$ )MusicFlamingoProcessorl   c                    > [         TU ]  UUUUUS9  U ?XPl        X`l        UR                  U5      U l        UR                  U5      U l        g)a+  
audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
    Special token used to represent audio inputs in the chat template.
audio_bos_token (`Optional[str]`, *optional*, defaults to `"<|sound_bos|>"`):
    Special token used to represent the beginning of audio.
audio_eos_token (`Optional[str]`, *optional*, defaults to `"<|sound_eos|>"`):
    Special token used to represent the end of audio.
max_audio_len (`int`, *optional*, defaults to 1200):
    Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
)chat_templateaudio_tokenmax_audio_lenN)super__init__default_transcription_promptaudio_bos_tokenaudio_eos_tokenconvert_tokens_to_idsr!   r"   )	r8   feature_extractor	tokenizerrJ   rK   rP   rQ   rL   	__class__s	           r:   rN   MusicFlamingoProcessor.__init__n   s`    ( 	'#' 	 	
 -.."+"A"A/"R"+"A"A/"Rr<   audio_inputs	audio_idxreturnc                 d    US   U   nU R                   U R                  U-  -   U R                  -   $ )Nnum_audio_tokens)rP   rK   rQ   )r8   rW   rX   r[   s       r:   replace_audio_token*MusicFlamingoProcessor.replace_audio_token   s;    '(:;IF##d&6&69I&IIDL`L```r<   c                 H    U R                   U R                  U R                  /$ N)audio_token_idr!   r"   )r8   s    r:   	audio_ids MusicFlamingoProcessor.audio_ids   s!    ##T%<%<d>U>UVVr<   c                     [        S5      eNz/This method is not supported for MusicFlamingo.NotImplementedErrorr8   argsr9   s      r:   apply_transcription_request2MusicFlamingoProcessor.apply_transcription_request       !"STTr<   c                     [        S5      eNz5MusicFlamingo does not need to overwrite this method.re   rg   s      r:   decodeMusicFlamingoProcessor.decode       !"YZZr<   c                     [        S5      erm   re   rg   s      r:   batch_decode#MusicFlamingoProcessor.batch_decode   rp   r<   c                     [        S5      erd   re   rg   s      r:   "_strip_assistant_prefix_and_quotes9MusicFlamingoProcessor._strip_assistant_prefix_and_quotes   rk   r<   )rP   r!   rQ   r"   )Nz<sound>z<|sound_bos|>z<|sound_eos|>i  )r=   r>   r?   r@   rN   r1   rB   strr\   propertyra   ri   rn   rr   ru   rE   __classcell__rU   s   @r:   rG   rG   l   sm     ''SBa a a a W WU[[U Ur<   rG   c                     U R                   " / U R                  S S QSPSP76 n U R                  SS9u  p[        R                  " U* U4SS9n U R                  S5      $ )Nr   dim)reshapeshapeunbindtorchstackflatten)xx1x2s      r:   rotate_halfr      s]    			'1773B<''Q'AXX"XFBbS"I2&A99R=r<   c                 N   U R                   nU R                  [        R                  5      n UR                  U 5      nUR                  U 5      nUR                  S   nU SUS 24   nU SS U24   nXa-  [        U5      U-  -   n[        R                  " Xe4SS9R                  U5      $ )Nr|   .r}   )dtypetor   float64r   r   cat)hidden_statescossinoriginal_dtyperot_dimpassthroughrotateds          r:   apply_rotary_time_embr      s    "((N!$$U]]3M
&&
C
&&
CiimGWX.KC'M*G}W!5!;<G99g+477GGr<   c            	          ^  \ rS rSrSrSS\4U 4S jjjrS r\R                  " 5       S\
S\S\\
\
4   4S	 j5       rS
rU =r$ )MusicFlamingoRotaryEmbedding   a  Rotary time embedding module used by MusicFlamingo checkpoints.

This is a checkpoint-faithful integration, not a direct implementation of the RoTE formulation described in
(Goel et al., 2024): https://arxiv.org/abs/2410.12109. It applies axial rotary embeddings over the window index
within each audio sample and the encoder time index within each window, then modulates both axes with absolute
timestamps in seconds.
configc                 x   > [         TU ]  XS9  U R                  U R                  5      nU R	                  SUSS9  g )Ndeviceposition_anglesF)
persistent)rM   rN   _compute_position_anglesinv_freqregister_buffer)r8   r   r   r   rU   s       r:   rN   %MusicFlamingoRotaryEmbedding.__init__   s=    /77F.ERr<   c                 2   [         R                  " [        U R                  5      UR                  UR
                  S9nX R                  -  S[        -  -  nUR                  S5      U-  n[         R                  " USSS9nUR                  UR
                  S9$ )Nr   r   r   r|   r}   )r   )
r   arangerB   max_seq_len_cachedr   r   r   	unsqueezerepeat_interleaver   )r8   r   	positionsr   s       r:   r   5MusicFlamingoRotaryEmbedding._compute_position_angles   s    LLT%<%<!=hoo]e]k]kl	 7 771r6B	#--b1H<11/1"M!!!77r<   
timestampsseq_lenrY   c                    USS2S4   R                  U R                  R                  U R                  R                  S9nU R                  R
                  S-  U-  n[        R                  " X4-  5      U R                  -  nUR                  S5      U R                  -  n[        R                  " USSS9nUSS2SSS24   nU R                  SU SSS2SS24   n[        Xg5      u  pg[        R                  " Xg4SS9nU* S-  [        -  R                  U5      n	XR                  S5      -  nUR                  5       UR!                  5       4$ )zBCompute 2D axial rotary embeddings for window and time dimensions.Nr   r      r|   r   r}   )r   r   r   r   r   r#   r   roundr   r   r   r   r   r   r   r   r   )
r8   r   r   window_startswindow_durationwindow_positionswindow_freqs
time_freqsfreqsangles
             r:   forward$MusicFlamingoRotaryEmbedding.forward   s+   
 #1a4(++4==3G3Gt}}ObOb+c++66:WD ;;}'FG$JaJaa'11"5E..|QBG $AtQJ/))(73D!QJ?
#4\#N 		<4"=q2%))%0++yy{EIIK''r<   r.   r_   )r=   r>   r?   r@   rA   r   rN   r   r   no_gradr   rB   tupler   rE   ry   rz   s   @r:   r   r      sZ    S2 S S
8 ]]_(& (3 (5;P ( (r<   r   c                   F    \ rS rSrSr\R                  " 5       S 5       rSrg)MusicFlamingoPreTrainedModel   Nc                     [         R                  " X5        [        U[        5      (       a=  UR	                  UR
                  5      n[        R                  " UR                  U5        g g r_   )	r   _init_weightsr/   r   r   r   initcopy_r   )r8   modulebuffer_values      r:   r   *MusicFlamingoPreTrainedModel._init_weights   sK    %%d3f:;;!::6??KLJJv--|< <r<   r.   )	r=   r>   r?   r@   _no_split_modulesr   r   r   rE   r.   r<   r:   r   r      s     
]]_= =r<   r   c                       \ rS rSrSrg) MusicFlamingoModelOutputWithPast   r.   N)r=   r>   r?   r@   rE   r.   r<   r:   r   r      s    r<   r   c                     ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\S\R                  4S jr
\\" S	S
9S\R                  S\R                  S\R                  S\\   S\\-  4
S j5       5       r\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\\   4S jj5       5       rSrU =r$ )MusicFlamingoModel   r   c                 D   > [         TU ]  U5        [        U5      U l        g r_   )rM   rN   r   pos_embr8   r   rU   s     r:   rN   MusicFlamingoModel.__init__   s     3F;r<   	input_idspost_lengthsmax_post_lengthrY   c                 r   XR                   R                  :H  n[        R                  " [        R                  R
                  R                  UR                  5       SSS9SS9n[        R                  " US:H  5      u  pg[        R                  " US:H  5      u  phX-
  R                  [        R                  5      n	UR                  5       n
UR                  5       n[        X:H  SU
 SU 35        U R                   R                  S	-  n[        R                  " X2R                  [        R                   S
9U-  n[        R"                  " [        R$                  " SUR                  S9[        R&                  " USS9S S /5      n[        R&                  " U	SS9n[        R(                  " XSS9n[        R(                  " U[        R                  " U	R*                  S   UR                  S95      n[        R                  " UR*                  S   UR                  S9UU   -
  nUR-                  S5      U-  U-  U-   $ )N)   r   r   )valuer   r}   r|   z6Audio features and audio tokens do not match, tokens: z, features: r   r   r   T)right)r   r`   r   diffnn
functionalpadrB   wherer   longsumr   r#   r   r   float32r   zeroscumsumsearchsortedr   r   )r8   r   r   r   audio_token_maskr   _startsendssample_lengthsn_audio_tokensn_audio_featuresaudio_embed_frame_stepframe_offsetscumsum_postcumsum_samplessample_indicessample_start_rowswindow_indicess                      r:   _build_audio_timestamps*MusicFlamingoModel._build_audio_timestamps   s    %(B(BBzz%((--112B2F2F2H&XY1Z`abKK	*	++dbj)-++EJJ7)--/'++-.D^DTT`aq`rs	
 "&!=!=!ALL1D1DEMMZ]ss 	
 iiQ|7J7J!KU\\ZflmMnorprMs tun!<++NtT "..ELL)=)=a)@I\I\]
 LL++A.|7J7JKN_`nNoo 	
 ''*_<?UUXeeer<   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.custom_introinput_featuresinput_features_maskr9   c                    U R                   " U4USS.UD6nUR                  nU R                   R                  UR                  S5      R	                  [
        R                  5      5      u  pxU R                  X8UR                  S   5      n	U R                  U	R	                  UR                  5      UR                  S   S9u  p[        XjU5      nU R                  U5      n[
        R                  " UR                  S   UR                  S9SSS24   USS2S4   :  nXR	                  UR                  5         Ul        U$ )	aR  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padded feature indices.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Token ids containing the audio token ID placeholders, for reconstructing rotary time embedding timestamps.
T)r   return_dictr|   r   )r   r   r   N)audio_towerlast_hidden_state _get_feat_extract_output_lengthsr   r   r   r   r   r   r   r   r   multi_modal_projectorr   pooler_output)r8   r   r   r   r9   audio_outputr   r   r   audio_timestampsr   r   audio_embeds
valid_masks                 r:   get_audio_features%MusicFlamingoModel.get_audio_features   s@   " ''
 3
 	
 %66**KKL_LcLcdfLgLjLjkpkukuLvw77	Q^QdQdegQhi<< 0 3 3M4H4H IS`SfSfgiSj<k-m#F11-@ \\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
%1--@S@S2T%U"r<   Nattention_maskposition_idspast_key_valuesinputs_embeds	use_cachec	           	         Uc  U R                  5       " U5      nSn
UbX  UbU  U R                  X#USS9R                  n
U R                  XU
S9nUR	                  XR                  UR                  5      5      nU R                  " SUUUUUS.U	D6n[        UR                  UR                  UR                  UR                  U
S9$ )z
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padding feature indices.
NT)r   r   )r  audio_features)r  r  r  r  r  )r   r  r   
attentionsaudio_hidden_statesr.   )get_input_embeddingsr  r   get_placeholder_maskmasked_scatterr   r   language_modelr   r   r  r   r  )r8   r   r   r   r  r  r  r  r  r9   r   special_audio_maskoutputss                r:   r   MusicFlamingoModel.forwardD  s    $   557	BM%)*?22yVZ 3 m 
 "&!:!:| "; " *889K__]j]q]qMrsM%% 
')%+
 
 0%77#33!//)) ,
 	
r<   )r   )NNNNNNNN)r=   r>   r?   r@   r   rN   r   
LongTensorrB   FloatTensorr   r   r   r   r   r   r   r   r  r
   boolr   rE   ry   rz   s   @r:   r   r      s   <2 <'f##'f &&'f 	'f
 
		'fR  w)) #\\ ##	
 +, 
+	+ @  .23737.204(,26!%.
##d*.
 ))D0.
 #\\D0	.

 t+.
 &&-.
 .
 ((4/.
 $;.
 +,.
  .
r<   r   z
    The MusicFlamingo model which consists of a fine-tuned Whisper encoder, rotary time embedding, a multi-modal projector, and a Qwen2 language model.
    r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )%MusicFlamingoForConditionalGenerationiw  r   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r_   )rM   rN   r   model	post_initr   s     r:   rN   .MusicFlamingoForConditionalGeneration.__init__}  s&     '/
r<   c                 >    U R                   R                  " XU40 UD6$ r_   )r  r  )r8   r   r   r   r9   s        r:   r  8MusicFlamingoForConditionalGeneration.get_audio_features  s     zz,,^R[f_effr<   )r  )	r=   r>   r?   r@   r   rN   r  rE   ry   rz   s   @r:   r  r  w  s    2 
g gr<   r  )r   rG   r  r   r   )3dataclassesr   mathr   huggingface_hub.dataclassesr   r   r   r    r	   r   cache_utilsr
   configuration_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   +audioflamingo3.configuration_audioflamingo3r   &audioflamingo3.modeling_audioflamingo3r   r   r   r   (audioflamingo3.processing_audioflamingo3r   autor   moonshine.modeling_moonshiner   r   rG   r   r   r   r   r   r   r  __all__r.   r<   r:   <module>r/     s    "  . + &   3 : - & u u N  O ! C  :;:7. :7  <:7z 4U4 4U 4Un
H'(#; '(T=#@ = 	'H 	 	B
, B
J 
g,R g
gr<   