
    3jW                     0   S SK Jr  S SKJr  S SKJr  S SKJr  S SKJ	r	J
r
Jr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(  \#" 5       (       a  S SKr " S S\RR                  5      r*\! " S S\5      5       r+\ " S S\5      5       r, " S S\RR                  5      r-S r.S r/\!" SS 9 " S! S"\+5      5       r0\!" S#S 9\ " S$ S%\5      5       5       r1\!" S&S 9 " S' S(\+\5      5       r2/ S)Qr3g)*    )Callable)	dataclass)pi)Optional)Tensorbroadcast_tensorsnn   )initialization)ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)ROPE_INIT_FUNCTIONS)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_availabletorch_compilable_check   )	AutoModel   )MusicFlamingoConfigNc                      ^  \ rS rSr% Sr\R                  \S'   SS\4U 4S jjjr	\
   SS\S-  S\S   S	\S-  S
\S\4   4S jj5       r\R                   " 5       S\S	\S
\\\4   4S j5       rS rSrU =r$ )MusicFlamingoRotaryEmbedding.   a  Rotary time embedding module used by MusicFlamingo checkpoints.

This is a checkpoint-faithful integration, not a direct implementation of the RoTE formulation described in
(Goel et al., 2024): https://arxiv.org/abs/2410.12109. It applies axial rotary embeddings over the window index
within each audio sample and the encoder time index within each window, then modulates both axes with absolute
timestamps in seconds.
inv_freqNconfigc                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  U R                  U R                  5      nU R                  SUSS9  g )N	rope_typedefaultr!   F)
persistentoriginal_inv_freqposition_angles)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr"   rope_parametersr$   compute_default_rope_parametersr   attention_scalingregister_bufferclone_compute_position_anglesr!   )selfr"   devicerope_init_fnr!   r(   	__class__s         r/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/musicflamingo/modeling_musicflamingo.pyr*   %MusicFlamingoRotaryEmbedding.__init__9   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuU77F.ER    r5   ztorch.deviceseq_lenreturnztorch.Tensorc           	      j   U R                   S   nU R                   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        XT-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  nX4$ )
aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetapartial_rotary_factorg      ?head_dimNr   r   dtyper5   rB   )r.   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r"   r5   r;   baser?   r@   dimattention_factorr!   s	            r8   r/   <MusicFlamingoRotaryEmbedding.compute_default_rope_parametersK   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(23 U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r:   
timestampsc                    USS2S4   R                  U R                  R                  U R                  R                  S9nU R                  R
                  S-  U-  n[        R                  " X4-  5      U R                  -  nUR                  S5      U R                  -  n[        R                  " USSS9nUSS2SSS24   nU R                  SU SSS2SS24   n[        Xg5      u  pg[        R                  " Xg4SS9nU* S-  [        -  R                  U5      n	XR                  S5      -  nUR                  5       UR!                  5       4$ )zBCompute 2D axial rotary embeddings for window and time dimensions.Nr   rC      r   rO   )rL   r!   r5   rB   r"   audio_frame_steprI   roundr,   	unsqueezerepeat_interleaver(   r   catr   cossin)
r4   rR   r;   window_startswindow_durationwindow_positionswindow_freqs
time_freqsfreqsangles
             r8   forward$MusicFlamingoRotaryEmbedding.forwardk   s+   
 #1a4(++4==3G3Gt}}ObOb+c++66:WD ;;}'FG$JaJaa'11"5E..|QBG $AtQJ/))(73D!QJ?
#4\#N 		<4"=q2%))%0++yy{EIIK''r:   c                 2   [         R                  " [        U R                  5      UR                  UR
                  S9nX R                  -  S[        -  -  nUR                  S5      U-  n[         R                  " USSS9nUR                  UR
                  S9$ )NrC   r   rU   rV   rA   )
rI   rJ   rH   r,   r5   rB   r   rY   rZ   rL   )r4   r!   	positionsr(   s       r8   r3   5MusicFlamingoRotaryEmbedding._compute_position_angles   s    LLT%<%<!=hoo]e]k]kl	 7 771r6B	#--b1H<11/1"M!!!77r:   )r0   r"   r,   r-   r$   N)NNN)__name__
__module____qualname____firstlineno____doc__rI   r   __annotations__r   r*   staticmethodr   rH   tuplerM   r/   no_gradre   r3   __static_attributes____classcell__r7   s   @r8   r   r   .   s     llS2 S S$ -1+/"*#d**(* t* 
~u$	%	* *> ]]_(& (3 (5;P ( (&8 8r:   r   c                   ~   ^  \ rS rSr% \\S'   SrSrSrSr	S/r
SrSrSr\R                  " 5       U 4S j5       rS	rU =r$ )
MusicFlamingoPreTrainedModel   r"   model)audiotextTNpast_key_valuesc                    > [         TU ]  U5        [        U[        5      (       a=  UR	                  UR
                  5      n[        R                  " UR                  U5        g g rj   )	r)   _init_weights
isinstancer   r3   r!   initcopy_r(   )r4   modulebuffer_valuer7   s      r8   r   *MusicFlamingoPreTrainedModel._init_weights   sK    f%f:;;!::6??KLJJv--|< <r:    )rk   rl   rm   rn   r   rp   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_attention_backendrI   rs   r   rt   ru   rv   s   @r8   rx   rx      sS    (&*##4"5N"&
]]_= =r:   rx   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g) MusicFlamingoModelOutputWithPast   z[
audio_hidden_states (`torch.FloatTensor`, *optional*):
    Projected audio hidden states.
Naudio_hidden_statesr   )
rk   rl   rm   rn   ro   r   rI   FloatTensorrp   rt   r   r:   r8   r   r      s    
 59**T18r:   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ ) MusicFlamingoMultiModalProjector   z
Audio adaptor (small MLP) that projects MusicFlamingoEncoder features
to the LLM embedding space so they can replace `<sound>` tokens.
r"   c                   > [         TU ]  5         [        R                  " UR                  R
                  UR                  R
                  UR                  S9U l        [        UR                     U l        [        R                  " UR                  R
                  UR                  R
                  UR                  S9U l        g )Nbias)r)   r*   r	   Linearaudio_configrF   text_configprojector_biaslinear_1r   projector_hidden_actactlinear_2r4   r"   r7   s     r8   r*   )MusicFlamingoMultiModalProjector.__init__   s    		++V-?-?-K-KRXRgRg
 &556		**F,>,>,J,JQWQfQf
r:   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rj   )r   r   r   )r4   audio_featureshidden_statess      r8   re   (MusicFlamingoMultiModalProjector.forward   s2    n5/m4r:   )r   r   r   )
rk   rl   rm   rn   ro   r   r*   re   rt   ru   rv   s   @r8   r   r      s    

2 
 r:   r   c                     U R                   " / U R                  S S QSPSP76 n U R                  SS9u  p[        R                  " U* U4SS9n U R                  S5      $ )NrU   r   rV   )reshapeshapeunbindrI   stackflatten)xx1x2s      r8   rotate_halfr      s]    			'1773B<''Q'AXX"XFBbS"I2&A99R=r:   c                 N   U R                   nU R                  [        R                  5      n UR                  U 5      nUR                  U 5      nUR                  S   nU SUS 24   nU SS U24   nXa-  [        U5      U-  -   n[        R                  " Xe4SS9R                  U5      $ )NrU   .rV   )rB   rL   rI   float64r   r   r[   )r   r\   r]   original_dtyperot_dimpassthroughrotateds          r8   apply_rotary_time_embr      s    "((N!$$U]]3M
&&
C
&&
CiimGWX.KC'M*G}W!5!;<G99g+477GGr:   z
    The MusicFlamingo model (fine-tuned Whisper encoder, multi-modal projector, Qwen2 language model),
    without a language modeling head.
    custom_introc                   t  ^  \ rS rSrSrSrSrS\4U 4S jjr\	\
" SS9S\R                  S\R                  S	\R                  S
\\   S\\-  4
S j5       5       rS	\R                  S\R                  S\R                  4S jr\	\
        SS	\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S
\\   S\\-  4S jj5       5       rS	\R                  S\R                  S\S\R                  4S jrSrU =r$ )MusicFlamingoModel   Nr"   c                   > [         TU ]  U5        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        U5      U l	        [        U5      U l        U R                  5         g rj   )r)   r*   r   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   pos_emb	post_initr   s     r8   r*   MusicFlamingoModel.__init__   se     $001D1DE'33F4F4FG%Ef%M"3F;r:   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   input_featuresinput_features_mask	input_idskwargsr<   c                    U R                   " U4USS.UD6nUR                  nU R                   R                  UR                  S5      R	                  [
        R                  5      5      u  pxU R                  X8UR                  S   5      n	U R                  U	R	                  UR                  5      UR                  S   S9u  p[        XjU5      nU R                  U5      n[
        R                  " UR                  S   UR                  S9SSS24   USS2S4   :  nXR	                  UR                  5         Ul        U$ )	aR  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padded feature indices.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Token ids containing the audio token ID placeholders, for reconstructing rotary time embedding timestamps.
T)r   return_dictrU   r   )r;   r   r5   N)r   last_hidden_state _get_feat_extract_output_lengthssumrL   rI   long_build_audio_timestampsr   r   r5   r   r   rJ   pooler_output)r4   r   r   r   r   audio_outputr   _post_lengthsaudio_timestampsr\   r]   audio_embeds
valid_masks                 r8   get_audio_features%MusicFlamingoModel.get_audio_features   s@   " ''
 3
 	
 %66**KKL_LcLcdfLgLjLjkpkukuLvw77	Q^QdQdegQhi<< 0 3 3M4H4H IS`SfSfgiSj<k-m#F11-@ \\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
%1--@S@S2T%U"r:   inputs_embedsr   c           	      &   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)rB   r5   rU   r   6Audio features and audio tokens do not match, tokens: , features: )get_input_embeddingsrI   tensorr"   audio_token_idr   r5   allr   r   rY   	expand_asrL   r   numel)r4   r   r   r   special_audio_maskn_audio_tokensn_audio_featuress          r8   get_placeholder_mask'MusicFlamingoModel.get_placeholder_mask  s     !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r:   attention_maskposition_idsr}   	use_cachec	           	         Uc  U R                  5       " U5      nSn
UbX  UbU  U R                  X#USS9R                  n
U R                  XU
S9nUR	                  XR                  UR                  5      5      nU R                  " SUUUUUS.U	D6n[        UR                  UR                  UR                  UR                  U
S9$ )z
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padding feature indices.
NT)r   r   )r   r   )r   r   r   r}   r   )r   r}   r   
attentionsr   r   )r   r   r   r   masked_scatterrL   r5   r   r   r   r}   r   r   )r4   r   r   r   r   r   r}   r   r   r   r   r   outputss                r8   re   MusicFlamingoModel.forward  s    $   557	BM%)*?22yVZ 3 m 
 "&!:!:| "; " *889K__]j]q]qMrsM%% 
')%+
 
 0%77#33!//)) ,
 	
r:   r   max_post_lengthc                 r   XR                   R                  :H  n[        R                  " [        R                  R
                  R                  UR                  5       SSS9SS9n[        R                  " US:H  5      u  pg[        R                  " US:H  5      u  phX-
  R                  [        R                  5      n	UR                  5       n
UR                  5       n[        X:H  SU
 SU 35        U R                   R                  S	-  n[        R                  " X2R                  [        R                   S
9U-  n[        R"                  " [        R$                  " SUR                  S9[        R&                  " USS9S S /5      n[        R&                  " U	SS9n[        R(                  " XSS9n[        R(                  " U[        R                  " U	R*                  S   UR                  S95      n[        R                  " UR*                  S   UR                  S9UU   -
  nUR-                  S5      U-  U-  U-   $ )N)r   r   r   )valuer   rV   rU   r   r   rT   rC   r   T)right)r"   r   rI   diffr	   
functionalpadrH   whererL   r   r   r   rW   rJ   r5   float32r[   zeroscumsumsearchsortedr   rY   )r4   r   r   r   audio_token_maskr   r   startsendssample_lengthsr   r   audio_embed_frame_stepframe_offsetscumsum_postcumsum_samplessample_indicessample_start_rowswindow_indicess                      r8   r   *MusicFlamingoModel._build_audio_timestampsQ  s    %(B(BBzz%((--112B2F2F2H&XY1Z`abKK	*	++dbj)-++EJJ7)--/'++-.D^DTT`aq`rs	
 "&!=!=!ALL1D1DEMMZ]ss 	
 iiQ|7J7J!KU\\ZflmMnorprMs tun!<++NtT "..ELL)=)=a)@I\I\]
 LL++A.|7J7JKN_`nNoo 	
 ''*_<?UUXeeer:   )r   r   r   r   )NNNNNNNN)rk   rl   rm   rn   _tp_plan_pp_plan_keep_in_fp32_modules_strictr   r*   r   r   rI   r   r   
LongTensorr   r   rr   r   r   r   r   boolr   re   rH   r   rt   ru   rv   s   @r8   r   r      s    HH#' 2   w)) #\\ ##	
 +, 
+	+ @"))":?:K:K"]b]n]n"0  .23737.204(,26!%.
##d*.
 ))D0.
 #\\D0	.

 t+.
 &&-.
 .
 ((4/.
 $;.
 +,.
 
1	1.
  .
`'f##'f &&'f 	'f
 
		'f 'fr:   r   zY
    Base class for MusicFlamingo causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)#MusicFlamingoCausalLMOutputWithPasti{  a2  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    Hidden states of the audio encoder after projection.
Nlosslogitsr}   r   r   r   r   )rk   rl   rm   rn   ro   r  rI   r   rp   r  r}   r   r   rr   r   r   rt   r   r:   r8   r  r  {  s    	 &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r:   r  z
    The MusicFlamingo model which consists of a fine-tuned Whisper encoder, rotary time embedding, a multi-modal projector, and a Qwen2 language model.
    c                     ^  \ rS rSrS/rSrS\4U 4S jjrS r\	\
          SS\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\\   S\\-  4S jj5       5       rSS.S\4U 4S jjjrSrU =r$ )%MusicFlamingoForConditionalGenerationi  embed_positionsNr"   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFr   )r)   r*   r   rz   r	   r   r   rF   
vocab_sizelm_headr   r   s     r8   r*   .MusicFlamingoForConditionalGeneration.__init__  sS     '/
yy!3!3!?!?ASASA^A^ejkr:   c                 >    U R                   R                  " XU40 UD6$ rj   )rz   r   )r4   r   r   r   r   s        r8   r   8MusicFlamingoForConditionalGeneration.get_audio_features  s     zz,,^R[f_effr:   r   r   r   r   r   r}   r   labelsr   logits_to_keepr   r<   c                    U R                   " SUUUUUUUU	S.UD6nUR                  n[        U
[        5      (       a  [	        U
* S5      OU
nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )aW  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padding feature indices.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss.

Example:

```python
>>> from transformers import MusicFlamingoForConditionalGeneration, AutoProcessor

>>> model_id = "nvidia/audio-flamingo-3-hf"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = MusicFlamingoForConditionalGeneration.from_pretrained(model_id, device_map="auto")
```)r   r   r   r   r   r}   r   r   N)r  r  r  )r  r  r}   r   r   r   r   )rz   r   r   rH   slicer  loss_functionr"   r   r  r  r}   r   r   r   )r4   r   r   r   r   r   r}   r   r  r   r  r   r   r   slice_indicesr  r  s                    r8   re   -MusicFlamingoForConditionalGeneration.forward  s    > ** 

) 3)%+'

 

  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD 3#33!//)) ' ; ;
 	
r:   F)is_first_iterationr$  c                   > UR                  SS 5      nUR                  SS 5      n[        TU ]  " U0 UD6nU(       d  UR                  SS5      (       d  Ub  XFS'   Ub  XVS'   U$ )Nr   r   r   F)popr)   prepare_inputs_for_generationrD   )r4   r$  argsr   r   r   model_inputsr7   s          r8   r'  CMusicFlamingoForConditionalGeneration.prepare_inputs_for_generation  ss    $4d;$jj)>Ew<dMfM\%5%5k5%I%I)1?-.".6I23r:   )r  rz   )
NNNNNNNNNr   )rk   rl   rm   rn   r  _tied_weights_keysr   r*   r   r   r   rI   r  r   r   r   r  rH   r   r   rr   r  re   r'  rt   ru   rv   s   @r8   r  r    sX    %6#6 2 g  .23737.204(,26*.!%-.:
##d*:
 ))D0:
 #\\D0	:

 t+:
 &&-:
 :
 ((4/:
   4':
 $;:
 ell*:
 +,:
 
4	4:
  :
x OT t  r:   r  )r  r   rx   )4collections.abcr   dataclassesr   mathr   typingr   rI   r   r   r	    r   r   activationsr   cache_utilsr   
generationr   modeling_outputsr   r   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   autor   configuration_musicflamingor   Moduler   rx   r   r   r   r   r   r  r  __all__r   r:   r8   <module>r=     sC  , % !   / / & !   ) ` ` 6 - & u u  < V8299 V8r =? = =& 9'> 9 9ryy .
H bf5 bfbfJ 
 9+ 9 9( 
W,H/ W
Wt jr:   