
    3j>!                         S SK rSSKJr  SSKJr  SSKJrJrJ	r	  SSK
Jr  SSKJrJrJr  \" 5       (       a  S SKr\R"                  " \5      r " S S	\S
S9r\ " S S\5      5       rS/rg)    N   )
AudioInput)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)auto_docstringis_torch_availableloggingc                   2    \ rS rSrSS0SSSS.SSS	.S
.rSrg)MusicFlamingoProcessorKwargs&   paddingTi>  
max_length)sampling_ratereturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       t/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/musicflamingo/processing_musicflamingo.pyr   r   &   s2     t
 #%)#
 #"
Ir"   r   F)totalc                   L  ^  \ rS rSr\r     SU 4S jjr\  SS\\	\   -  S\
S-  S\S-  S\\   S\4
U 4S	 jjj5       r  SS\
S-  S\\	\   -  S-  S\\   4U 4S
 jjjrS rS\
4S jrS\S\S\4S jr\S\	\   4U 4S jj5       r\S\	\   4S j5       r\S 5       rSrU =r$ )MusicFlamingoProcessor7   Nc                    > X@l         UR                  U5      U l        Xpl        [        TU ]  XUS9  XPl        X`l        UR                  U5      U l        UR                  U5      U l	        g)a+  
audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
    Special token used to represent audio inputs in the chat template.
audio_bos_token (`Optional[str]`, *optional*, defaults to `"<|sound_bos|>"`):
    Special token used to represent the beginning of audio.
audio_eos_token (`Optional[str]`, *optional*, defaults to `"<|sound_eos|>"`):
    Special token used to represent the end of audio.
max_audio_len (`int`, *optional*, defaults to 1200):
    Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
)chat_templateN)
audio_tokenconvert_tokens_to_idsaudio_token_idmax_audio_lensuper__init__audio_bos_tokenaudio_eos_tokenaudio_bos_token_idaudio_eos_token_id)	selffeature_extractor	tokenizerr)   r*   r0   r1   r-   	__class__s	           r#   r/   MusicFlamingoProcessor.__init__;   si    ( ''==kJ**]S.."+"A"A/"R"+"A"A/"Rr"   textaudiooutput_labelskwargsreturnc                 $  > SU;   a+  US   S:w  a"  [        U R                  R                   S35      eU(       a  SUS'   [        TU ]  " SX!S.UD6nU(       a0  UR                  S5      nSXfU R                  R                  :H  '   XeS	'   [        USS
9$ )a   
output_labels (bool, *optional*, default=False):
    Whether to return labels for training.

Returns:
    [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
    audio features (`input_features`, `input_features_mask`).
r   r   z% only supports `return_tensors='pt'`.Treturn_mm_token_type_idsr:   r9   mm_token_type_idsilabels)datatensor_typer   )	
ValueErrorr7   r   r.   __call__popr6   pad_token_idr   )r4   r9   r:   r;   r<   model_inputsrB   r7   s          r#   rF   MusicFlamingoProcessor.__call__X   s    " v%&1A*Bd*J 7 788]^__15F-.w'IeI&I!%%&9:F<@FT^^8889%+"4@@r"   c                    > [         TU ]  " SXS.UD6  UbA  Ub=  [        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      eg g g )Nr@   zGot z
 text but z audios; they must match 1:1.r   )r.   validate_inputslenrE   )r4   r:   r9   r<   r7   s       r#   rL   &MusicFlamingoProcessor.validate_inputsv   sc     	AeA&A 1c$i3u:6MtCI;jUDabcc 7N 1r"   c                 2    US-
  S-  S-   nUS-
  S-  S-   nU$ )N      r   )r4   audio_lengthsconv_output_lengthsaudio_tokens_lengthss       r#   _get_audio_token_length.MusicFlamingoProcessor._get_audio_token_length   s2    ,q0Q6: 3a 7A=A##r"   c           
      :   [        US   U R                  R                  -  5      n[        U R                  U R                  R                  -  5      n/ n/ nU H  n[        UR                  S   5      n[        SX-   S-
  U-  5      n	X:  a;  [        R                  SXS   -  S SU R                   SU R                   S35        Un	UR                  U	5        [        XU-  5      n
[        U	5       H,  nX-  n[        US-   U-  U
5      nUR                  X|U 5        M.     M     U R                  R                  U5      nU R                  " U40 UD6nUR                  S	5      US
'   [        R                  " [        R                  " US
   R!                  S5      U5       Vs/ s H  oR!                  5       PM     sn5      nU R#                  U5      US'   / n[        [%        U5      5       H$  nU R'                  UUS9nUR                  U5        M&     UU4$ s  snf )Nr   r   rP   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_masknum_audio_tokens)	audio_idx)intr5   chunk_lengthr-   shapemaxloggerwarningappendminrangefetch_audiorG   torchstacksplitsumrU   rM   replace_audio_token)r4   r:   r<   window_sizemax_windowsper_sample_windowsflat_chunksaudio_el	n_samplesn_wintime_capistartendaudio_inputssrR   audio_replacementsidxreplacement_texts                       r#   _process_audio%MusicFlamingoProcessor._process_audio   s<   &1D4J4J4W4WWX$,,0F0F0S0SST(*(*HHNN1-.II3a7KGHE"&y/3J'J3&O{[_[m[mZn  oF  GK  GY  GY  FZ  Z\  ] $%%e,9k&9:H5\1q5K/:""8##67 "   &&2259--kDVD.:.>.>?O.P*+
 #kk,7L*M*Q*QRT*UWijkjUUWjk
 ,0+G+G+V'(U$C#77PS7T%%&67 % /// ls   "Hrw   r\   c                 d    US   U   nU R                   U R                  U-  -   U R                  -   $ )Nr[   )r0   r*   r1   )r4   rw   r\   r[   s       r#   rk   *MusicFlamingoProcessor.replace_audio_token   s;    '(:;IF##d&6&69I&IIDL`L```r"   c                     > [         TU ]  S/-   $ )NrY   )r.   model_input_names)r4   r7   s    r#   r   (MusicFlamingoProcessor.model_input_names   s    w(,A+BBBr"   c                     S/$ )zNInput names returned always by subprocessors but not used in model's `forward`r[   r   r4   s    r#   unused_input_names)MusicFlamingoProcessor.unused_input_names   s     ###r"   c                 H    U R                   U R                  U R                  /$ )N)r,   r2   r3   r   s    r#   	audio_ids MusicFlamingoProcessor.audio_ids   s!    ##T%<%<d>U>UVVr"   )r0   r2   r1   r3   r*   r,   r-   )Nz<sound>z<|sound_bos|>z<|sound_eos|>i  )NF)NN)r   r   r   r   r   valid_processor_kwargsr/   r
   r	   listr   boolr   r   rF   r   rL   rU   r|   dictr]   strrk   propertyr   r   r   r!   __classcell__)r7   s   @r#   r&   r&   7   s^   9 ''S:  $(%*	A$y/)A D A d{	A
 56A 
A A> $(37	dD 	d $y/)D0	d )*		d 	d$
(0J (0Ta a a a C49 C C $DI $ $ W Wr"   r&   )numpynpaudio_utilsr   feature_extraction_utilsr   processing_utilsr   r   r   tokenization_utils_baser	   utilsr
   r   r   rg   
get_loggerr   ra   r   r&   __all__r   r"   r#   <module>r      sz   ,  % 4 H H 0 @ @  
		H	%#35 " GW^ GW GWT $
$r"   