
    3j3                         S SK rSSKJrJr  SSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJrJrJr  \" 5       (       a  S SKr\R$                  " \5      r " S S	\S
S9r\ " S S\	5      5       rS/rg)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)auto_docstringis_torch_availableloggingc                   2    \ rS rSrSS0SSSS.SSS	.S
.rSrg)GlmAsrProcessorKwargs&   paddingTi>  
max_length)sampling_ratereturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       f/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/glmasr/processing_glmasr.pyr   r   &   s2     t
 #%)#
 #"
Ir#   r   F)totalc                     ^  \ rS rSr\r    SU 4S jjr\  SS\\	\   -  S\
S-  S\S-  S\\   S	\4
U 4S
 jjj5       r  SS\
S-  S\\	\   -  S-  S\\   4U 4S jjjrSS jrS\
4S jrS\S\S	\4S jr\S	\	\   4U 4S jj5       r\S	\	\   4S j5       r SS\\	\   -  \
-  S\\	\   -  S-  S\\   S	\4S jjrSS.S jrS rS\S	\4S jrSrU =r$ )GlmAsrProcessor7   Nc                 r   > X@l         UR                  U5      U l        XPl        X`l        [
        TU ]  XUS9  g)aG  
audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
    Special token used to represent audio inputs in the chat template.
default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
    Default prompt to use for transcription tasks when applying transcription requests.
max_audio_len (`int`, *optional*, defaults to 655):
    Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
    655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
)chat_templateN)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr*   r+   r.   r/   	__class__s          r$   r1   GlmAsrProcessor.__init__;   s=    $ ''==kJ,H)**]Sr#   Ftextaudiooutput_labelskwargsreturnc                 $  > SU;   a+  US   S:w  a"  [        U R                  R                   S35      eU(       a  SUS'   [        TU ]  " SX!S.UD6nU(       a0  UR                  S5      nSXfU R                  R                  :H  '   XeS	'   [        USS
9$ )a   
output_labels (bool, *optional*, default=False):
    Whether to return labels for training.

Returns:
    [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
    audio features (`input_features`, `input_features_mask`).
r   r   z% only supports `return_tensors='pt'`.Treturn_mm_token_type_idsr8   r7   mm_token_type_idsilabels)datatensor_typer   )	
ValueErrorr5   r   r0   __call__popr4   pad_token_idr   )r2   r7   r8   r9   r:   model_inputsr@   r5   s          r$   rD   GlmAsrProcessor.__call__S   s    " v%&1A*Bd*J 7 788]^__15F-.w'IeI&I!%%&9:F<@FT^^8889%+"4@@r#   c                    > [         TU ]  " SXS.UD6  UbA  Ub=  [        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      eg g g )Nr>   zGot z
 text but z audios; they must match 1:1.r   )r0   validate_inputslenrC   )r2   r8   r7   r:   r5   s       r$   rJ   GlmAsrProcessor.validate_inputsq   sc     	AeA&A 1c$i3u:6MtCI;jUDabcc 7N 1r#   c                 d    SnS H  u  p4nUSU-  -   US-
  -
  S-
  U-  S-   nM     X-
  U-  S-   nU$ )N   ))   r   rO   )rO   r      rP   rO   r   )r2   audio_lengthsmerge_factorr   kernel_sizestride
num_tokenss          r$   _get_audio_token_length'GlmAsrProcessor._get_audio_token_length|   sY    ,B(G&*Q[8K!OLqPU[[^__M -C $2|CaG
r#   c           
      :   [        US   U R                  R                  -  5      n[        U R                  U R                  R                  -  5      n/ n/ nU H  n[        UR                  S   5      n[        SX-   S-
  U-  5      n	X:  a;  [        R                  SXS   -  S SU R                   SU R                   S35        Un	UR                  U	5        [        XU-  5      n
[        U	5       H,  nX-  n[        US-   U-  U
5      nUR                  X|U 5        M.     M     U R                  R                  U5      nU R                  " U40 UD6nUR                  S	5      US
'   [        R                  " [        R                  " US
   R!                  S5      U5       Vs/ s H  oR!                  5       PM     sn5      nU R#                  U5      US'   / n[        [%        U5      5       H$  nU R'                  UUS9nUR                  U5        M&     UU4$ s  snf )Nr   r   rO   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_masknum_audio_tokens)	audio_idx)intr3   chunk_lengthr/   shapemaxloggerwarningappendminrangefetch_audiorE   torchstacksplitsumrV   rK   replace_audio_token)r2   r8   r:   window_sizemax_windowsper_sample_windowsflat_chunksaudio_el	n_samplesn_wintime_capistartendaudio_inputssrQ   audio_replacementsidxreplacement_texts                       r$   _process_audioGlmAsrProcessor._process_audio   s<   &1D4J4J4W4WWX$,,0F0F0S0SST(*(*HHNN1-.II3a7KGHE"&y/3J'J3&O{[_[m[mZn  oF  GK  GY  GY  FZ  Z\  ] $%%e,9k&9:H5\1q5K/:""8##67 "   &&2259--kDVD.:.>.>?O.P*+
 #kk,7L*M*Q*QRT*UWijkjUUWjk
 ,0+G+G+V'(U$C#77PS7T%%&67 % /// ls   "Hrx   r]   c                 0    US   U   nU R                   U-  $ )Nr\   )r+   )r2   rx   r]   r\   s       r$   rl   #GlmAsrProcessor.replace_audio_token   s%    '(:;IF"222r#   c                     > [         TU ]  S/-   $ )NrZ   )r0   model_input_names)r2   r5   s    r$   r   !GlmAsrProcessor.model_input_names   s    w(,A+BBBr#   c                     S/$ )zNInput names returned always by subprocessors but not used in model's `forward`r\   r   )r2   s    r$   unused_input_names"GlmAsrProcessor.unused_input_names   s     ###r#   promptc           	      ~   [        U[        5      (       a  U/nO[        U[        [        45      (       a*  U(       a#  [	        S U 5       5      (       a  [        U5      nO[        [        U5      5      n[        5       (       a]  U Vs/ s HP  n[        U[        R                  5      (       a,  UR                  5       R                  5       R                  5       OUPMR     nn[        U5      nUS:X  a  [        S5      eUc  U R                  /U-  nO[        U[        5      (       a  U/U-  nO[        U[        [        45      (       a  [        U5      U:w  a  [        S[        U5       SU S35      e/ nU HT  nUc  UR                  U R                  5        M#  [        U[        5      (       a  UR                  U5        MK  [!        S5      e   O[!        S5      e[#        Xt5       V	V
s/ s H-  u  pS	[        U
[        5      (       a  S
U
S.OS
U
S.SU	S./S./PM/     nn	n
U R$                  " U4SSSS.UD6$ s  snf s  sn
n	f )a  
Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

Args:
    audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
        the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
    prompt (`str` or `list[str]`, *optional*):
        Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
        each sample uses `"Transcribe the input speech."`.
    **kwargs:
        Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
        `text_kwargs`, `audio_kwargs`, ...).

Returns:
    [`BatchFeature`]: Processor outputs ready to be passed to [`GlmAsrForConditionalGeneration.generate`].

c              3   B   #    U  H  n[        U[        5      v   M     g 7fN)
isinstancestr).0els     r$   	<genexpr>>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>   s     ?d^cXZ
2s@S@S^cs   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userr8   )typepath)r   r8   r7   )r   r7   )rolecontentT)tokenizeadd_generation_promptreturn_dict)r   r   listtupleallr   r   rh   TensordetachcpunumpyrK   rC   r.   rd   	TypeErrorzipapply_chat_template)r2   r8   r   r:   audio_itemsr   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r$   apply_transcription_request+GlmAsrProcessor.apply_transcription_request   s/   2 eS!!38'Ke}--%C?d^c?d<d<du+K1%89K!##kvwkvegJr5<<<X<Xryy{0668^``kvw%
?HII>889JFG$$h+Gu..6{j( F}OJ<Gkl  G<NN4#D#DEc**NN4(#$MNN  Z[[ ,/w+D
 ,E' # &j#66 ")*=&-
C!'=	 
 ,E 	 
 ''
"&	

 
 	
S x4
s   AH4&4H9)strip_prefixc                    U R                   R                  " U0 UD6nU(       a   U Vs/ s H  oPR                  U5      PM     nnU$ s  snf )aB  
Forward arguments to [`~PreTrainedTokenizer.decode`] and optionally remove the assistant framing the model
was trained to produce.

AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
)r4   decode"_strip_assistant_prefix_and_quotes)r2   r   argsr:   decodedr7   s         r$   r   GlmAsrProcessor.decode  sJ     ..''88QXYQX>>tDQXGY Zs   Ac                 &    U R                   " U0 UD6$ )z)BC as previous examples used batch_decode)r   )r2   r   r:   s      r$   batch_decodeGlmAsrProcessor.batch_decode  s    {{D+F++r#   c                 `   UR                  5       nS H7  nUR                  U5      (       d  M  U[        U5      S R                  5       n  O   UR                  S5      (       a  USS R                  5       n[        U5      S:  a(  US   US   :X  a  US   S;   a  USS R                  5       nU$ )	zY
Remove the assistant prefix and surrounding quotes from a decoded transcription string.
)z"The spoken content of the audio isz!The transcription of the audio isz!The content of the input audio isN.r[   rP   r   >   "'rO   )strip
startswithrK   endswith)r2   r7   strippedprefixs       r$   r   2GlmAsrProcessor._strip_assistant_prefix_and_quotes  s    
 ::<
F
 ""6**#CKM288:
 S!!}**,Hx=A(1+""=(1+Q[B["~++-Hr#   )r+   r-   r.   r/   )Nz<|pad|>z&Please transcribe this audio into texti  )NF)NN)rQ   torch.Tensorr;   r   r   ) r   r   r   r    r   valid_processor_kwargsr1   r   r
   r   r   boolr	   r   rD   r   rJ   rV   r}   dictr^   r   rl   propertyr   r   r   r   r   r   r"   __classcell__)r5   s   @r$   r'   r'   7   s   2 %MT0  $(%*	A$y/)A D A d{	A
 ./A 
A A> $(37	dD 	d $y/)D0	d )*		d 	d(0J (0T3 3 3 3 C49 C C $DI $ $ *.O
T#Y+O
 d3i$&O
 ./	O

 
O
b */ ,s s  r#   r'   )r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   r   rh   
get_loggerr   rb   r   r'   __all__r   r#   r$   <module>r      sw   ,  9 4 H H 0 @ @  
		H	%,E " {n { {| 
r#   