
    3jhH                        S SK Jr  S SKrSSKJr  SSKJrJr  SSK	J
r
  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJrJrJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.  SSK/J0r0J1r1  \" 5       (       a
  S SK2r2S SK2J3r3  \Rh                  " \55      r6 " S S\(5      r7\ " S S\'5      5       r8 " S S\*5      r9S1S jr:\ " S S\,5      5       r; " S  S!\3Rx                  5      r= " S" S#\5      r> " S$ S%\%5      r? " S& S'\?5      r@ " S( S)\$5      rA\" S*S+9 " S, S-\#5      5       rB\" S*S+9 " S. S/\"5      5       rC/ S0QrDg)2    )CallableN   )ACT2FN)
AudioInputmake_list_of_audio)Cache)BatchFeature)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringis_torch_availablelogging)can_return_tuplemerge_with_config_defaultsno_inherit_decorator)capture_outputs   )&AudioFlamingo3ForConditionalGenerationAudioFlamingo3Model!AudioFlamingo3MultiModalProjectorAudioFlamingo3PreTrainedModel)AudioFlamingo3ProcessorAudioFlamingo3ProcessorKwargs)GlmRotaryEmbedding)LlamaAttentioneager_attention_forwardrotate_half   )GlmAsrConfigGlmAsrEncoderConfig)nnc                       \ rS rSrSrg)GlmAsrProcessorKwargs2    N__name__
__module____qualname____firstlineno____static_attributes__r)       c/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/glmasr/modular_glmasr.pyr'   r'   2       Cr0   r'   c            	          ^  \ rS rSr    SU 4S jjrSS jr SS\\\   -  \-  S\\\   -  S-  S\	\
   S\4S	 jjrS
rU =r$ )GlmAsrProcessor5   Nc           	      *   > [         TU ]  UUUUUUS9  g)aG  
audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
    Special token used to represent audio inputs in the chat template.
default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
    Default prompt to use for transcription tasks when applying transcription requests.
max_audio_len (`int`, *optional*, defaults to 655):
    Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
    655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
)chat_templateaudio_tokendefault_transcription_promptmax_audio_lenN)super__init__)selffeature_extractor	tokenizerr7   r8   r9   r:   	__class__s          r1   r<   GlmAsrProcessor.__init__7   s)    $ 	'#)E' 	 	
r0   returnc                 d    SnS H  u  p4nUSU-  -   US-
  -
  S-
  U-  S-   nM     X-
  U-  S-   nU$ )N   )r"   r   r"   )r"   r   r   r   r"   r)   )r=   audio_lengthsmerge_factorpaddingkernel_sizestride
num_tokenss          r1   _get_audio_token_length'GlmAsrProcessor._get_audio_token_lengthR   sY    ,B(G&*Q[8K!OLqPU[[^__M -C $2|CaG
r0   audiopromptkwargsc           	      ~   [        U[        5      (       a  U/nO[        U[        [        45      (       a*  U(       a#  [	        S U 5       5      (       a  [        U5      nO[        [        U5      5      n[        5       (       a]  U Vs/ s HP  n[        U[        R                  5      (       a,  UR                  5       R                  5       R                  5       OUPMR     nn[        U5      nUS:X  a  [        S5      eUc  U R                  /U-  nO[        U[        5      (       a  U/U-  nO[        U[        [        45      (       a  [        U5      U:w  a  [        S[        U5       SU S35      e/ nU HT  nUc  UR                  U R                  5        M#  [        U[        5      (       a  UR                  U5        MK  [!        S5      e   O[!        S5      e[#        Xt5       V	V
s/ s H-  u  pS	[        U
[        5      (       a  S
U
S.OS
U
S.SU	S./S./PM/     nn	n
U R$                  " U4SSSS.UD6$ s  snf s  sn
n	f )a  
Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

Args:
    audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
        the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
    prompt (`str` or `list[str]`, *optional*):
        Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
        each sample uses `"Transcribe the input speech."`.
    **kwargs:
        Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
        `text_kwargs`, `audio_kwargs`, ...).

Returns:
    [`BatchFeature`]: Processor outputs ready to be passed to [`GlmAsrForConditionalGeneration.generate`].

c              3   B   #    U  H  n[        U[        5      v   M     g 7fN)
isinstancestr).0els     r1   	<genexpr>>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>u   s     ?d^cXZ
2s@S@S^cs   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userrN   )typepath)r[   rN   text)r[   r]   )rolecontentT)tokenizeadd_generation_promptreturn_dict)rT   rU   listtupleallr   r   torchTensordetachcpunumpylen
ValueErrorr9   append	TypeErrorzipapply_chat_template)r=   rN   rO   rP   audio_itemsrW   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r1   apply_transcription_request+GlmAsrProcessor.apply_transcription_requestZ   s/   2 eS!!38'Ke}--%C?d^c?d<d<du+K1%89K!##kvwkvegJr5<<<X<Xryy{0668^``kvw%
?HII>889JFG$$h+Gu..6{j( F}OJ<Gkl  G<NN4#D#DEc**NN4(#$MNN  Z[[ ,/w+D
 ,E' # &j#66 ")*=&-
C!'=	 
 ,E 	 
 ''
"&	

 
 	
S x4
s   AH4&4H9r)   )Nz<|pad|>z&Please transcribe this audio into texti  )rF   torch.TensorrB   rz   rS   )r+   r,   r-   r.   r<   rL   rU   rc   r   r   r'   r	   rx   r/   __classcell__r@   s   @r1   r4   r4   5   sv     %M
6 *.O
T#Y+O
 d3i$&O
 ./	O

 
O
 O
r0   r4   c                       \ rS rSrSrg)GlmAsrRotaryEmbedding   r)   Nr*   r)   r0   r1   r~   r~      s    r0   r~   c                 R   UR                  U5      nUR                  U5      nUR                  S   nU SS U24   U SUS 24   pUSS U24   USUS 24   pXr-  [        U5      U-  -   nX-  [        U	5      U-  -   n[        R                  " X/SS9n[        R                  " X/SS9nX4$ )N.)dim)	unsqueezeshaper!   rf   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r1   apply_rotary_pos_embr      s    
--
&C
--
&C2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {{51C78G{{51C78G ii)r2Gii)r2Gr0   c                      ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S-  S\
\   S	\	\R                  \R                  4   4S
 jjrSrU =r$ )GlmAsrAttention   config	layer_idxc                   > [         TU ]  X5        SU l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l
        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR                  U R                  -  UR
                  SS9U l        g )NFT)bias)r;   r<   	is_causalr%   Linearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projr=   r   r   r@   s      r1   r<   GlmAsrAttention.__init__   s    +ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^eijii : :T]] JFL^L^eijr0   Nhidden_statesposition_embeddingsrP   rB   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nUu  p[        XgX5      u  pg[        R                  " U R                  R                  [        5      nU" U UUU4S U R                  (       d  SOU R                  U R                  S.UD6u  pUR                   " / UQSP76 R#                  5       nU R%                  U5      nX4$ )Nr   r"   r   g        )attention_maskdropoutscaling)r   r   r   view	transposer   r   r   r   get_interfacer   _attn_implementationr    trainingattention_dropoutr   reshape
contiguousr   )r=   r   r   rP   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightss                 r1   forwardGlmAsrAttention.forward   s\    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ (?(M(MKK,,.E)
 %8		%

  #}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r0   )r   r   r   r   r   rS   r+   r,   r-   r.   r#   intr<   rf   rg   rd   r   r   r   r/   r{   r|   s   @r1   r   r      s    k| k k IM!)||!) #5<<#=>E!) +,	!)
 
u||U\\)	*!) !)r0   r   c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )	GlmAsrMLP   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  5      U l        [        UR                     U l
        g rS   )r;   r<   r%   r   r   intermediate_sizefc1fc2r   
hidden_actact_fnr=   r   r@   s     r1   r<   GlmAsrMLP.__init__   s\    99V//1I1IJ99V55v7I7IJV../r0   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rS   )r   r   r   )r=   r   s     r1   r   GlmAsrMLP.forward   s2    /M2/r0   )r   r   r   )
r+   r,   r-   r.   r<   rf   rg   r   r/   r{   r|   s   @r1   r   r      s    0U\\  r0   r   c            	          ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S-  S\
\   S	\R                  4S
 jjrSrU =r$ )GlmAsrEncoderLayer   r   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  5      U l
        g )N)r   r   )r;   r<   r   r   	self_attnr   mlpr%   	LayerNorminput_layernormpost_attention_layernormr   s      r1   r<   GlmAsrEncoderLayer.__init__   sb    !--(LV$!||F,>,>?(*V5G5G(H%r0   Nr   r   rP   rB   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)r   r   r)   )r   r   r   r   )r=   r   r   rP   residual_s         r1   r   GlmAsrEncoderLayer.forward  s|     !,,];>> 
' 3
 

 !0 !55mD/ 0r0   )r   r   r   r   r   rS   r   r|   s   @r1   r   r      su    I| I I IM|| #5<<#=>E +,	
 
 r0   r   c                       \ rS rSrSrg)GlmAsrPreTrainedModeli   r)   Nr*   r)   r0   r1   r   r      r2   r0   r   c                      ^  \ rS rSr% \\S'   SrSrS/r\	\
S.rS\4U 4S jjr\\\S\\   4S	 j5       5       5       rS
rU =r$ )GlmAsrEncoderi$  r   input_featuresrN   r   )r   
attentionsc           	        > [         TU ]  U5        [        R                  " UR                  UR
                  SSS9U l        [        R                  " UR
                  UR
                  SSSS9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                  " UR
                  5      U l        [        US9U l        SU l        U R%                  5         g s  snf )Nr   r"   )rI   rH   r   )rI   rJ   rH   )r   F)r;   r<   r%   Conv1dnum_mel_binsr   conv1conv2
ModuleListrangenum_hidden_layersr   layersr   normr~   
rotary_embgradient_checkpointing	post_initr   s      r1   r<   GlmAsrEncoder.__init__.  s     YYv22F4F4FTU_`a
YYv1163E3EST]^hij
mmDI&JbJbDcdDcy2Dcd
 LL!3!34	/v>&+# es   DrP   c                    [         R                  R                  U R                  U5      5      n[         R                  R                  U R	                  U5      5      nUR                  SS5      nUnU R                  U[        R                  " UR                  S   UR                  S9S S S 24   S9nU R                   H  nU" U4SU0UD6nM     U R                  U5      n[        US9$ )Nr"   r   device)r   r   )last_hidden_state)r%   
functionalgelur   r   r   r   rf   aranger   r   r   r   r   )r=   r   rP   inputs_embedsr   r   encoder_layers          r1   r   GlmAsrEncoder.forward;  s     **4::n+EF**4::m+DE%//15%"oo]5H5H5KTaThTh(ijnpqjq(r . 
 "[[M)-kM`kdjkM ) 		-0)MJJr0   )r   r   r   r   r   r   )r+   r,   r-   r.   r$   __annotations__main_input_nameinput_modalities_no_split_modulesr   r   _can_record_outputsr<   r   r   r   r   r   r   r/   r{   r|   s   @r1   r   r   $  sl    &O-.+%
2   K7I0J K    Kr0   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )GlmAsrMultiModalProjectoriO  r   c                 >  > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  S-  5      U l        [        R                  " UR                  R                  S-  UR                  R                  5      U l	        g )Nr   )
r;   r<   r%   r   audio_configr   text_configr   linear_1linear_2r   s     r1   r<   "GlmAsrMultiModalProjector.__init__P  sm    		&"5"5"G"GI[I[IgIgjkIkl		&"4"4"@"@1"DfFXFXFdFder0   )r  r  )r+   r,   r-   r.   r#   r<   r/   r{   r|   s   @r1   r   r   O  s    f| f fr0   r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                   ~    \ rS rSr\\" SS9S\R                  S\R                  S\	\
   S\\-  4S j5       5       rS	rg
)GlmAsrModeliV  zgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r  r   input_features_maskrP   rB   c                 $   U R                   " U4SS0UD6nUR                  nUR                  UR                  S   SU R                  R
                  R                  5      nU R                  U5      nUR                  S5      nS H  u  pn
USU-  -   U	S-
  -
  S-
  U
-  S-   nM     SnX{-
  U-  S-   n[        R                  " UR                  S   UR                  S	9S S S 24   US S 2S 4   :  nXmR                  UR                  5         Ul        U$ )
Nrb   Tr   r   rE   r   r"   rD   r   )audio_towerr   r   r   r   r  r   multi_modal_projectorsumrf   r   r   topooler_output)r=   r   r  rP   audio_outputsaudio_hidden_statesaudio_embedsrF   rH   rI   rJ   rG   post_lengths
valid_masks                 r1   get_audio_featuresGlmAsrModel.get_audio_features\  s.    ((TTTVT+==199  #R)A)A)S)S
 112EF+//3,B(G&*Q[8K!OLqPU[[^__M -C%4EI\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
&2==ATAT3U&V#r0   r)   N)r+   r,   r-   r.   r   r   rf   FloatTensorrg   r   r   rd   r   r  r/   r)   r0   r1   r
  r
  V  sb     ~)) #\\ +,	
 
+	+ r0   r
  c                   j  ^  \ rS rSrSS0rU 4S jr          SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
S-  S\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\\   S\4U 4S jjjrSrU =r$ )GlmAsrForConditionalGenerationiy  zlm_head.weightz(model.language_model.embed_tokens.weightc                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rS   )r;   r<   r
  modelr   r   s     r1   r<   'GlmAsrForConditionalGeneration.__init__  s&      (
r0   N	input_idsr   r  r   r   past_key_valuesr   labels	use_cachelogits_to_keeprP   rB   c                 6   > [         TU ]  " SUUUUUUU	U
S.UD6$ )ap  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

>>> model_id = "zai-org/GLM-ASR-Nano-2512"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
>>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

>>> inputs = inputs.to(model.device, dtype=model.dtype)

>>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

>>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
>>> print(decoded_outputs)
```)r  r   r   r   r   r!  r"  r#  r)   )r;   r   )r=   r  r   r  r   r   r   r   r!  r"  r#  rP   r@   s               r1   r   &GlmAsrForConditionalGeneration.forward  s>    T w 

)%+')

 

 
	
r0   )r  )
NNNNNNNNNr   )r+   r,   r-   r.   _tied_weights_keysr<   rf   
LongTensorr  rg   r   boolr   r   r   r   r   r/   r{   r|   s   @r1   r  r  y  s    +,VW .23737.204(,26*.!%-.4
##d*4
 ))D04
 #\\D0	4

 t+4
 &&-4
 4
 ((4/4
   4'4
 $;4
 ell*4
 +,4
 
 4
 4
r0   r  )r   r  r
  r4   r   )Nr"   )Ecollections.abcr   rj   npactivationsr   audio_utilsr   r   cache_utilsr   feature_extraction_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   r   r   utils.output_capturingr   &audioflamingo3.modeling_audioflamingo3r   r   r   r   (audioflamingo3.processing_audioflamingo3r   r   glm.modeling_glmr   llama.modeling_llamar   r    r!   configuration_glmasrr#   r$   rf   r%   
get_loggerr+   loggerr'   r4   r~   r   r   Moduler   r   r   r   r   r
  r  __all__r)   r0   r1   <module>r?     sh   %  ! 9   4 9 R 5 & T T _ _ 5  n 1 W W C  
		H	% @9 ? s
- s
 s
l 5. 4$ *)n *) *)Z		  3  F @9 ?(K) (KVf A f 
% 
< 
<
%K <

<
~r0   