
    3j:                        S SK Jr  S SKrS SKJr  SSKJr  SSKJr  SSKJ	r	  SSK
JrJrJr  SS	KJr  SS
KJrJrJrJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJr  SSK J!r!   " S S\5      r" " S S\5      r# " S S\5      r$\" SS9 " S S\5      5       r% " S S\RL                  5      r'\" SS9\ " S S \5      5       5       r(\" S!S9 " S" S#\$5      5       r)\" S$S9 " S% S&\$\	5      5       r*/ S'Qr+g)(    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)merge_with_config_defaults)capture_outputs   )	AutoModel)Qwen2AudioAttentionQwen2AudioEncoderQwen2AudioEncoderLayerQwen2AudioPreTrainedModel   )VoxtralConfigc                       \ rS rSrSrg)VoxtralAttention+    N__name__
__module____qualname____firstlineno____static_attributes__r       e/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/voxtral/modular_voxtral.pyr   r   +       r%   r   c                       \ rS rSrSrg)VoxtralEncoderLayer/   r   Nr   r   r%   r&   r)   r)   /   r'   r%   r)   c                   (    \ rS rSrSrSrSrSrSrSr	g)VoxtralPreTrainedModel3   TNr   )
r    r!   r"   r#   _supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph_no_split_modulesr$   r   r%   r&   r,   r,   3   s      "&!r%   r,   z:
    The Voxtral encoder, which is a Whisper encoder.
    custom_introc                   V    \ rS rSr\\S.r\\ SS\	\
   S\\-  4S jj5       5       rSrg)	VoxtralEncoder<   )
attentionshidden_statesNkwargsreturnc           	         U R                   R                  U R                  R                  S   -  U R                  R                  S   -  nUR
                  S   U:w  a"  [        SU SUR
                  S    SU S35      eUR                  U R                  R                  R                  U R                  R                  R                  S9n[        R                  R                  U R                  U5      5      n[        R                  R                  U R	                  U5      5      nUR                  SSS	5      nU R                  R                  nXV-   R                  UR                  5      n[        R                  R!                  XpR                   U R"                  S
9n[%        U R&                  5       H  u  pU	" UUS9nM     U R)                  U5      n[+        US9$ )a  
Args:
    input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
        Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
        obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
        `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
        `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
        and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
    attention_mask (`torch.Tensor`)`, *optional*):
        Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
        but it is not used. By default the silence in the input log mel spectrogram are ignored.
r   z7Voxtral expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .dtypedevicer   r   )ptraining)attention_mask)last_hidden_state)configmax_source_positionsconv1strideconv2shape
ValueErrortoweightr@   rA   r   
functionalgelupermuteembed_positionsdropoutrC   	enumeratelayers
layer_normr
   )
selfinput_featuresrD   r:   expected_seq_lengthinputs_embeds	embed_posr9   idxencoder_layers
             r&   forwardVoxtralEncoder.forwardG   s   ( #kk>>ARARSTAUUX\XbXbXiXijkXll#'::IJ]I^^jkykk  AC  lD  kE  Er  sF  rG  GH  I  (**1B1B1H1HQUQ[Q[QbQbQiQi*j**4::n+EF**4::m+DE%--aA6((//	&266}7J7JK--m||VZVcVc-d"+DKK"8C)-M #9 6)+
 	
r%   r   N)r    r!   r"   r#   r   r)   _can_record_outputsr   r   r   r   tupler
   r^   r$   r   r%   r&   r6   r6   <   sS     ',
   +
 +,	+

 
+	++
   +
r%   r6   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )VoxtralMultiModalProjectorw   rF   c                 ^  > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  SS9U l        [        UR                     U l        [        R                  " UR                  R                  UR                  R                  SS9U l        g NF)bias)super__init__r   Linearaudio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2rW   rF   	__class__s     r&   rj   #VoxtralMultiModalProjector.__init__x   sz    		&"5"5"G"GI[I[IgIgnst&556		&"4"4"@"@&BTBTB`B`glmr%   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r`   )rp   rr   rs   )rW   audio_featuresr9   s      r&   r^   "VoxtralMultiModalProjector.forward~   s2    n5/m4r%   )rr   rp   rs   )	r    r!   r"   r#   r   rj   r^   r$   __classcell__ru   s   @r&   rd   rd   w   s    n} n r%   rd   zL
    Base class for Voxtral outputs, with hidden states and attentions.
    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)VoxtralModelOutputWithPast   z[
audio_hidden_states (`torch.FloatTensor`, *optional*):
    Projected audio hidden states.
Naudio_hidden_statesr   )
r    r!   r"   r#   __doc__r   torchFloatTensor__annotations__r$   r   r%   r&   r}   r}      s    
 59**T18r%   r}   z
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a Llama language model,
    without a language modeling head.
    c                     ^  \ rS rSrU 4S jr\\" SS9S\R                  S\	\
   S\\-  4S j5       5       rS	\R                  S
\R                  S\R                  4S jr\\       SS	\R                  S-  S\R                  S-  S\R                   S-  S\R                  S-  S\S-  S
\R                  S-  S\S-  S\	\
   S\\-  4S jj5       5       rSrU =r$ )VoxtralModel   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        U5      U l	        U R                  5         g r`   )ri   rj   r   from_configrl   audio_towerrn   language_modelrd   multi_modal_projector	post_initrt   s     r&   rj   VoxtralModel.__init__   sY     $001D1DE'33F4F4FG%?%G"r%   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r3   rX   r:   r;   c                     U R                   " U4SS0UD6nUR                  nUR                  SU R                  R                  R
                  5      nU R                  U5      nXSl        U$ )a)  
input_features (`torch.FloatTensor`):
    Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
    `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
    and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
return_dictTr=   )r   rE   reshaperF   rl   rm   r   pooler_output)rW   rX   r:   audio_outputsr   audio_embedss         r&   get_audio_featuresVoxtralModel.get_audio_features   sj     ((TTTVT+==199"dkk>V>V>h>hi112EF&2#r%   	input_idsrZ   rx   c           	      &   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r?   r=   r   z6Audio features and audio tokens do not match, tokens: z, features: )get_input_embeddingsr   tensorrF   audio_token_idlongrA   allsumrK   	unsqueeze	expand_asrM   r   numel)rW   r   rZ   rx   special_audio_maskn_audio_tokensn_audio_featuress          r&   get_placeholder_mask!VoxtralModel.get_placeholder_mask   s     !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r%   NrD   position_idspast_key_values	use_cachec           	         Uc  U R                  5       " U5      nS n	UbW  UbT  U R                  USS9R                  n	U R                  XU	S9n
UR	                  XR                  UR                  5      5      nU R                  " SUUUUUS.UD6n[        UR                  UR                  UR                  UR                  U	S9$ )NT)r   )rZ   rx   )rD   r   r   rZ   r   )rE   r   r9   r8   r   r   )r   r   r   r   masked_scatterrM   rA   r   r}   rE   r   r9   r8   )rW   r   rX   rD   r   r   rZ   r   r:   r   r   outputss               r&   r^   VoxtralModel.forward   s       557	BM%)*?22>t2TbbL "&!:!:| "; " *889K__]j]q]qMrsM+/+>+> ,
)%+',
 ,
 *%77#33!//)) ,
 	
r%   )r   r   r   )NNNNNNN)r    r!   r"   r#   rj   r   r   r   r   r   r   rb   r
   r   
LongTensorr   Tensorr   boolr}   r^   r$   rz   r{   s   @r&   r   r      s\     w#//;ABT;U	+	+ &"))":?:K:K"]b]n]n"0  .237.204(,26!%'
##d*'
 ))D0'
 t+	'

 &&-'
 '
 ((4/'
 $;'
 +,'
 
+	+'
  '
r%   r   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a Llama language model.
    c                   n  ^  \ rS rSrS/rU 4S jrS r\\         SS\	R                  S-  S\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\S-  S\	R                  S-  S\	R                  S-  S\S-  S\\	R                  -  S\\   S\\-  4S jj5       5       rU 4S jrSrU =r$ )VoxtralForConditionalGeneration   rR   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g rg   )ri   rj   r   modelr   rk   rn   ro   
vocab_sizelm_headr   rt   s     r&   rj   (VoxtralForConditionalGeneration.__init__  sS     !&)
yy!3!3!?!?ASASA^A^ejkr%   c                 :    U R                   R                  " U0 UD6$ r`   )r   r   )rW   argsr:   s      r&   r   2VoxtralForConditionalGeneration.get_audio_features  s    zz,,d=f==r%   Nr   rX   rD   r   r   rZ   labelsr   logits_to_keepr:   r;   c
                    U R                   " SUUUUUUUS.U
D6nUR                  n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.U
D6n[        UUUR                  UR                  UR                  S9$ )a  
Example:

```python
>>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

>>> processor = AutoProcessor.from_pretrained(repo_id)
>>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

>>> conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
            },
            {"type": "text", "text": "What can you tell me about this audio?"},
        ],
    }
]

>>> inputs = processor.apply_chat_template(conversation)
>>> inputs = inputs.to(device, dtype=torch.bfloat16)

>>> outputs = model.generate(**inputs, max_new_tokens=30)
>>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
```)r   rX   rD   r   r   rZ   r   N)logitsr   r   )lossr   r   r9   r8   r   )r   rE   
isinstanceintslicer   loss_functionrF   rn   r   r   r   r9   r8   )rW   r   rX   rD   r   r   rZ   r   r   r   r:   r   r9   slice_indicesr   r   s                   r&   r^   'VoxtralForConditionalGeneration.forward  s    ` ** 	
))%+'	
 	
  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD &#33!//))
 	
r%   c                    > UR                  SS 5      nUR                  SS5      n[        TU ]  " U0 UD6nU(       d  UR                  SS5      (       d  X5S'   U$ )NrX   is_first_iterationFr   T)popgetri   prepare_inputs_for_generation)rW   r   r:   rX   r   model_inputsru   s         r&   r   =VoxtralForConditionalGeneration.prepare_inputs_for_generation[  s^      $4d;#ZZ(<eDw<dMfMVZZT%B%B-;)*r%   )r   r   )	NNNNNNNNr   )r    r!   r"   r#   _keep_in_fp32_modules_strictrj   r   r   r   r   r   r   r   r   r   r   r   r   rb   r   r^   r   r$   rz   r{   s   @r&   r   r      s2    %6#6 >  .237.204(,26*.!%-.I
##d*I
 ))D0I
 t+	I

 &&-I
 I
 ((4/I
   4'I
 $;I
 ell*I
 +,I
 
'	'I
  I
V r%   r   )r,   r6   r   r   ),dataclassesr   r   r   activationsr   cache_utilsr   
generationr   modeling_outputsr	   r
   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   autor    qwen2_audio.modeling_qwen2_audior   r   r   r   configuration_voxtralr   r   r)   r,   r6   Modulerd   r}   r   r   __all__r   r%   r&   <module>r      s2    "   !   ) 
 ' a a 7 5   1	* 		0 	6  
3
& 3

3
l  
 9!8 9 9 `
) `
`
F 
e&<o e
eP jr%   