
    3j=                        S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	  SSK
JrJrJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  SSKJrJrJrJrJrJrJ r J!r!  SSK"J#r#  \RH                  " \%5      r& " S S\RN                  5      r( " S S\5      r) " S S\5      r* " S S\5      r+ " S S\5      r, " S S\ 5      r-\ " S S\!5      5       r.\ " S S \5      5       r/\" S!S"9 " S# S$\.5      5       r0/ S%Qr1g)&zHPyTorch ViViT model - modular file inheriting transformer core from ViT.    )IterableN)nn   )initialization)create_bidirectional_mask)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )PreTrainedModelViTAttentionViTEmbeddingsViTLayerViTMLPViTModel	ViTPoolerViTPreTrainedModel   )VivitConfigc                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	VivitTubeletEmbeddings,   ae  
This class turns `pixel_values` of shape `(batch_size, num_frames, num_channels, height, width)` into the initial
`hidden_states` (tubelet embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer encoder.

The seq_length equals (num_frames // tubelet_size[0]) * (height // tubelet_size[1]) * (width // tubelet_size[2]).
configc                 N  > [         TU ]  5         UR                  nUR                  n[	        U[
        5      (       a  UOX34nUR                  US   -  US   US   -  -  US   US   -  -  U l        X0l        [        R                  " UR                  UR                  X"S9U l        g )Nr   r   r   )kernel_sizestride)super__init__tubelet_size
image_size
isinstancer   
num_framesnum_patchesr   Conv3dnum_channelshidden_size
projection)selfr    r&   r'   	__class__s       a/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/vivit/modular_vivit.pyr%   VivitTubeletEmbeddings.__init__5   s    **&&
#-j(#C#CZ*Ia
 ,q/1!}Q/1!}Q/1 	
 %))!3!3
    pixel_valuesreturnc                     UR                  SS5      nU R                  U5      R                  S5      R                  SS5      $ )Nr   r   )	transposer.   flatten)r/   r4   s     r1   forwardVivitTubeletEmbeddings.forwardE   s;    #--a3|,44Q7AA!QGGr3   )r'   r*   r.   )__name__
__module____qualname____firstlineno____doc__r   r%   torchTensorr9   __static_attributes____classcell__r0   s   @r1   r   r   ,   s8    
{ 
 HELL HU\\ H Hr3   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	S\	S\R                  4U 4S	 jjr
SS
\R                  S\S\R                  4S jjrSrU =r$ )VivitEmbeddingsK   zQ
Construct the CLS token, position and tubelet patch embeddings for video input.
r    c                   > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        [        U5      U l	        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l        UR                  SS  U l        U ?g )Nr   )r$   r%   r   	Parameterr@   zerosr-   	cls_tokenr   patch_embeddingsr*   position_embeddingsr&   
patch_size
mask_token)r/   r    r*   r0   s      r1   r%   VivitEmbeddings.__init__P   s    ekk!Q8J8J&KL 6v >++77#%<<A{QPVPbPb0c#d  --ab1Or3   
embeddingsheightwidthr5   c                 j   > [         TU ]  XU5        X R                  S   -  nX0R                  S   -  ng )Nr   r   )r$   interpolate_pos_encodingrN   )r/   rQ   rR   rS   
new_height	new_widthr0   s         r1   rU   (VivitEmbeddings.interpolate_pos_encoding[   s4    (UCq11
__Q//	r3   r4   rU   c                    UR                   u  p4pVnU R                  U5      nU R                  R                  USS5      n	[        R
                  " X4SS9nU(       a  XR                  XU5      -   nOdX`R                  S   :w  d  XpR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eXR                  -   nU R                  U5      nU$ )	Nr   )dimr   zInput image size (*z) doesn't match model (z).)shaperL   rK   expandr@   catrU   r'   
ValueErrorrM   dropout)
r/   r4   rU   
batch_sizer)   r,   rR   rS   rQ   
cls_tokenss
             r1   r9   VivitEmbeddings.forwarda   s    >J>P>P;
e**<8
 ^^**:r2>
YY
7Q?
##&C&CJX]&^^J++u8J/J (% 9+,Adooa.@-AE  $&>&>>J\\*-
r3   )rK   rL   rN   rM   )F)r;   r<   r=   r>   r?   r   r%   r@   rA   intrU   boolr9   rB   rC   rD   s   @r1   rF   rF   K   sk    	{ 	05<< 0 0UX 0]b]i]i 0ELL D ]b]i]i  r3   rF   c                       \ rS rSrSrg)VivitAttentionx    Nr;   r<   r=   r>   rB   rj   r3   r1   rh   rh   x       r3   rh   c                       \ rS rSrSrg)VivitMLP|   rj   Nrk   rj   r3   r1   rn   rn   |   rl   r3   rn   c                       \ rS rSrSrg)
VivitLayer   rj   Nrk   rj   r3   r1   rq   rq      rl   r3   rq   c                       \ rS rSrSrg)VivitPooler   rj   Nrk   rj   r3   r1   rt   rt      rl   r3   rt   c                   ^    \ rS rSr% \\S'   SrSrSS/r\	R                  " 5       S 5       rSrg	)
VivitPreTrainedModel   r    vivit)videorF   rq   c                     [         R                  " X5        [        U[        5      (       aA  [        R
                  " UR                  5        [        R
                  " UR                  5        gg)zInitialize the weightsN)r   _init_weightsr(   rF   initzeros_rK   rM   )r/   modules     r1   r|   "VivitPreTrainedModel._init_weights   sH     	%%d3fo..KK(()KK223 /r3   rj   N)r;   r<   r=   r>   r   __annotations__base_model_prefixinput_modalities_no_split_modulesr@   no_gradr|   rB   rj   r3   r1   rw   rw      s7    !*L9
]]_4 4r3   rw   c                      ^  \ rS rSrSS\S\4U 4S jjjr\\" SS9\	   SS\
R                  S-  S	\S
\
R                  S-  S\\   S\4
S jj5       5       5       rSrU =r$ )
VivitModel   r    add_pooling_layerc                 D   > [         TU ]  U5        [        U5      U l        g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)r$   r%   rF   rQ   )r/   r    r   r0   s      r1   r%   VivitModel.__init__   s    
 	 )&1r3   F)tie_last_hidden_statesNr4   rU   attention_maskkwargsr5   c                     U R                  XS9n[        U R                  UUS9nUnU R                   H  nU" Xc40 UD6nM     U R	                  U5      nU R
                  b  U R                  U5      OSn	[        XS9$ )a(
  
Examples:

```python
>>> import av
>>> import numpy as np

>>> from transformers import VivitImageProcessor, VivitModel
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> # prepare video for the model
>>> inputs = image_processor(list(video), return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 3137, 768]
```)rU   )r    inputs_embedsr   N)last_hidden_statepooler_output)rQ   r   r    layers	layernormpoolerr	   )
r/   r4   rU   r   r   embedding_outputhidden_stateslayersequence_outputpooled_outputs
             r1   r9   VivitModel.forward   s    j  ??<?k2;;*)

 )[[E!-J6JM !..78<8OO4UY)Oiir3   )rQ   )T)NFN)r;   r<   r=   r>   r   rf   r%   r   r   r   r@   FloatTensorrA   r   r   r	   r9   rB   rC   rD   s   @r1   r   r      s    2{ 2t 2 2  E2 26)..2	^j''$.^j #'^j t+	^j
 +,^j 
$^j  3  ^jr3   r   a  
        ViViT Transformer model with a video classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for Kinetics-400.

        <Tip>

            Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
            setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
            position embeddings to the higher resolution.

        </Tip>
    )custom_introc                      ^  \ rS rSrS\4U 4S jjr\\   SS\R                  S-  S\R                  S-  S\S\\   S	\4
S
 jj5       5       rSrU =r$ )VivitForVideoClassificationi  r    c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NF)r   r   )r$   r%   
num_labelsr   ry   r   Linearr-   Identity
classifier	post_init)r/   r    r0   s     r1   r%   $VivitForVideoClassification.__init__  ss      ++%@
 OUN_N_bcNc"))F$6$68I8IJikititiv 	r3   Nr4   labelsrU   r   r5   c                     U R                   " U4SU0UD6nUR                  nU R                  USS2SSS24   5      nSnUb  U R                  " X'U R                  40 UD6n[        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import av
>>> import numpy as np
>>> import torch

>>> from transformers import VivitImageProcessor, VivitForVideoClassification
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> inputs = image_processor(list(video), return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     logits = outputs.logits

>>> # model predicts one of the 400 Kinetics-400 classes
>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
LABEL_116
```rU   Nr   )losslogitsr   
attentions)ry   r   r   loss_functionr    r
   r   r   )	r/   r4   r   rU   r   outputsr   r   r   s	            r1   r9   #VivitForVideoClassification.forward!  s    x $(::$
3K$
OU$
 "33Aq!9:%%fdkkLVLD$!//))	
 	
r3   )r   r   ry   )NNF)r;   r<   r=   r>   r   r%   r   r   r@   r   
LongTensorrf   r   r   r
   r9   rB   rC   rD   s   @r1   r   r     s    
{ 
  26*.).	i
''$.i
   4'i
 #'	i

 +,i
 
i
  i
r3   r   )r   rw   r   )2r?   collections.abcr   r@   r    r   r}   masking_utilsr   modeling_outputsr   r	   r
   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   vit.modeling_vitr   r   r   r   r   r   r   r   configuration_vivitr   
get_loggerr;   loggerModuler   rF   rh   rn   rq   rt   rw   r   r   __all__rj   r3   r1   <module>r      s   O $   & 6 b b & @ @ I 5	 	 	 - 
		H	%HRYY H>*m *Z	\ 		v 		 		) 	 4- 4 4 jj jj jjZ x
"6 x
x
v Pr3   