
    3j]                        S SK JrJr  S SKrS SKJr  SSKJr  SSKJ	r	  SSK
Jr  SSKJr  SS	KJrJrJr  SS
KJrJr  SSKJr  SSKJrJrJr  SSKJrJr  SSKJr  SSK J!r!   " S S\RD                  5      r# " S S\RD                  5      r$  S/S\RD                  S\RJ                  S\RJ                  S\RJ                  S\RJ                  S-  S\&S-  S\&S\\   4S jjr' " S S\RD                  5      r( " S  S!\RD                  5      r) " S" S#\5      r* " S$ S%\RD                  5      r+\ " S& S'\5      5       r,\ " S( S)\,5      5       r-\" S*S+9 " S, S-\,5      5       r./ S.Qr/g)0    )CallableIterableN)nn   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )VivitConfigc                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	VivitTubeletEmbeddings'   ae  
This class turns `pixel_values` of shape `(batch_size, num_frames, num_channels, height, width)` into the initial
`hidden_states` (tubelet embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer encoder.

The seq_length equals (num_frames // tubelet_size[0]) * (height // tubelet_size[1]) * (width // tubelet_size[2]).
configc                 N  > [         TU ]  5         UR                  nUR                  n[	        U[
        5      (       a  UOX34nUR                  US   -  US   US   -  -  US   US   -  -  U l        X0l        [        R                  " UR                  UR                  X"S9U l        g )Nr   r      )kernel_sizestride)super__init__tubelet_size
image_size
isinstancer   
num_framesnum_patchesr   Conv3dnum_channelshidden_size
projection)selfr   r#   r$   	__class__s       b/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/vivit/modeling_vivit.pyr"   VivitTubeletEmbeddings.__init__0   s    **&&
#-j(#C#CZ*Ia
 ,q/1!}Q/1!}Q/1 	
 %))!3!3
    pixel_valuesreturnc                     UR                  SS5      nU R                  U5      R                  S5      R                  SS5      $ )Nr   r   )	transposer+   flatten)r,   r1   s     r.   forwardVivitTubeletEmbeddings.forward@   s;    #--a3|,44Q7AA!QGGr0   )r$   r'   r+   )__name__
__module____qualname____firstlineno____doc__r   r"   torchTensorr6   __static_attributes____classcell__r-   s   @r.   r   r   '   s8    
{ 
 HELL HU\\ H Hr0   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	S\	S\R                  4S	 jr
SS
\R                  S\S\R                  4S jjrSrU =r$ )VivitEmbeddingsF   zQ
Construct the CLS token, position and tubelet patch embeddings for video input.
r   c                   > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        [        U5      U l	        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l        [        R                  " UR                  5      U l        UR                  SS  U l        U R                  R"                  U l        g )Nr   )r!   r"   r   	Parameterr=   zerosr*   	cls_tokenr   patch_embeddingsr'   position_embeddingsDropouthidden_dropout_probdropoutr#   
patch_sizer$   )r,   r   r'   r-   s      r.   r"   VivitEmbeddings.__init__K   s    ekk!Q8J8J&KL 6v >++77#%<<A{QPVPbPb0c#d zz&"<"<= --ab1//::r0   
embeddingsheightwidthr2   c                    U R                   R                  nU R                  R                  S   S-
  n[        R
                  R                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                  S   nX R                  S   -  n	X0R                  S   -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr   g      ?r   r   bicubicF)sizemodealign_cornersdim)rI   r'   rJ   shaper=   jit
is_tracingrN   r   reshapepermuter   
functionalinterpolateviewcat)r,   rP   rQ   rR   r'   num_positionsclass_pos_embedpatch_pos_embedrZ   
new_height	new_widthsqrt_num_positionss               r.   interpolate_pos_encoding(VivitEmbeddings.interpolate_pos_encodingW   sY    ++770066q9A= yy##%%+*F6?+++221bqb59221ab59r"q11
__Q//	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr0   r1   rj   c                    UR                   u  p4pVnU R                  U5      nU R                  R                  USS5      n	[        R
                  " X4SS9nU(       a  XR                  XU5      -   nOdX`R                  S   :w  d  XpR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eXR                  -   nU R                  U5      nU$ )	NrT   r   rY   r   zInput image size (*z) doesn't match model (z).)r[   rI   rH   expandr=   rc   rj   r$   
ValueErrorrJ   rM   )
r,   r1   rj   
batch_sizer&   r)   rQ   rR   rP   
cls_tokenss
             r.   r6   VivitEmbeddings.forward~   s    >J>P>P;
e**<8
 ^^**:r2>
YY
7Q?
##&C&CJX]&^^J++u8J/J (% 9+,Adooa.@-AE  $&>&>>J\\*-
r0   )rH   rM   r$   rI   rN   rJ   )F)r8   r9   r:   r;   r<   r   r"   r=   r>   intrj   boolr6   r?   r@   rA   s   @r.   rC   rC   F   sq    
;{ 
;%D5<< %D %DUX %D]b]i]i %DNELL D ]b]i]i  r0   rC   modulequerykeyvalueattention_maskscalingrM   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  US[        R                  S9R                  UR                  5      n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrT         r   r   )rZ   dtype)ptrainingr   )rV   r=   matmulr4   r   r`   softmaxfloat32tor~   rM   r   
contiguous)
ru   rv   rw   rx   ry   rz   rM   r{   attn_weightsattn_outputs
             r.   eager_attention_forwardr      s     **R.D( <<}}Q':;gEL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r0   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\
\R                  \R                  4   4S	 jjrS
rU =r$ )VivitAttention   r   c                   > [         TU ]  5         Xl        UR                  U l        [	        USUR
                  UR                  -  5      U l        UR                  U l        U R                  S-  U l	        SU l
        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  SS9U l        g )Nhead_dimr}   F)biasT)r!   r"   r   num_attention_headsgetattrr*   r   attention_probs_dropout_probattention_dropoutrz   	is_causalr   Linearqkv_biasq_projk_projv_projo_projr,   r   r-   s     r.   r"   VivitAttention.__init__   s*   #)#=#= 
F4F4F&JdJd4de!'!D!D}}d*ii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii : :T]] JFL^L^eijr0   Nhidden_statesry   r{   r2   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  (       d  SOU R                  U R                  S.UD6u  pU
R                  " / UQSP76 R!                  5       n
U R#                  U
5      n
X4$ )NrT   r   r           )rM   rz   )r[   r   r   rb   r4   r   r   r   get_interfacer   _attn_implementationr   r   r   rz   r^   r   r   )r,   r   ry   r{   input_shapehidden_shapequery_states
key_statesvalue_statesattention_interfacer   r   s               r.   r6   VivitAttention.forward   sE    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r0   )
r   r   r   r   r   r   r   r   rz   r   N)r8   r9   r:   r;   r   r"   r=   r>   r   r   tupler6   r?   r@   rA   s   @r.   r   r      sk    k{ k" /3)||) t+) +,	)
 
u||U\\)	*) )r0   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )VivitMLP   r   c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r!   r"   r   r   
hidden_actactivation_fnr   r   r*   intermediate_sizefc1fc2r   s     r.   r"   VivitMLP.__init__   sb    #F$5$5699V//1I1IJ99V55v7I7IJr0   r   r2   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   )r,   r   s     r.   r6   VivitMLP.forward   s4    /**=9/r0   )r   r   r   r   r8   r9   r:   r;   r   r"   r=   r>   r6   r?   r@   rA   s   @r.   r   r      s1    K{ KU\\ ell  r0   r   c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\R                  4S	 jjr
S
rU =r$ )
VivitLayer   r   c                 d  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        R
                  " UR                  UR                  S9U l	        [        U5      U l        [        R                  " UR                  5      U l        g )Neps)r!   r"   r   	attentionr   	LayerNormr*   layer_norm_epslayernorm_beforelayernorm_afterr   mlprK   rL   rM   r   s     r.   r"   VivitLayer.__init__   sz    '/ "V-?-?VEZEZ [!||F,>,>FDYDYZF#zz&"<"<=r0   Nr   ry   r{   r2   c                     UnU R                  U5      nU R                  " X40 UD6u  pU R                  U5      nX-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU$ r   )r   r   rM   r   r   )r,   r   ry   r{   residual_s         r.   r6   VivitLayer.forward   s     !--m<>>-R6R]3%0 !,,];/]3%0r0   )r   rM   r   r   r   r   )r8   r9   r:   r;   r   r"   r=   r>   r   r   r6   r?   r@   rA   s   @r.   r   r      s[    >{ > /3|| t+ +,	
 
 r0   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )VivitPooleri  r   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        g r   )
r!   r"   r   r   r*   pooler_output_sizedenser   
pooler_act
activationr   s     r.   r"   VivitPooler.__init__  s>    YYv1163L3LM
 !2!23r0   r   r2   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r,   r   first_token_tensorpooled_outputs       r.   r6   VivitPooler.forward  s6     +1a40

#566r0   )r   r   r   rA   s   @r.   r   r     s/    4{ 4
U\\ ell  r0   r   c                      ^  \ rS rSr% \\S'   SrSrSrSr	SS/r
SrSrSrSrSr\\S	.rS
r\R*                  " 5       U 4S j5       rSrU =r$ )VivitPreTrainedModeli!  r   vivitr1   )videoTrC   r   )r   
attentionsrI   c                    > [         TU ]  U5        [        U[        5      (       aA  [        R
                  " UR                  5        [        R
                  " UR                  5        gg)zInitialize the weightsN)r!   _init_weightsr%   rC   initzeros_rH   rJ   )r,   ru   r-   s     r.   r   "VivitPreTrainedModel._init_weights4  sH     	f%fo..KK(()KK223 /r0    )r8   r9   r:   r;   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_can_compile_fullgraphr   r   _can_record_outputs_input_embed_layerr=   no_gradr   r?   r@   rA   s   @r.   r   r   !  sv    $O!&*#*L9N"&!#$ ,
]]_4 4r0   r   c                      ^  \ rS rSrSS\S\4U 4S jjjr\\" SS9\	   SS\
R                  S-  S	\S
\
R                  S-  S\\   S\4
S jj5       5       5       rSrU =r$ )
VivitModeli=  r   add_pooling_layerc                   > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        [
        R                  " UR                  UR                  S9U l        U(       a  [        U5      OSU l        U R#                  5         gs  snf )z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
r   N)r!   r"   r   rC   rP   r   
ModuleListrangenum_hidden_layersr   layersr   r*   r   	layernormr   pooler	post_init)r,   r   r   r   r-   s       r.   r"   VivitModel.__init__?  s    
 	 )&1mmvG_G_A`$aA`AZ%7A`$abf&8&8f>S>ST->k&)D	 %bs   CF)tie_last_hidden_statesNr1   rj   ry   r{   r2   c                     U R                  XS9n[        U R                  UUS9nUnU R                   H  nU" Xc40 UD6nM     U R	                  U5      nU R
                  b  U R                  U5      OSn	[        XS9$ )a(
  
Examples:

```python
>>> import av
>>> import numpy as np

>>> from transformers import VivitImageProcessor, VivitModel
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> # prepare video for the model
>>> inputs = image_processor(list(video), return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 3137, 768]
```)rj   )r   inputs_embedsry   N)last_hidden_statepooler_output)rP   r	   r   r   r   r   r   )
r,   r1   rj   ry   r{   embedding_outputr   layersequence_outputr   s
             r.   r6   VivitModel.forwardM  s    j  ??<?k2;;*)

 )[[E!-J6JM !..78<8OO4UY)Oiir0   )r   rP   r   r   r   )T)NFN)r8   r9   r:   r;   r   rt   r"   r   r   r   r=   FloatTensorr>   r   r   r   r6   r?   r@   rA   s   @r.   r   r   =  s    { t    E2 26)..2	^j''$.^j #'^j t+	^j
 +,^j 
$^j  3  ^jr0   r   a  
        ViViT Transformer model with a video classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for Kinetics-400.

        <Tip>

            Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
            setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
            position embeddings to the higher resolution.

        </Tip>
    )custom_introc                      ^  \ rS rSrS\4U 4S jjr\\   SS\R                  S-  S\R                  S-  S\S\\   S	\4
S
 jj5       5       rSrU =r$ )VivitForVideoClassificationi  r   c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NF)r   r   )r!   r"   
num_labelsr   r   r   r   r*   Identity
classifierr   r   s     r.   r"   $VivitForVideoClassification.__init__  ss      ++%@
 OUN_N_bcNc"))F$6$68I8IJikititiv 	r0   Nr1   labelsrj   r{   r2   c                     U R                   " U4SU0UD6nUR                  nU R                  USS2SSS24   5      nSnUb  U R                  " X'U R                  40 UD6n[        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import av
>>> import numpy as np
>>> import torch

>>> from transformers import VivitImageProcessor, VivitForVideoClassification
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> inputs = image_processor(list(video), return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     logits = outputs.logits

>>> # model predicts one of the 400 Kinetics-400 classes
>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
LABEL_116
```rj   Nr   )losslogitsr   r   )r   r   r	  loss_functionr   r   r   r   )	r,   r1   r  rj   r{   outputsr   r  r  s	            r.   r6   #VivitForVideoClassification.forward  s    x $(::$
3K$
OU$
 "33Aq!9:%%fdkkLVLD$!//))	
 	
r0   )r	  r  r   )NNF)r8   r9   r:   r;   r   r"   r   r   r=   r  
LongTensorrt   r   r   r   r6   r?   r@   rA   s   @r.   r  r    s    
{ 
  26*.).	i
''$.i
   4'i
 #'	i

 +,i
 
i
  i
r0   r  )r   r   r  )Nr   )0collections.abcr   r   r=   r    r   r   activationsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_vivitr   Moduler   rC   r>   floatr   r   r   r   r   r   r   r  __all__r   r0   r.   <module>r"     s  * /   & ! 6 9 b b F & B B I 5 ,HRYY H>Lbii Lj !%II%<<% 
% <<	%
 LL4'% T\% % '(%8.)RYY .)bryy  + @"))  4? 4 46 pj% pj pjf x
"6 x
x
v Pr0   