
    3j}                        S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJr  SSK	J
r  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJrJrJr  SSKJrJr  SSKJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-  SSK.J/r/J0r0J1r1  SSK2J3r3J4r4J5r5  \'Rl                  " \75      r8 " S S\Rr                  5      r: " S S\Rr                  5      r;\% " S S\5      5       r< SLS\Rr                  S\Rz                  S\Rz                  S \Rz                  S!\Rz                  S-  S"\>S#\>4S$ jjr? " S% S&\Rr                  5      r@ " S' S(\Rr                  5      rA " S) S*\5      rB " S+ S,\Rr                  5      rC " S- S.\<5      rD " S/ S0\Rr                  5      rE " S1 S2\Rr                  5      rF " S3 S4\Rr                  5      rG " S5 S6\Rr                  5      rH " S7 S8\Rr                  5      rI " S9 S:\5      rJ " S; S<\Rr                  5      rK " S= S>\<5      rL\%" S?S@9\ " SA SB\#5      5       5       rM\%" SCS@9 " SD SE\<5      5       rN\%\ " SF SG\5      5       5       rO\%" SHS@9 " SI SJ\<\5      5       rP/ SKQrQg)M    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GenerationMixin)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastSeq2SeqLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipVideoConfigInstructBlipVideoQFormerConfigInstructBlipVideoVisionConfigc                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\S\R                  4S jjrSrU =r$ )!InstructBlipVideoVisionEmbeddings<   configc                 r  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " SSU R                  5      5      U l        [        R                  " SU R                  U R                  U R                  S9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R                  " [        R                  " SU R                  U R                  5      5      U l        g )Nr$   r   )in_channelsout_channelskernel_sizestrider    )super__init__r+   hidden_size	embed_dim
image_size
patch_sizer   	Parametertorchrandnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingselfr+   	__class__s     z/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.pyr2   *InstructBlipVideoVisionEmbeddings.__init__=   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\k 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"b    
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r$   Ng      ?r   r   r    bicubicF)sizemodealign_cornersdim)shaper?   r8   jit
is_tracingr6   r   reshapepermuter   
functionalinterpolateviewcat)rA   rF   rG   rH   r=   r>   class_pos_embedpatch_pos_embedrQ   
new_height	new_widthsqrt_num_positionss               rC   interpolate_pos_encoding:InstructBlipVideoVisionEmbeddings.interpolate_pos_encodingO   sS    !&&q)A-//55a81< yy##%%+*F6?***11!RaR%811!QR%8r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCrE   pixel_valuesr`   c                    UR                   u  p4pVU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      R	                  U5      n	[        R                  " X/SS9n
U(       a  U R                  XU5      nOU R                  nXS S 2S U
R                  S5      2S S 24   R	                  U5      -   n
U
$ )N)dtyper    r$   rK   rP   )rR   r<   weightrd   toflatten	transposer:   expandr8   rZ   r`   r?   rM   )rA   rb   r`   
batch_size_rG   rH   target_dtypepatch_embedsclass_embedsrF   r?   s               rC   forward)InstructBlipVideoVisionEmbeddings.forwardw   s    '3'9'9$
v++2288++LOO,O,OP#++A.88A>++22:q"EHHVYY;C
#!%!>!>zSX!Y!%!8!8Q8L*//!:L8La5O"P"S"ST`"aa
rE   )	r:   r+   r4   r5   r=   r>   r<   r6   r?   F)__name__
__module____qualname____firstlineno__r'   r2   r8   Tensorintr`   FloatTensorboolro   __static_attributes____classcell__rB   s   @rC   r)   r)   <   sr    c< c$&D5<< &D &DUX &D]b]i]i &DPE$5$5 QU bgbnbn  rE   r)   c                   >   ^  \ rS rSrSrU 4S jr    SS jrSrU =r$ )"InstructBlipVideoQFormerEmbeddings   z;Construct the embeddings from word and position embeddings.c                 "  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R!                  S["        R$                  " UR                  5      R'                  S5      SS9  Xl        g )N)padding_idxepsposition_idsr$   rK   F)
persistent)r1   r2   r   	Embedding
vocab_sizer3   pad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_eps	layernormDropouthidden_dropout_probdropoutregister_bufferr8   arangeri   r+   r@   s     rC   r2   +InstructBlipVideoQFormerEmbeddings.__init__   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 rE   c                    Ub  UR                  5       S   nOSnUc%  U R                  S S 2XEU-   24   R                  5       nUbY  U R                  U5      nU R	                  UR                  UR                  5      5      nXg-   nUb  [        R                  " X64SS9nOUnUR                  U R                  R                  R                  5      nU R                  U5      nU R                  U5      nU$ )Nr$   r   rP   )rM   r   cloner   r   rf   devicer8   rZ   r   re   rd   r   )rA   	input_idsr   query_embedspast_key_values_length
seq_lengthrF   r   s           rC   ro   *InstructBlipVideoQFormerEmbeddings.forward   s      ")!,JJ,,Q0FVlIl0l-lmssuL --i8J"&":":<??:K\K\;]"^#9J'"YY'AqI
%J]]4>>#8#8#>#>?
^^J/
\\*-
rE   )r+   r   r   r   r   NNNr   )	rr   rs   rt   ru   __doc__r2   ro   rz   r{   r|   s   @rC   r~   r~      s#    E"   rE   r~   c                      ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSr/ SQr\R                   " 5       U 4S j5       rSrU =r$ )	 InstructBlipVideoPreTrainedModel   r+   blip)videotextT)r~   InstructBlipVideoAttentionInstructBlipVideoEncoderLayerInstructBlipVideoQFormerLayer*InstructBlipVideoQFormerMultiHeadAttention"InstructBlipVideoQFormerSelfOutputc                 V  > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g[	        U[        [        45      (       a!  [        R                  " UR                  5        g[	        U[        5      (       a\  [        R                  " UR                   ["        R$                  " UR                   R&                  S   5      R)                  S5      5        gg)zInitialize the weights        )meanstdrK   r   N)r1   _init_weightsr+   initializer_range
isinstancer)   inittrunc_normal_r?   r:   )InstructBlipVideoForConditionalGenerationInstructBlipVideoModelzeros_query_tokensr~   copy_r   r8   r   rR   ri   )rA   modulefactorrB   s      rC   r   .InstructBlipVideoPreTrainedModel._init_weights   s     	f%..f?@@v88sOv55CVL!JLb cddKK++, BCCJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh DrE    )rr   rs   rt   ru   r%   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr8   no_gradr   rz   r{   r|   s   @rC   r   r      s\    ##(&*#"&N! ]]_
i 
irE   r   r   querykeyvalueattention_maskscalingr   c                 `   [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  USS9n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrK   rP   )ptrainingr$   r    )	r8   matmulrh   r   rW   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             rC   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rE   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	S\R                  S	\
\R                  \R                  S
-  \
\R                     S
-  4   4S jrSrU =r$ )r      z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        SU l
        UR                  U l        [        R                  " U R                  SU R                  -  SS9U l        UR                  (       ai  [        R                   " ["        R$                  " U R                  5      5      n[        R                   " ["        R$                  " U R                  5      5      nOS nS nUbQ  ["        R&                  " U["        R(                  " USS9U45      n[        R                   " U5      U R                  l        [        R                  " U R                  U R                  5      U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr   )bias)requires_grad)r1   r2   r+   r3   r4   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   Linearqkvqkv_biasr7   r8   zerosrZ   
zeros_liker   
projection)rA   r+   q_biasv_biasr   rB   s        rC   r2   #InstructBlipVideoAttention.__init__   ss   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCrE   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr$   r    )rY   r   r   rh   r   )rA   r   r   r   s       rC   _shape!InstructBlipVideoAttention._shape  s5    {{3GQQRSUVWbbddrE   hidden_statesrI   Nc                    UR                  5       u  p4nU R                  U5      nUR                  X4SU R                  XPR                  -  5      R	                  SSSSS5      nUS   US   US   pn[
        R                  " U R                  R                  [        5      n
U
" U UUU	4SU R                  (       d  SOU R                  U R                  S.UD6u  pUR                  X4S	5      R                  5       nU R                  U5      nX4$ )
z#Input shape: Batch x Time x Channelr   r    r   r$      Nr   )r   r   r   rK   )rM   r   rU   r   rV   r   get_interfacer+   _attn_implementationr   r   r   r   r   r   )rA   r   r   r   tgt_lenr4   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   s                rC   ro   "InstructBlipVideoAttention.forward  s    #0"4"4"6iHH]+	%%cAt~~yTbTbGbckkq!Q
	 2;1y|YWX\,(?(M(MKK,,.E)
 %8		%

  #}}C$2H2HJJ	%
 	%
! "))#;FFHook2((rE   )	r   r+   r4   r   r   r   r   r   r   )rr   rs   rt   ru   r   r2   r8   rv   rw   r   tuplero   rz   r{   r|   s   @rC   r   r      su    GD>eU\\ eC ec e")||") 
u||U\\D0%2E2LL	M	") ")rE   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoMLPi>  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)r1   r2   r+   r	   
hidden_actactivation_fnr   r   r3   intermediate_sizefc1fc2r@   s     rC   r2   InstructBlipVideoMLP.__init__?  sb    #F$5$5699V//1I1IJ99V55v7I7IJrE   r   rI   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )r  r  r  rA   r   s     rC   ro   InstructBlipVideoMLP.forwardF  s4    /**=9/rE   )r  r+   r  r  
rr   rs   rt   ru   r2   r8   rv   ro   rz   r{   r|   s   @rC   r  r  >  s)    KU\\ ell  rE   r  c                   ~   ^  \ rS rSrS\4U 4S jjr\S\R                  S\	\
   S\R                  4S j5       rSrU =r$ )	r   iM  r+   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Nr   )r1   r2   r3   r4   r   	self_attnr   r   r   layer_norm1r  mlplayer_norm2r@   s     rC   r2   &InstructBlipVideoEncoderLayer.__init__N  sm    ++3F;<<F<Q<QR'/<<F<Q<QRrE   r   r   rI   c                     UnU R                  U5      nU R                  " SSU0UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )Nr   r   )r  r  r  r  )rA   r   r   residualrk   s        rC   ro   %InstructBlipVideoEncoderLayer.forwardV  su     !((7>> 
'

 &0 ((7/%0rE   )r4   r  r  r  r  )rr   rs   rt   ru   r%   r2   r   r8   rv   r   r   rx   ro   rz   r{   r|   s   @rC   r   r   M  sR    S6 S || +, 
			 rE   r   c                   \   ^  \ rS rSrSrS\4U 4S jjr\S\\	   S\
\-  4S j5       rSrU =r$ )	InstructBlipVideoEncoderim  a
  
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InstructBlipVideoEncoderLayer`].

Args:
    config (`InstructBlipVideoConfig`):
        The corresponding vision configuration for the `InstructBlipVideoEncoder`.
r+   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r1   r2   r+   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rA   r+   rk   rB   s      rC   r2   !InstructBlipVideoEncoder.__init__w  sU    mmTYZ`ZrZrTs$tTsq%B6%JTs$tu&+# %u   A&r   rI   c                 P    UnU R                    H  nU" U40 UD6nM     [        US9$ )Nlast_hidden_state)r   r   )rA   inputs_embedsr   r   encoder_layers        rC   ro    InstructBlipVideoEncoder.forward}  s9     &![[M)M ) ??rE   )r+   r!  r   )rr   rs   rt   ru   r   r%   r2   r   r   r   r   r   ro   rz   r{   r|   s   @rC   r  r  m  sL    ,6 , @ +,@ 
	 	@ @rE   r  c                      ^  \ rS rSr% SrSr\\S'   \\	S.r
S\4U 4S jjr\\" SS9\  SS\R                   S	-  S
\S\\   S\\-  4S jj5       5       5       rS rSrU =r$ )InstructBlipVideoVisionModeli  rb   r   r+   )r   
attentionsc                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r  )r1   r2   r+   r3   r)   rF   r  encoderr   r   r   post_layernorm	post_init)rA   r+   r4   rB   s      rC   r2   %InstructBlipVideoVisionModel.__init__  sY     &&	;FC/7 ll9:O:OPrE   F)tie_last_hidden_statesNr`   r   rI   c                     Uc  [        S5      eU R                  XS9nU R                  " SSU0UD6nUR                  nU R	                  U5      nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r`   r'  r   r&  pooler_outputr   )r   rF   r.  r&  r/  r   )rA   rb   r`   r   r   encoder_outputsr&  pooled_outputs           rC   ro   $InstructBlipVideoVisionModel.forward  s     ?@@h+/<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
rE   c                     U R                   $ r  )rF   rA   s    rC   get_input_embeddings1InstructBlipVideoVisionModel.get_input_embeddings  s    rE   )r+   rF   r.  r/  r  )rr   rs   rt   ru   main_input_namer   r'   r   r   r   _can_record_outputsr2   r   r   r   r8   rx   ry   r   r   r   r   ro   r;  rz   r{   r|   s   @rC   r+  r+    s    $O))60
	< 	  E2 26).
''$.
 #'
 +,	

 
+	+
  3  
6 rE   r+  c                   h   ^  \ rS rSrSU 4S jjrS rS rS rS rS r	   SS\
\   4S	 jjrS
rU =r$ )r   i  c                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a5  [        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        U(       aa  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        O`[        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        SU l        g )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)F)r1   r2   r+   r3   r   hasattrr   rw   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   r   attention_probs_dropout_probr   save_attentionrA   r+   is_cross_attentionrB   s      rC   r2   3InstructBlipVideoQFormerMultiHeadAttention.__init__  sb    : ::a?PVXhHiHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EF#rE   c                     Xl         g r  attn_gradients)rA   rM  s     rC   save_attn_gradients>InstructBlipVideoQFormerMultiHeadAttention.save_attn_gradients  s    ,rE   c                     U R                   $ r  rL  r:  s    rC   get_attn_gradients=InstructBlipVideoQFormerMultiHeadAttention.get_attn_gradients  s    """rE   c                     Xl         g r  attention_map)rA   rU  s     rC   save_attention_map=InstructBlipVideoQFormerMultiHeadAttention.save_attention_map  s    *rE   c                     U R                   $ r  rT  r:  s    rC   get_attention_map<InstructBlipVideoQFormerMultiHeadAttention.get_attention_map  s    !!!rE   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )NrK   r   r    r$   r   )rM   r   rC  rY   rV   )rA   xnew_x_shapes      rC   transpose_for_scores?InstructBlipVideoQFormerMultiHeadAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$rE   r   c                    US LnU(       aC  U R                  U R                  U5      5      nU R                  U R                  U5      5      nUnO@U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      n	U R                  U	5      n
[        R
                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUR                  nUb  X-   n[        R                  " SS9" U5      R                  U5      nU(       a=  U R                  (       a,  U R                  U5        UR!                  U R"                  5        U R%                  U5      n[        R
                  " X5      nUR'                  SSSS5      R)                  5       nUR+                  5       S S U R,                  4-   nUR.                  " U6 nX4$ )NrK   r   rP   r   r    r$   r   )r^  r   r   r   r8   r   rh   mathsqrtrC  rd   r   Softmaxrf   rG  rV  register_hookrN  r   rV   r   rM   rD  rY   )rA   r   r   encoder_hidden_statesencoder_attention_maskr   rI  	key_layervalue_layermixed_query_layerquery_layerattention_scoresattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                    rC   ro   2InstructBlipVideoQFormerMultiHeadAttention.forward  s    3$>11$((;P2QRI33DJJ?T4UVK3N11$((=2IJI33DJJ}4MNK JJ}5//0AB !<<5H5HR5PQ+dii8P8P.QQ!1!7!7%/@ **,-=>AABXY$"5"5##O4))$*B*BC #',,"?%<J%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD--rE   )rD  rC  rU  rM  r+   r   r   r   r   rG  r   rq   NNN)rr   rs   rt   ru   r2   rN  rQ  rV  rY  r^  r   r   ro   rz   r{   r|   s   @rC   r   r     sF    $0-#+"% "#4. +,4. 4.rE   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )r   i$  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g r  )r1   r2   r   r   r3   denser   r   r   r   r   r@   s     rC   r2   +InstructBlipVideoQFormerSelfOutput.__init__%  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rE   r   input_tensorrI   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r  ru  r   r   rA   r   rw  s      rC   ro   *InstructBlipVideoQFormerSelfOutput.forward+  5    

=1]3}'CDrE   r   ru  r   r  r|   s   @rC   r   r   $  6    >U\\  RWR^R^  rE   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\\	   S	\R                  4S
 jjr
SrU =r$ )!InstructBlipVideoQFormerAttentioni2  c                 b   > [         TU ]  5         [        X5      U l        [	        U5      U l        g r  )r1   r2   r   	attentionr   outputrH  s      rC   r2   *InstructBlipVideoQFormerAttention.__init__3  s&    CF_8@rE   Nr   r   re  rf  r   rI   c                 Z    U R                   " SUUUUS.UD6u  pgU R                  Xa5      nU$ )N)r   r   re  rf  r   r  r  )	rA   r   r   re  rf  r   r   rk   attention_outputs	            rC   ro   )InstructBlipVideoQFormerAttention.forward8  sF      
')"7#9	

 
  ;;{BrE   r  rq   rr  )rr   rs   rt   ru   r2   r8   rv   rx   r   r   ro   rz   r{   r|   s   @rC   r  r  2  s    A 48:>;? ||  ))D0   %0047	 
 !& 1 1D 8  +,  
   rE   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )$InstructBlipVideoQFormerIntermediateiK  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r  )r1   r2   r   r   r3   r  ru  r   r  strr	   intermediate_act_fnr@   s     rC   r2   -InstructBlipVideoQFormerIntermediate.__init__L  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rE   r   rI   c                 J    U R                  U5      nU R                  U5      nU$ r  ru  r  r  s     rC   ro   ,InstructBlipVideoQFormerIntermediate.forwardT  s&    

=100?rE   r  r  r|   s   @rC   r  r  K  s(    9U\\ ell  rE   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoQFormerOutputiZ  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r1   r2   r   r   r  r3   ru  r   r   r   r   r   r@   s     rC   r2   'InstructBlipVideoQFormerOutput.__init__[  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rE   r   rw  rI   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r  ry  rz  s      rC   ro   &InstructBlipVideoQFormerOutput.forwarda  r|  rE   r}  r  r|   s   @rC   r  r  Z  r~  rE   r  c                   T   ^  \ rS rSrU 4S jr    SS\\   4S jjrS rS r	Sr
U =r$ )	r   ih  c                 ^  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        X l        X!R                  -  S:X  a  [	        USS9U l        SU l	        OSU l	        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        g )Nr$   r   T)rI  F)r1   r2   chunk_size_feed_forwardseq_len_dimr  r  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionr  intermediater  r  intermediate_queryoutput_queryrA   r+   r  rB   s      rC   r2   &InstructBlipVideoQFormerLayer.__init__i  s    '-'E'E$:6B"7771<"CF_c"dD'+D$',D$@H4V<"Fv"N:6BrE   r   c           
      l   U R                   " U4SU0UD6nUS:  a  US S 2S U2S S 24   nU R                  (       a%  Uc  [        S5      eU R                  " U4UUUS.UD6n[	        U R
                  U R                  U R                  U5      n	UR                  S   U:  ag  [	        U R                  U R                  U R                  US S 2US 2S S 24   5      R                  U	R                  5      n
[        R                  " X/SS9n	U	$ [	        U R                  U R                  U R                  U5      n	U	$ )Nr   r   z>encoder_hidden_states must be given for cross-attention layers)r   re  rf  r$   rP   )r  r  r   r  r   feed_forward_chunk_queryr  r  rR   feed_forward_chunkrf   r   r8   rZ   )rA   r   r   re  rf  query_lengthr   r  query_attention_outputlayer_outputlayer_output_texts              rC   ro   %InstructBlipVideoQFormerLayer.forward}  si     >>
)
 
 !%5a,6I%J"''(0$%eff)-)<)<**#1*?+A	*
 *& 5--,,  &	L  %%a(<7$=++00$$$Qq%89	%
 "\(() "  %yy,)JPQR  5'',,   	L rE   c                 J    U R                  U5      nU R                  X!5      nU$ r  )r  r  rA   r  intermediate_outputr  s       rC   r  0InstructBlipVideoQFormerLayer.feed_forward_chunk  s)    "//0@A{{#6IrE   c                 J    U R                  U5      nU R                  X!5      nU$ r  )r  r  r  s       rC   r  6InstructBlipVideoQFormerLayer.feed_forward_chunk_query  s+    "556FG(()<OrE   )
r  r  r  r  r  r  r  r  r  r  r   )rr   rs   rt   ru   r2   r   r   ro   r  r  rz   r{   r|   s   @rC   r   r   h  s<    C. "#3 +,3j
 rE   r   c                   R   ^  \ rS rSrU 4S jr\    SS\\   4S jj5       rSr	U =r
$ )InstructBlipVideoQFormerEncoderi  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf r  )
r1   r2   r+   r   r  r  r  r   layerr!  r  s      rC   r2   (InstructBlipVideoQFormerEncoder.__init__  sY    ]]OTU[UmUmOnoOn)*6=Ono

 ',# pr#  r   c                     [        U R                  R                  5       H   nU R                  U   nU" UUU4UUS.UD6nM"     [	        US9$ )N)rf  r  r%  )r  r+   r  r  r   )	rA   r   r   re  rf  r  r   ilayer_modules	            rC   ro   'InstructBlipVideoQFormerEncoder.forward  sf     t{{445A::a=L(% (>) M 6 9+
 	
rE   )r+   r!  r  r   )rr   rs   rt   ru   r2   r   r   r   ro   rz   r{   r|   s   @rC   r  r    s:    ,  "#
 +,
 
rE   r  c                     ^  \ rS rSrSrSrSrSrSr\	\
" \SSS9/\
" \SSS9/S.rS	\4U 4S
 jjrS rS r\\\     SS\R*                  S\R,                  S-  S\R*                  S-  S\R.                  S-  S\R,                  S-  S\R,                  S-  S\\   S\\R,                     \-  4S jj5       5       5       rSrU =r$ )InstructBlipVideoQFormerModeli  z
Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
instruction as input.
Fr$   z
.attention)index
layer_namez.crossattention)r   r,  cross_attentionsr+   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r  )r1   r2   r+   r~   rF   r  r.  r0  r@   s     rC   r2   &InstructBlipVideoQFormerModel.__init__  s7     <VD6v>rE   c                 .    U R                   R                  $ r  rF   r   r:  s    rC   r;  2InstructBlipVideoQFormerModel.get_input_embeddings   s    ...rE   c                 $    XR                   l        g r  r  )rA   r   s     rC   set_input_embeddings2InstructBlipVideoQFormerModel.set_input_embeddings  s    */'rE   Nr   r   r   r   re  rf  r   rI   c                 >   Uc  Uc  [        S5      eUb  UR                  S   OSnU R                  UUUS9n	[        U R                  U	US9nUb  [        U R                  U	UUS9nU R
                  " U	4UUUUS.UD6n
U
R                  nUSS2SSS24   n[        UUS	9$ )
a  
query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
    Hidden states to be used in the attention computation. If cross-attention,
    will be used for the query (i.e., key and value will use the encoder_hidden_states).
Nz7You have to specify query_embeds when input_ids is Noner$   r   )r   r   r   )r+   r'  r   )r+   r'  r   re  )r   re  rf  r  r4  )r   rR   rF   r   r+   r.  r&  r   )rA   r   r   r   r   re  rf  r   r  embedding_outputr6  sequence_outputr7  s                rC   ro   %InstructBlipVideoQFormerModel.forward  s    $ !5VWW0<0H|))!,a??%% + 
 3;;*)
 "-%>{{.5&;	&" ,0<<,
)"7#9%,
 ,
 *;;'1a0;-'
 	
rE   )r+   rF   r.  )NNNNN)rr   rs   rt   ru   r   r   r   r   r   r   r   r   r>  r&   r2   r;  r  r   r   r   r8   
LongTensorrx   rv   r   r   r   r   ro   rz   r{   r|   s   @rC   r  r    sB   
 #( N 7EQ[gh
 EQ[lm
= /0   4804,0:>;?6
##6
 ))D06
 &&-	6

 llT)6
  %00476
 !& 1 1D 86
 +,6
 
u  	!$P	P6
    6
rE   r  zV
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    )custom_introc                       \ rS rSr% SrSr\\R                     S-  \	S'   Sr
\\R                     S-  \	S'   Sr\S-  \	S'   Sr\S-  \	S'   Sr\\-  S-  \	S'   S	\\   4S
 jrSrg)4InstructBlipVideoForConditionalGenerationModelOutputiB  a~  
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
    Outputs of the language model.
Nlosslogitsvision_outputsqformer_outputslanguage_model_outputsrI   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f)r  r  r  N)getattrto_tuple).0krA   s     rC   	<genexpr>PInstructBlipVideoForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>]  sC      
 ! WW Gq!**,- !s   25)r   keysr:  s   `rC   r  =InstructBlipVideoForConditionalGenerationModelOutput.to_tuple\  s%     
 YY[	
 
 	
rE   r   )rr   rs   rt   ru   r   r  r   r8   rx   r   r  r  r   r  r   r  r   r   r   r  rz   r   rE   rC   r  r  B  s     -1D%!!
"T
)0.2FE%##$t+28<N.5<KOOADHONR2_DtKR
%* 
rE   r  z`
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    c                     ^  \ rS rSrSrS/rS\4U 4S jjrS rS\	R                  S\	R                  4S	 jr\\        SS\	R                  S\	R                  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\S\S
-  S\\   S\\-  4S jj5       5       rSrU =r$ )r   ie  rb   r   r+   c                   > [         TU ]  U5        [        UR                  5      U l        [
        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        UR                  5      U l        [
        R                  " UR                  R                  UR                   R                  5      U l        [$        R&                  " UR                   5      U l        U R+                  5         g Nr$   )r1   r2   r+  vision_configvision_modelr   r7   r8   r   num_query_tokensqformer_configr3   r   r  qformerr   text_configlanguage_projectionr!   from_configlanguage_modelr0  r@   s     rC   r2   InstructBlipVideoModel.__init__n  s     89M9MNLLQ8O8OQWQfQfQrQr)st4V5J5JK#%99V-B-B-N-NPVPbPbPnPn#o '33F4F4FG 	rE   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggz
Some pre-processing hacks to make the model `accelerate` compatible. Check
https://github.com/huggingface/transformers/pull/21707 for more details.
r$   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_maplenr8   cudadevice_countloggerwarningrB  r  r  io_same_devicerA   r  s     rC   _preprocess_accelerate-InstructBlipVideoModel._preprocess_accelerate{  |    
 **}!&6m&KPUPZPZPgPgPilmPmNNM 4&&
33:>D((7 4rE   r   r'  c           	      d   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  UR                  5      nU$ zJ
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
rd   r   rK   )
r;  r8   r   r+   image_token_idlongr   all	unsqueezerf   rA   r   r'  special_image_masks       rC   get_placeholder_mask+InstructBlipVideoModel.get_placeholder_mask       !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H/99"=@@AUAUV!!rE   Nqformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskr`   	use_cacher   rI   c           	      t   UR                   u  ppnUR                  X-  XU5      nU R                  " SUU	S.UD6nUS   n[        R                  " UR                  5       SS [        R                  UR                  S9nU R                  R                  UR                   S   SS5      n[        R                  " UR                  5       SS [        R                  UR                  S9nUc  [        R                  " U5      nUR                  USS9nUR                  USS9nUR                  UR                  5      n[        R                  " UU/SS9nU R                  " SUUUUUS.UD6nUS   SS2SUR                  S5      2SS24   nU R                  U5      nUR                  XR                   R"                  U-  S5      nUcR  U R$                  R'                  5       " U5      nX@R                   R(                  :H  nUc  [        R                  " U5      nOiXR'                  5       " [        R*                  " U R                   R(                  [        R                  UR                  S95      :H  nUR-                  S5      nUR/                  S5      R                  UR                  5      nUR                  UR                  UR0                  5      nUR3                  UU5      nU R                   R4                  (       a  U R$                  " SUUU
S	.UD6nOU R$                  " SUUUUU
S
.UD6n[7        UUUS9$ )aU  
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    Only relevant in case an encoder-decoder language model (like T5) is used.
rb   r`   r   NrK   r  rP   r$   r   r   r   re  rf  r'  r   r  )r'  r   r  r  r  r  r   )rR   rU   r  r8   onesrM   r  r   r   ri   	ones_likerepeat_interleaverf   rZ   r  r  r+   r  r  r;  video_token_idr   r  r  rd   masked_scatteruse_decoder_only_language_modelr  )rA   rb   r	  r
  r   r   r  r  r'  r`   r  r   rj   frameschannelrG   rH   r  image_embedsimage_attention_maskr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsr  outputss                              rC   ro   InstructBlipVideoModel.forward  sS   P 6B5G5G2
GU#++J,?RWX** 
%%=
 

 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!7!:!:;O;V;V!W!&,@BX+Y_`!a 
'1%".#7
 
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j++JfJfioJoqs t  //DDFyQM!*kk.H.H!H%!&!;!.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;/99"=@@AUAUV 5 8 89M9M}ObOb c%445GI^_;;66)) +-# 	G )) +-"3'=# G D))#*
 	
rE   r  r  r  r   r  )NNNNNNFN)rr   rs   rt   ru   r=  _keep_in_fp32_modulesr%   r2   r  r8   r  rx   r  r   r   rv   ry   r   r   r   r  ro   rz   r{   r|   s   @rC   r   r   e  sZ    %O+,6 ?("e.>.> "uO`O` " 
 ;?.22659:>-1).!%q
''q
 !,,q
 !& 0 04 7	q

 $$t+q
 ((4/q
 !++d2q
 !& 0 04 7q
 ||d*q
 #'q
 $;q
 +,q
 
E	Eq
  q
rE   r   c                   B    \ rS rSr% SrSr\S-  \S'   Sr\	S-  \S'   Sr
g)'BaseModelOutputWithVisionQformerOutputsi  z
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
Nr  r  r   )rr   rs   rt   ru   r   r  r   r   r  r   rz   r   rE   rC   r%  r%    s)     9=N.5<KOOADHOrE   r%  a  
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c                     ^  \ rS rSr% \\S'   SrSrS/rS\4U 4S jjr	S r
S\R                  4S	 jrSU 4S jjrS rS rS\R$                  S\R&                  4S jr\\         SS\R&                  S\R&                  S\R$                  S
-  S\R&                  S
-  S\R$                  S
-  S\R$                  S
-  S\R$                  S
-  S\R&                  S
-  S\R$                  S
-  S\S\S
-  S\\   S\\-  4S jj5       5       r\R:                  " 5             S S\R&                  S\R$                  S
-  S\R$                  S
-  S\R$                  S
-  S\R$                  S
-  S\R&                  S
-  S\S\R$                  4S jj5       r\\  S!S\R&                  S\R$                  S\R$                  S
-  S\S
-  S\\   S\\-  4S jj5       5       r Sr!U =r"$ )"r   i"  r+   rb   Tr   c                   > [         TU ]  U5        [        R                  UR                  5      U l        [        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        R                  UR                  5      U l        [        R                   " UR                  R                  UR"                  R                  5      U l        UR&                  (       a!  [(        R*                  " UR"                  5      nO [,        R*                  " UR"                  5      nX l        U R1                  5         g r  )r1   r2   r+  _from_configr  r  r   r7   r8   r   r  r  r3   r   r  r  r   r  r  r  r"   r  r#   r  r0  )rA   r+   r  rB   s      rC   r2   2InstructBlipVideoForConditionalGeneration.__init__2  s     8EEfFZFZ[LLQ8O8OQWQfQfQrQr)st4AA&BWBWX#%99V-B-B-N-NPVPbPbPnPn#o 111==f>P>PQN2>>v?Q?QRN, 	rE   c                 :    U R                   R                  U5        g r  )r  set_output_embeddings)rA   new_embeddingss     rC   r+  ?InstructBlipVideoForConditionalGeneration.set_output_embeddingsF  s    11.ArE   rI   c                 6    U R                   R                  5       $ r  )r  get_output_embeddingsr:  s    rC   r/  ?InstructBlipVideoForConditionalGeneration.get_output_embeddingsI  s    ""88::rE   Nc                 X   > Uc  U R                   R                  5       $ [        TU ]  US9$ )N)modality)r  get_encoderr1   )rA   r2  rB   s     rC   r3  5InstructBlipVideoForConditionalGeneration.get_encoderL  s1    &&22447&&99rE   c                 6    U R                   R                  5       $ r  )r  get_decoderr:  s    rC   r6  5InstructBlipVideoForConditionalGeneration.get_decoderR  s    ""..00rE   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggr  r  r  s     rC   r  @InstructBlipVideoForConditionalGeneration._preprocess_accelerateU  r  rE   r   r'  c           	      d   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  UR                  5      nU$ r  )
r;  r8   r   r+   r  r  r   r  r  rf   r  s       rC   r  >InstructBlipVideoForConditionalGeneration.get_placeholder_maski  r  rE   r	  r
  r   r  r  labelsr`   r  r   c           
         U R                   " U4UUU
S.UD6nUR                  nUR                  nUR                  nUc  U R	                  5       " U5      nUc  [
        R                  " U5      nUR                  UR                  UR                  5      nU R                  XHS9nUR                  UU5      nU R                  R                  (       aT  U R                  " S	UUUS.UD6nUS   nSnU	b3  U R                  " S	UXR                  R                   R"                  S.UD6nO1U R                  " S	UUUUU	US.UD6nUR$                  nUR&                  n[)        UUUUUS9$ )
a	  
qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
    The sequence used as a prompt to be fed to the Q-Former module.
qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
    Mask to avoid performing attention on padding token indices.

Examples:

```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
>>> import av
>>> import numpy as np

>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

>>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> file_path = hf_hub_download(
...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample uniformly 4 frames from the videWhy is this video funny?o
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)

>>> prompt = "What is happening in the video?"
>>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
"A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
```r	  r
  r`   Nr'  r  r   )r  r<  r   )r'  r   r  r  r<  r  )r  r  r  r  r  r   )get_video_featuresr5  r  r  r;  r8   r  rf   r   rd   r  r  r+   r  r  loss_functionr  r   r  r  r  )rA   rb   r	  r
  r   r   r  r  r'  r<  r`   r  r   video_featuresr  r  r  r  r   r  r  s                        rC   ro   1InstructBlipVideoForConditionalGeneration.forwardx  s   ` CGBYBYC
/#9%=	C

 C
 !/ < <(88'66  557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445GI^_;;66)) +-# 	G QZFD!)) !&[[=T=T=_=_ci
 )) +-"3'=# G <<D^^FC)+#*
 	
rE   c                 X   [        U S5      (       a  U R                  5         UR                  S   n	U R                  UUUUS9n
U
R                  nUc  Uc  U R
                  R                  /U R
                  R                  -  S-  nXR
                  R                  R                  /-   n[        R                  " U/[        R                  UR                  S9nUR                  U	S5      nU R                  5       " U5      nUc  [        R                   " U5      nUR#                  UR                  UR$                  5      nU R'                  XFS9nUR)                  X5      nXeS.nU R*                  R
                  R,                  (       d  XOS	'   U R*                  R.                  " S
0 UDUD6nU$ )aA  
Overrides `generate` function to be able to use the model as a conditional generator.

Args:
    pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
        (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
    qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt to be fed to the Q-Former module.
    qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt for the generation.
    attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embedded representation of the inputs. Should be float, not int tokens.
    interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the positional encoding of the image embeddings.

Returns:
    captions (list): A list of strings of length batch_size * num_captions.
r  r   r>  r   r  r$   r?  )r'  r   r   r   )rB  r  rR   r@  r5  r+   video_token_indexr  r  bos_token_idr8   r   r  r   repeatr;  r  rf   rd   r  r  r  is_encoder_decodergenerate)rA   rb   r	  r
  r   r   r'  r`   generate_kwargsrj   rB  r  video_tokensstart_tokensr  inputsr   s                    rC   rI  2InstructBlipVideoForConditionalGeneration.generate   s   D 4))'')!''*
BFBYBY/#9%=	 CZ C
 !/ < <   $ = =>A]A]]`aa+{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	 557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445G_#0S""))<<"+;%%..KK?KrE   c           	         UR                   u  pgpn
UR                  Xg-  XU
5      nU R                  " S
UUS.UD6n[        UR                  UR
                  UR                  UR                  USS9nUS   n[        R                  " UR                  5       SS [        R                  UR                  S9nU R                  R                  UR                   S   SS5      n[        R                  " UR                  5       SS [        R                  UR                  S9nUc  [        R                  " U5      nUR!                  USS9nUR!                  USS9nUR#                  UR                  5      n[        R$                  " X/SS9nU R&                  " S
UUUUUS	.UD6nUUl        US   SS2SUR                  S5      2SS24   nU R+                  U5      nUR                  X`R,                  R.                  U-  S5      nUUl        U$ )a  
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
    The sequence used as a prompt to be fed to the Q-Former module.
qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
    Mask to avoid performing attention on padding token indices.
r  N)r&  r5  r   r,  r  r  r   rK   r  rP   r$   r  r   )rR   rU   r  r%  r&  r5  r   r,  r8   r  rM   r  r   r   ri   r  r  rf   rZ   r  r  r  r+   r  )rA   rb   r	  r
  r`   r   rj   r  r  rG   rH   r  r  r  r   r  r  r  rB  s                      rC   r@  <InstructBlipVideoForConditionalGeneration.get_video_featuresF  s/   ( 6B5G5G2
GU#++J,?RWX595F5F 6
%%=6
 6

 A,>>(66(66%00) 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!7!:!:;O;V;V!W!&,@+Y_`!a,, 
'1%".#7
 
 *9&&q)!-C|/@/@/C-CQ*FG 11,? (//
KK<X<X[a<acef'5$rE   r"  r  )	NNNNNNNFN)NNNNNFr  )#rr   rs   rt   ru   r%   r   r=  r   r#  r2   r+  r   Moduler/  r3  r6  r  r8   r  rx   r  r   r   ry   r   r   r   r  ro   r   rI  r%  r@  rz   r{   r|   s   @rC   r   r   "  s    $#$O!+,6 (B;ryy ;:1?("e.>.> "uO`O` " 
 ;?.22659:>26*.).!%D
''D
 !,,D
 !& 0 04 7	D

 $$t+D
 ((4/D
 !++d2D
 !& 0 04 7D
 ((4/D
   4'D
 #'D
 $;D
 +,D
 
E	ED
  D
L ]]_ 6::>-12626).C''C !++d2C !& 0 04 7	C
 ##d*C ((4/C ((4/C #'C 
		C CJ 
 ;?05D''D !++D !& 0 04 7	D
 #'+D +,D 
8	8D  DrE   r   )r+  r   r  r   r   )r   )Rra  collections.abcr   dataclassesr   typingr   r8   r    r   r   activationsr	   
generationr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   r   autor!   r"   r#   configuration_instructblipvideor%   r&   r'   
get_loggerrr   r  rQ  r)   r~   r   rv   floatr   r   r  r   r  r+  r   r   r  r  r  r   r  r  r  r   r%  r   __all__r   rE   rC   <module>rf     s  ,  $ !    & ! ) 6 9  G & 6 j j 7 E I I  
		H	%G		 GT/ /d  i  i  iV %II%<<% 
% <<	%
 LL4'% % %.G) G)T299 $> @@ryy @@3#C 3l^. ^.B  		  2299 RYY R$> Rj!
bii !
H^
$D ^
B 
 
; 
 
: 
g
= g

g
T 
	P.H 	P  	P a0PRa aaHrE   