
    3j'                        S r SSKJr  SSKrSSKJs  Jr  SS	S jjr S       S
S jjr	 S           SS jjr
 S         SS jjrg)u  Vision utility functions for pre-computing very dynamic and
data-dependent tensors that can break model graph capturing.

All functions are standalone (no model weights) and compute tensors from
`grid_thw` + config scalars. They are used by vision encoders and can be
precomputed before `torch.compile` / `torch.export` tracing since they
use untraceable ops (`repeat_interleave`, `.tolist()`, `nonzero()`, loops).

Each `get_*` accepts an optional `kwargs` dict; if it contains the
precomputed tensor under the natural key (`"cu_seqlens"`, `"position_ids"`,
…), the function pops and returns it instead of computing. Vision encoders
write `x = get_vision_x(..., kwargs=kwargs)` and the matching key is
removed from the caller's kwargs as a side-effect of the pop.
    )annotationsNc                V   Ub  UR                  SS5      =nb  U$ [        R                  " U SS2S4   U SS2S4   -  U SS2S4   5      R                  S[        R                  R                  5       (       a  U R                  O[        R                  S9n[        R                  " USSS9$ )	um  Get cumulative sequence lengths from vision grid info, or pop from `kwargs` if precomputed.

Args:
    grid_thw: `(num_images_or_videos, 3)` — temporal, height, width per entry.
    kwargs: optional caller kwargs — if it contains `"cu_seqlens"` it is popped and returned.

Returns:
    `cu_seqlens`: `(total_patches + 1,)` int32 cumulative sequence boundaries.
N
cu_seqlens      r   )dimdtype)r   r   )value)
poptorchrepeat_interleavecumsumjit
is_tracingr	   int32Fpad)grid_thwkwargsr   s      S/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/vision_utils.pyget_vision_cu_seqlensr   #   s     VZZd-KKzX((!Q$(1a4.)H(STVWSW.Y``uyy';';'='=X^^5;; a J 55V1--    c                   Ub  UR                  SS5      =nb  U$ U R                  n[        U[        5      (       a.  [        R
                  " U/US9R                  [        U 5      5      n/ n[        U R                  5       UR                  5       5       GH=  u  u  pVpx[        U5      [        U5      [        U5      [        U5      4u  pVpx[        R                  " XdS9R                  S5      R                  SU5      n	U	R                  Xh-  XU-  U5      R                  SS5      R                  5       n	[        R                  " XtS9R                  S5      R                  US5      n
U
R                  Xh-  XU-  U5      R                  SS5      R                  5       n
UR                  [        R                   " X/SS9R#                  US5      5        GM@     [        R$                  " USS9$ )	u  Get (row, col) position IDs for vision rotary embeddings, or pop from `kwargs` if precomputed.

Args:
    grid_thw: `(num_images_or_videos, 3)`
    spatial_merge_size: merge block size — either a single `int` (same for all images)
        or a `(num_images_or_videos,)` tensor (per-image).
    kwargs: optional caller kwargs — if it contains `"position_ids"` it is popped and returned.

Returns:
    `position_ids`: `(total_tokens, 2)` long — (row, col) position per token.
Nposition_idsdevicer   r   r   r   )r   r   
isinstanceintr   tensorexpandlenziptolistarange	unsqueezereshape	transposeflattenappendstackrepeatcat)r   spatial_merge_sizer   r   r   thw
merge_sizehpos_idswpos_idss              r   get_vision_position_idsr6   5   s    vzz.$/OO|\__F$c**"\\+=*>vNUUVYZbVcdL!$X__%68J8Q8Q8S!T	q!!fc!fc!fc*oEa<<1;;A>EEb!L##AOZjR\]gghiklmuuw<<1;;A>EEaL##AOZjR\]gghiklmuuwEKK(<"ELLQPQRS "U 99\q))r   c                8   Ub-  UR                  SS5      nUR                  SS5      nUb  Ub  XV4$ / nS/nSnX!-  U-  nUS-  n	U R                  5        GHh  u  pn[        U
5      [        U5      [        U5      pn
X-  nX-  n[        R                  " X-  U-  5      R                  XU5      nXU-  -
  nXU-  -
  nUU-   U-  nUU-   U-  n[        R                  " USUSU4SS5      nUR                  U
UUUU5      nUR                  SSS	SS
5      R                  U
UU-  X5      nUS:g  R                  SS	/5      R                  S5      nUR                  S5      nUUS:g     nUR                  UU-   5        UR                  S5      U	-  US   -   nUR                  UR                  5       5        XzU-  U-  -  nGMk     [        R                  " USS9n[        R                  " X`R                  [        R                   S9n[        R"                  " U5      nXV4$ )ux  Get window attention indices, or pop `"window_index"`/`"cu_window_seqlens"` from `kwargs` if both precomputed.

Args:
    grid_thw: `(num_images_or_videos, 3)`
    spatial_merge_size: merge block size from vision config.
    window_size: window size from vision config.
    patch_size: patch size from vision config.
    kwargs: optional caller kwargs — if it contains both `"window_index"` and `"cu_window_seqlens"` they are popped and returned.

Returns:
    `window_index`: `(total_tokens,)` long — reorder indices for windowed attention.
    `cu_window_seqlens`: `(num_windows + 1,)` int32 — cumulative window boundaries.
Nwindow_indexcu_window_seqlensr   r   constantir         r   r   )r   r	   )r   r%   r    r   r&   r(   r   r   permutesumr+   r   extendr.   r!   r   r   unique_consecutive)r   r/   window_size
patch_sizer   r8   r9   window_index_idvit_merger_window_sizespatial_merge_unitgrid_tgrid_hgrid_w
llm_grid_h
llm_grid_windexpad_hpad_wnum_windows_hnum_windows_windex_paddedseqlens	index_newcu_seqlens_tmps                           r   get_vision_window_indexrT   V   s`   ( zz.$7"JJ':DA#(9(E22L cO(>*L+Q."*//"3!$Vc&k3v;1
1
V0:=>FFv[ef&6L)LL&6L)LL#e+0FF#e+0FFuuUQq%$8*dK#++M#9=J`
 $++Aq!Q:BBMM13I
  4',,aV4<<R@#++B/ !56	I78 *-??BSTVBWW  !6!6!89J.;;- #40 99\q1L%6V[VaVab001BC**r   c           	     z   Ub-  UR                  SS5      nUR                  SS5      nUb  Ub  XE4$ UnUnU R                  n[        S5       V	s/ s H  n	/ PM     n
n	[        S5       V	s/ s H  n	/ PM     nn	U R                  5        GH  u  pn[	        U5      [	        U5      [	        U5      pn[
        R                  " SUS-
  XS9n[
        R                  " SUS-
  XS9nUR	                  5       nUR	                  5       nUS-   R                  US-
  S9nUS-   R                  US-
  S9nUU-
  nUU-
  nUU-  nUU-  nUSS2S4   USSS24   -   R                  5       USS2S4   USSS24   -   R                  5       USS2S4   USSS24   -   R                  5       USS2S4   USSS24   -   R                  5       /nSU-
  SS2S4   SU-
  SSS24   -  R                  5       SU-
  SS2S4   USSS24   -  R                  5       USS2S4   SU-
  SSS24   -  R                  5       USS2S4   USSS24   -  R                  5       /n[
        R                  " XS9R                  X-  U5      n[
        R                  " XS9R                  X-  U5      nUSS2SS2SS4   U-  USSSS2SS24   -   R                  SS	5      R                  5       R                  U5      n[        S5       H7  nU
U   R                  UU   U   5        UU   R                  UU   U   5        M9     GM     [
        R                  " U
 Vs/ s H  n[
        R                  " U5      PM     sn5      n[
        R                  " U Vs/ s H  n[
        R                  " U5      PM     sn5      nXE4$ s  sn	f s  sn	f s  snf s  snf )
uz  Get bilinear interpolation indices/weights, or pop `"bilinear_indices"`/`"bilinear_weights"` from `kwargs` if both precomputed.

Args:
    grid_thw: `(num_images_or_videos, 3)`
    num_grid_per_side: `int(num_position_embeddings ** 0.5)` from vision config.
    spatial_merge_size: merge block size from vision config.
    kwargs: optional caller kwargs — if it contains both `"bilinear_indices"` and `"bilinear_weights"` they are popped and returned.

Returns:
    `bilinear_indices`: `(4, total_thw)` long — bilinear corner indices into pos_embed table.
    `bilinear_weights`: `(4, total_thw)` float — interpolation weights.
Nbilinear_indicesbilinear_weightsr<   r   r   r   )maxr   )r   r   ranger%   r    r   linspaceclampr*   r&   viewr)   r-   r+   r,   r.   ) r   num_grid_per_sider/   r   rV   rW   sider3   r   _	idx_partsweight_partsr0   r1   r2   h_gridw_gridh_floorw_floorh_ceilw_ceilh_fracw_frach_floor_offseth_ceil_offsetcorner_indicescorner_weightsh_idxw_idxreorderips                                    r   'get_vision_bilinear_indices_and_weightsrs      s   $ !::&8$?!::&8$?',<,H#55D#J__F7<Qx*@x!2xI*@:?(-C(Qb(L-C??$aa&#a&#a&a4!8Q>4!8Q>**,**,A+$$$2A+$$$2'!'! 4 AtG$wtQw'77@@BAtG$vdAg6??A1d7#gdAg&66??A1d7#fT1Wo5>>@	
 &j!T'"a&j$'%::CCE&j!T'"VD!G_4==?AtG_F
D!G44==?AtG_vdAg.779	
 Q.33AOZPQ.33AOZPAtT)*Q.tT1a7G1HHSSTUWXYaacjjklmqAaLq 1' :;O"">!#4W#=> E %L {{)#D)QEIIaL)#DE{{,#G,QEIIaL,#GH--W +A-CP $E#Gs   N))N. N3? N8)N)r   torch.Tensorr   dict | Nonereturnrt   )r   rt   r/   zint | torch.Tensorr   ru   rv   rt   )r   rt   r/   r    rA   r    rB   r    r   ru   rv   !tuple[torch.Tensor, torch.Tensor])
r   rt   r]   r    r/   r    r   ru   rv   rw   )__doc__
__future__r   r   torch.nn.functionalnn
functionalr   r   r6   rT   rs    r   r   <module>r~      s    #   .& [_**0B*LW**L :+:+:+ :+ 	:+
 :+ ':+B 	F.F.F. F. 	F.
 'F.r   