
    3jT$                       S SK r S SKrS SKJr  S SKJr  S SKJrJr  S SK	r	S SK
Jr  S SKJs  Jr  S SK
Jr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSK J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-  SSK.J/r/J0r0J1r1J2r2  SSK3J4r4  SSK5J6r6J7r7J8r8J9r9  SSK:J;r;  SSK<J=r=J>r>  SSK?J@r@JArAJBrB  \" S5       " S S\R                  5      5       rD " S S\R                  5      rE " S S \R                  5      rF " S! S"\R                  5      rG " S# S$\R                  5      rH " S% S&\R                  5      rIS' rJS(\	R                  S)\	R                  S*\	R                  S+\	R                  S,\L\	R                  \	R                  4   4
S- jrMS.\	R                  S/\NS,\	R                  4S0 jrO SWS1\R                  S2\	R                  S3\	R                  S4\	R                  S5\	R                  S-  S6\PS7\PS8\-\/   4S9 jjrQ " S: S;\R                  5      rR " S< S=\!5      rS " S> S?\R                  5      rTS@ rUSXSA jrV " SB SC\R                  5      rW " SD SE\R                  5      rX " SF SG\!5      rY\0\ " SH SI\#5      5       5       rZ\0 " SJ SK\+5      5       r[ " SL SM\[5      r\\0 " SN SO\[5      5       r]\0 " SP SQ\[5      5       r^\0\ " SR SS\%5      5       5       r_ " ST SU\[\5      r`/ SVQrag)Y    N)Callable)	dataclass)AnyOptional)	LayerNorm   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)deprecate_kwarg)accepts_precomputed_kwargsis_flash_attention_requestedmaybe_autocastmerge_with_config_defaults)capture_outputs)get_vision_cu_seqlensget_vision_position_ids   )Glm4vConfigGlm4vTextConfigGlm4vVisionConfigRMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Glm4vRMSNorm8   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Glm4vRMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer.   	__class__s      b/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/glm4v/modeling_glm4v.pyr2   Glm4vRMSNorm.__init__:   s/     	ll5::k#:; #    hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor5   float32powmeanrsqrtr8   r7   )r9   r?   input_dtypevariances       r<   forwardGlm4vRMSNorm.forwardB   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r>   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler7   shaper8   )r9   s    r<   
extra_reprGlm4vRMSNorm.extra_reprI   s*    ))*+6$2G2G1HIIr>   )r8   r7   )gư>)__name__
__module____qualname____firstlineno__floatr2   r5   TensorrL   rQ   __static_attributes____classcell__r;   s   @r<   r,   r,   8   sB    $ $$ $ $;U\\ ;ell ;J Jr>   r,   c                   :   ^  \ rS rSrSS\4U 4S jjjrS rSrU =r$ )Glm4VisionMlpM   biasc                   > [         TU ]  5         UR                  U l        UR                  U l        [
        R                  " U R                  U R                  US9U l        [
        R                  " U R                  U R                  US9U l        [
        R                  " U R                  U R                  US9U l	        [        UR                     U l        g Nr_   )r1   r2   r:   out_hidden_sizeintermediate_sizer3   Linear	gate_projup_proj	down_projr
   
hidden_actact_fn)r9   configr_   r;   s      r<   r2   Glm4VisionMlp.__init__N   s    !--!'!7!74#3#3T5K5KRVWyy!1!143I3IPTU4#9#94;K;KRVWV../r>   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      $ N)rh   rj   rf   rg   r9   hidden_states     r<   rL   Glm4VisionMlp.forwardW   s2    ~~dkk$..*FG$,,WcJddeer>   )rj   rh   rf   r:   rd   rg   F)	rS   rT   rU   rV   boolr2   rL   rY   rZ   r[   s   @r<   r]   r]   M   s     0T 0 0f fr>   r]   c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Glm4vVisionPatchEmbed[   rk   r/   Nc                 N  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  U R                  /n[        R                  " U R                  U R                  X"S9U l	        g )N)kernel_sizestride)
r1   r2   
patch_sizetemporal_patch_sizein_channelsr:   	embed_dimr3   Conv3dproj)r9   rk   rx   r;   s      r<   r2   Glm4vVisionPatchEmbed.__init__\   s|     ++#)#=#= !--++//$//RIId..Kl	r>   r?   c                 0   U R                   R                  R                  nUR                  SU R                  U R
                  U R                  U R                  5      nU R                  UR                  US95      R                  SU R                  5      nU$ )NrB   rD   )	r   r7   rD   viewr|   r{   rz   rE   r}   )r9   r?   target_dtypes      r<   rL   Glm4vVisionPatchEmbed.forwardf   s~    yy''--%**  $":":DOOT__
 		-"2"2"2"FGLLRQUQ_Q_`r>   )r}   r|   rz   r   r{   rS   rT   rU   rV   r)   r2   r5   rX   rL   rY   rZ   r[   s   @r<   ru   ru   [   s:    m0 mT mU\\ ell  r>   ru   c                      ^  \ rS rSr% \R
                  \S'   SS\S\SS4U 4S jjjr	S\R
                  S\R
                  4S	 jr
S
rU =r$ )Glm4vVisionRotaryEmbeddingo   inv_freqdimthetar/   Nc           	         > [         TU ]  5         Xl        X l        SU[        R
                  " SUS[        R                  S9U-  -  -  nU R                  SUSS9  g )N      ?r   rA   r   r   F
persistent)r1   r2   r   r   r5   arangerW   register_buffer)r9   r   r   r   r;   s       r<   r2   #Glm4vVisionRotaryEmbedding.__init__r   sU    
%ELLC%++$NQT$TUVZeDr>   position_idsc                 \    UR                  S5      U R                  -  R                  S5      $ )NrB   r&   )	unsqueezer   flatten)r9   r   s     r<   rL   "Glm4vVisionRotaryEmbedding.forwardy   s'    &&r*T]]:CCAFFr>   )r   r   )g     @)rS   rT   rU   rV   r5   rX   __annotations__intrW   r2   rL   rY   rZ   r[   s   @r<   r   r   o   sU    llEC E ED E EGELL GU\\ G Gr>   r   c                   ~   ^  \ rS rSrSS\S\S\S\SS4
U 4S jjjrS	\R                  S\R                  4S
 jr
SrU =r$ )Glm4vVisionPatchMerger}   r   context_dimri   r_   r/   Nc                 b  > [         TU ]  5         [        R                  " XUS9U l        [        U5      U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " X!US9U l	        [        R                  " 5       U l        [        U   U l        g ra   )r1   r2   r3   re   r   r   post_projection_normrf   rg   rh   GELUact1r
   rj   )r9   r   r   ri   r_   r;   s        r<   r2   Glm4vVisionPatchMerger.__init__~   s{    IIcT2	$-cN!3$?yy=;$?GGI	Z(r>   rp   c                     U R                  U5      nU R                  U R                  U5      5      nU R                  U R	                  U R                  U5      5      U R                  U5      -  5      $ rn   )r   r   r   rh   rj   rf   rg   ro   s     r<   rL   Glm4vVisionPatchMerger.forward   sY    yy.yy!:!:<!HI~~dkk$..*FG$,,WcJddeer>   )r   rj   rh   rf   r   r   rg   rr   )rS   rT   rU   rV   r   strrs   r2   r5   rX   rL   rY   rZ   r[   s   @r<   r   r   }   sU    )C )c )s )$ )[_ ) )fELL fU\\ f fr>   r   c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )Glm4vVisionEmbeddings   rk   c                 f  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        SU l        g )NrA   bicubic)r1   r2   rk   r:   r}   
image_sizerz   num_patchesnum_positionsr3   	Embeddingposition_embeddinginterpolated_methodr9   rk   r;   s     r<   r2   Glm4vVisionEmbeddings.__init__   s    ++ ++ ++ OOt>1D!--"$,,t/A/A4>>"R#, r>   r/   c                    U R                   R                  nUR                  S   nUR                  n[	        U[
        5      (       a#  [        R                  " X([        R                  S9nUR                  S   n	[        U	S-  5      n
UR                  XU5      R                  SSS5      R                  S5      R                  U[        R                  S9nUR                  S   n[        R                  " XR                  S9nUR                  S5      UR!                  S5      R                  S5      :  R#                  S5      nX>S4   R                  [        R                  S9nX>S4   R                  [        R                  S9nUS-   U-  S-  S-
  nUS-   U-  S-  S-
  n[        R$                  " UU4SS	9R                  S5      R                  S5      n[&        R(                  " UUU R*                  S
SS9nUR-                  S5      R-                  S5      R                  SS5      nUR                  UR.                  5      R                  UR                  5      nUU-   nU$ )aQ  
Forward pass with integrated position encoding adaptation using 2D interpolation.

Args:
    embeddings: Input embeddings tensor
    lengths (torch.Tensor): Sequence lengths for each image in the batch.
    image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
    h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
    w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.

Returns:
    torch.Tensor: Embeddings with adapted position encoding added.
r&   devicerD   r   g      ?rA   r   r   rB   r   Fborder)modealign_cornerspadding_mode)r   r7   rP   r   
isinstancelistr5   tensorlongr   r   permuter   rE   rF   r   cumsumsumstackFgrid_sampler   squeezerD   )r9   
embeddingslengthsimage_shapesh_coordsw_coordspos_embed_weightr:   r   orig_size_sq	orig_sizepos_embed_2d
num_tokenstoken_positionsseq_idstarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32adapted_pos_embeds                          r<   rL   Glm4vVisionEmbeddings.forward   s6     2299&,,Q/!(( gt$$ll7LG (--a0c)*	!!)DWQ1Yq\RvU]]R3	 	  %%a(
,,z:K:KL",,Q/7>>!3D3N3Nq3QQVVWXY
+..U]].C
+..U]].C c>X-2Q6c>X-2Q6 {{FF+4>>qAKKAN #$--$T%=%=Uai#

 "9!@!@!C!K!KB!O!W!WXY[\!]2556F6L6LMPPQ[QbQbc  "33
r>   )rk   r}   r   r   r   r   rz   r   r   r[   s   @r<   r   r      s(    
-0 
-:PUP\P\ : :r>   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )*Rotates half the hidden dims of the input..NrB   rA   r   )rP   r5   catxx1x2s      r<   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r>   qkcossinr/   c                    U R                   nUR                   nU R                  5       UR                  5       pUR                  S5      R                  5       UR                  S5      R                  5       p2X-  [        U 5      U-  -   nX-  [        U5      U-  -   nUR	                  U5      nUR	                  U5      nXg4$ )N)rD   rW   r   r   rE   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embeds           r<   apply_rotary_pos_emb_visionr      s     77L77L779aggiq}}R &&(#--*;*A*A*Cw;q>C/0Gw;q>C/0Gjj&Gjj&Gr>   r?   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r&   N)rP   expandreshape)r?   r   batchnum_key_value_headsslenhead_dims         r<   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr>   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )NrA   r   rB   )r   rD   )ptrainingr&   )r   num_key_value_groupsr5   matmul	transposer3   
functionalsoftmaxrF   rE   rD   r   r  
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r<   eager_attention_forwardr     s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r>   c            
          ^  \ rS rSrS\SS4U 4S jjr\" SSS9 SS	\R                  S
\R                  S\	\R                  \R                  4   S-  S\R                  4S jj5       r
SrU =r$ )Glm4vVisionAttentioni  rk   r/   Nc                   > [         TU ]  5         UR                  U l        UR                  U l        U R                  U R                  -  U l        SU l        [        R                  " UR                  UR                  S-  UR                  S9U l
        [        R                  " UR                  UR                  SS9U l        U R
                  S-  U l        Xl        UR                  U l        SU l        g )Nr&   r   rb   F      )r1   r2   r:   r   	num_headsr   r  r3   re   attention_biasqkvr   r   rk   attention_dropout	is_causalr   s     r<   r2   Glm4vVisionAttention.__init__  s    %%))DNN2$%!99V//1C1Ca1GfNcNcdIIf00&2D2D5Q	}}d*!'!9!9r>   rotary_pos_embv5.10versionr?   
cu_seqlensposition_embeddingsc                    UR                   S   nU R                  U5      R                  USU R                  S5      R	                  SSSS5      R                  S5      u  pgnUu  p[        XgX5      u  pgUR                  SS5      R                  S5      nUR                  SS5      R                  S5      nUR                  SS5      R                  S5      n[        R                  " U R                  R                  [        5      n[        U R                  5      (       aX  USS  US S -
  R                  5       nU" U UUU4S U R                   U R"                  (       d  SOU R$                  UUUUSS.UD6u  pOUSS  US S -
  nXgU4 Vs/ s H'  n[&        R(                  " UUR+                  5       SS	9PM)     nn[-        U6  VVVs/ s HB  u  nnnU" U UUU4S U R                   U R"                  (       d  SOU R$                  SS
.UD6S   PMD     nnnn[&        R.                  " USS	9nUR                  US5      R1                  5       nU R3                  U5      nU$ s  snf s  snnnf )Nr   r   rB   r&   rA           F)r   r   r   cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr  r   )r   r   r   r  )rP   r  r   r  r   unbindr   r  r   r   get_interfacerk   _attn_implementationr  r    maxr   r  r  r5   splittolistzipr   r	  r   )r9   r?   r  r  r   
seq_lengthquery_statesr
  r  r   r   attention_interface
max_seqlenr  _r   r   splitsr   r   vattn_outputss                         r<   rL   Glm4vVisionAttention.forward   s    #((+
HH]#++J4>>2NVVWXZ[]^`abiijkl 	/, '#>|Y\#b #--a3==a@))!Q/99!<
#--a3==a@(?(M(MKK,,.E)
 (44$QR.:cr?:??AJ0	
  $#'==d6L6L(('' NK" !nz#26GLXfrKsKsFGNN$4!<Ks     #F|  ,GAq! $	

 $( LL'+}}C$:P:P#
 
 
  ,    ))La8K!))*b9DDFii,-s   .IA	I)
r  rk   r   r   r  r  r  r   r  r   rn   )rS   rT   rU   rV   r)   r2   r   r5   rX   rO   rL   rY   rZ   r[   s   @r<   r  r    s    0 T  %w7
 IM	A||A LLA #5<<#=>E	A 
A 8Ar>   r  c                      ^  \ rS rSrSU 4S jjr\" SSS9\ SS\R                  S	\R                  S
\	\R                  \R                  4   S-  S\R                  4S jj5       5       r
SrU =r$ )Glm4vVisionBlockie  r/   Nc                    > [         TU ]  5         [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        USS9U l
        g )Nr.   Frb   )r1   r2   r,   r:   rms_norm_epsnorm1norm2r  attnr]   mlpr   s     r<   r2   Glm4vVisionBlock.__init__f  s\    !&"4"4&:M:MN
!&"4"4&:M:MN
(0	 e4r>   r  r  r  r?   r  r  c                     XR                   " U R                  U5      4UUS.UD6-   nXR                  U R                  U5      5      -   nU$ )z
cu_seqlens (`torch.Tensor`):
    Cumulative sequence lengths used for packed variable-length attention in Flash Attention kernels.
r  r  )r<  r:  r=  r;  )r9   r?   r  r  r   s        r<   rL   Glm4vVisionBlock.forwardm  sZ     &		JJ}%)
! 3)
 	)
 
 &M1J(KKr>   )r<  r=  r:  r;  r/   Nrn   )rS   rT   rU   rV   r2   r   r   r5   rX   rO   rL   rY   rZ   r[   s   @r<   r6  r6  e  s|    5 %w7
 IM	|| LL #5<<#=>E	 
  8r>   r6  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rS rSrU =r$ )Glm4vTextRotaryEmbeddingi  r   Nrk   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  UR                  R                  S/ SQ5      U l        g )	N	rope_typedefaultr   Fr   original_inv_freqmrope_section)      rK  )r1   r2   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrk   rope_parametersrF  compute_default_rope_parametersr   attention_scalingr   clonegetrI  )r9   rk   r   rope_init_fnr   r;   s        r<   r2   !Glm4vTextRotaryEmbedding.__init__  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuU#3377Ur>   r   ztorch.deviceseq_lenr/   ztorch.Tensorc           	      j   U R                   S   nU R                   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        XT-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  nX4$ )
aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetapartial_rotary_factorr   r   Nr   rA   r   r   )rO  rS  getattrr:   num_attention_headsr   r5   r   int64rE   rW   )	rk   r   rV  baserY  r   r   attention_factorr   s	            r<   rP  8Glm4vTextRotaryEmbedding.compute_default_rope_parameters  s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(23 U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r>   c                 Z   U R                   S S S S 2S 4   R                  5       R                  SUR                  S   SS5      nUS S 2S S 2S S S 24   R                  5       n[	        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      nU R                  X`R                  5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR!                  5       U R                  -  n	S S S 5        WR#                  UR$                  S
9W	R#                  UR$                  S
94$ ! , (       d  f       N@= f)Nr   r&   rB   mpscpuF)device_typeenabledrA   r   r   )r   rW   r   rP   r   r   typer   r!   r  apply_mroperI  r5   r   r   rQ  r   rE   rD   )
r9   r   r   inv_freq_expandedposition_ids_expandedrc  freqsembr   r   s
             r<   rL    Glm4vTextRotaryEmbedding.forward  sZ   
 !MM$a*=>DDFMMaQ]QcQcdeQfhjlmn ,Q4] ; A A C'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E$$U,>,>?E))UN3C'')d444C'')d444C D vvAGGv$cff177f&;;; DCs   BF
F*c           	          UnUR                  USS9n[        R                  " [        U5       VVs/ s H  u  pVXeS-     PM     snnSS9nU$ s  snnf )NrB   r   r   )r)  r5   r   	enumerate)r9   ri  rI  sectionchunksichunkresults           r<   rf  $Glm4vTextRotaryEmbedding.apply_mrope  sS    W"-69JK9JXQEa%L9JKQST Ls   A
)rQ  rk   rM  rI  rN  rF  rn   NNN)rS   rT   rU   rV   r5   rX   r   r(   r2   staticmethodr   r   rO   rW   rP  no_gradr   rL   rf  rY   rZ   r[   s   @r<   rD  rD    s    llV V V" )-+/"*$&*(* t* 
~u$	%	* *> ]]_<  <  r>   rD  c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	r   .r   NrA   r&   rB   r   r   )r5   r   r   r   s      r<   rotate_half_llmrx    sJ    	
319B	
319B;;Ryb)11"55r>   c                    UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nUR                  S   nU SSU24   U SUS24   pvUSSU24   USUS24   pXb-  [        U5      U-  -   n
X-  [        U5      U-  -   n[        R
                  " X/SS9n
[        R
                  " X/SS9nX4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.NrB   rA   r   )r   rP   repeat_interleaverx  r5   r   )r   r   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passr   r   s               r<   apply_rotary_pos_embr    s6   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {u5;<G{u5;<G ii)r2Gii)r2Gr>   c                   :  ^  \ rS rSrSrSS\S\S-  4U 4S jjjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrU =r$ )Glm4vTextAttentioni   zz
Multi-headed attention from 'Attention Is All You Need' paper.
and "Generating Long Sequences with Sparse Transformers".
Nrk   	layer_idxc                 r  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l	        SU l
        UR                  U l        UR                  U l        U R                  S-  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )NTr  rb   F)r1   r2   rk   r  r:   r[  r  r   r   r  r  r  rO  r   r3   re   q_projk_projv_projo_projr9   rk   r  r;   s      r<   r2   Glm4vTextAttention.__init__  sE   "!--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9%55}}d*ii 0 0$..4==2PW[\ii 0 0$2J2JT]]2Zaefii 0 0$2J2JT]]2Zaefii >@P@PW\]r>   r?   r  r   past_key_valuesr   r/   c                 >   UR                  5       u  pgnU R                  U5      n	U R                  U5      n
U R                  U5      nU	R	                  XgSU R
                  5      R                  SS5      n	U
R	                  XgSU R
                  5      R                  SS5      n
UR	                  XgSU R
                  5      R                  SS5      nUu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U U	U
UU4U R                  (       d  SOU R                   U R"                  S.UD6u  nnUR%                  XgS5      R'                  5       nU R)                  U5      nUU4$ )NrB   r&   rA   r   )r   r   )sizer  r  r  r   r   r  r  updater  r   r&  rk   r'  r  r  r  r   r   r	  r  )r9   r?   r  r   r  r   bszq_lenr0  r-  r
  r  r   r   r.  r  r  s                    r<   rL   Glm4vTextAttention.forward  s    &**,A{{=1[[/
{{=1#((RGQQRSUVW__ST]]CMMaQRS
#((RGQQRSUVW&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "))#b9DDFkk+.L((r>   )r  rk   r   r:   r  r  r  r  r  r   r  r  rO  r   r  rn   rt  )rS   rT   rU   rV   __doc__r(   r   r2   r5   rX   rO   r   r   r   rL   rY   rZ   r[   s   @r<   r  r     s    
^ ^3: ^ ^. IM.2(,))||)) #5<<#=>E)) t+	))
 )) -.)) 
u||U\\D0%2E2LL	M)) ))r>   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Glm4vTextMLPiF  c                    > [         TU ]  5         Xl        [        R                  " UR
                  SUR                  -  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        UR                     U l        g )NrA   Frb   )r1   r2   rk   r3   re   r:   rd   gate_up_projrh   r
   ri   activation_fnr   s     r<   r2   Glm4vTextMLP.__init__G  sn    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r>   r?   r/   c                     U R                  U5      nUR                  SSS9u  p2X R                  U5      -  nU R                  U5      $ )NrA   rB   r   )r  rq  r  rh   )r9   r?   	up_statesgates       r<   rL   Glm4vTextMLP.forwardO  sH    %%m4	#//!/4 2 24 88	~~i((r>   )r  rk   rh   r  )
rS   rT   rU   rV   r2   r5   FloatTensorrL   rY   rZ   r[   s   @r<   r  r  F  s,    7)U%6%6 )5;L;L ) )r>   r  c                   T  ^  \ rS rSrS\S\4U 4S jjr\     SS\R                  S\
\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\
\R                  \
\R                  \R                  4   S-  4   4S jj5       rSrU =r$ )Glm4vTextDecoderLayeriX  rk   r  c                   > [         TU ]  5         UR                  U l        [        X5      U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr8  )r1   r2   r:   r  	self_attnr  r=  r,   r9  input_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernormr  s      r<   r2   Glm4vTextDecoderLayer.__init__Y  s    !--+F>'+F,>,>FDWDWX(4V5G5GVM`M`(a%(4V5G5GVM`M`(a%".v/A/AvGZGZ"[r>   Nr?   r  r   r   r  	use_cacher/   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU$ )N)r?   r  r   r   r  r   )r  r  r  r  r=  r  )
r9   r?   r  r   r   r  r  r   residualr0  s
             r<   rL   Glm4vTextDecoderLayer.forwardc  s     !,,];  >> 
' 3)%+
 
 55mD 0 !55mD///> 0r>   )r:   r  r=  r  r  r  r  )NNNNF)rS   rT   rU   rV   r(   r   r2   r   r5   rX   rO   
LongTensorr   rs   r  rL   rY   rZ   r[   s   @r<   r  r  X  s    \ \3 \  IM.204(,!&#||# #5<<#=>E# t+	#
 &&-# # $;# 
u  %(9(95;L;L(L"MPT"TT	U# #r>   r  c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Glm4vModelOutputWithPasti  
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
    The rope index difference between sequence length and multimodal rope.
    The attribute is deprecated and will be removed in v5.20, use `model.base_model.rope_deltas` instead.
Nrope_deltasr  
rS   rT   rU   rV   r  r  r5   r  r   rY   r  r>   r<   r  r         ,0K!!D(/r>   r  c                   ^   ^  \ rS rSr% \\S'   SrSrSrSS/r	S/r
SrSrSrSrU 4S	 jrS
rU =r$ )Glm4vPreTrainedModeli  rk   model)imagevideotextTr  r6  r  c           	      *  > [         TU ]  U5        [        U[        5      (       an  SUR                  [
        R                  " SUR                  S[
        R                  S9UR                  -  -  -  n[        R                  " UR                  U5        g g )Nr   r   rA   r   )r1   _init_weightsr   r   r   r5   r   r   rW   initcopy_r   )r9   r   r   r;   s      r<   r  "Glm4vPreTrainedModel._init_weights  sn    f%f899fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 :r>   r  )rS   rT   rU   rV   r'   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr  rY   rZ   r[   s   @r<   r  r    sQ    1&*#02DE#4"5N!"&2 2r>   r  c                      ^  \ rS rSr% \\S'   SrS/r\\	S.r
SU 4S jjrS r\\\S	\R"                  S
\R"                  S\\   S\\-  4S j5       5       5       rSrU =r$ )Glm4vVisionModeli  rk   )r  r  r6  r?   
attentionsr/   c                 8  > [         TU ]  U5        UR                  U l        UR                  U l        [	        U5      U l        [        U5      U l        UR                  UR                  -  n[        US-  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        [%        UR&                  UR(                  UR*                  S9U l        [/        UR                  UR0                  S9U l        [        R4                  " UR                  UR&                  UR                  UR                  S9U l        [/        UR                  UR0                  S9U l        SU l        U R=                  5         g s  snf )NrA   )r   r   ri   r8  )r|   out_channelsrx   ry   F)r1   r2   spatial_merge_sizerz   r   r   ru   patch_embedr:   r  r   r  r3   
ModuleListrangedepthr6  blocksr   rc   rd   ri   mergerr,   r9  post_conv_layernormConv2d
downsamplepost_layernormgradient_checkpointing	post_init)r9   rk   r   r0  r;   s       r<   r2   Glm4vVisionModel.__init__  sG    "(";"; ++/708%%)9)998QGmmuV\\GZ$[GZ!%5f%=GZ$[\,&&F4L4LY_YjYj
 $00B0BH[H[#\ ))**//11,,	
 +6+=+=6CVCVW&+# %\s   &Fc                     [         R                  " SU R                  R                   S3[        SS9  [        XR                  5      nU R                  U5      nX24$ )N`z.rot_pos_emb` is deprecated and will be removed in v5.11. Use `get_vision_position_ids` from `transformers.vision_utils` and apply the rotary embedding module.rA   )
stacklevel)warningswarnr;   rS   FutureWarningr%   r  r  )r9   grid_thwr   r  s       r<   rot_pos_embGlm4vVisionModel.rot_pos_emb  s]    ''(  )H  I	

 /x9P9PQ,,\:++r>   r?   r  r   c           	      `   [        X R                  US9n[        X#S9nU R                  U5      nU R	                  U5      nU R                  U5      n[        R                  " Xf4SS9nUR                  5       UR                  5       4nUSS USS -
  n	U R                  UU	UUSS2S4   R                  UR                  5      USS2S4   R                  UR                  5      5      nU R                   H  n
U
" U4UUS.UD6nM     U R                  U5      nUR                  SU R                  U R                  UR                   S   5      nUR#                  SSSS	5      nU R%                  U5      R                  SU R&                  R(                  5      nU R+                  U5      n[-        UUS
9$ )a$  
hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
    The final hidden states of the model.
grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
    The temporal, height and width of feature shape of each image in LLM.

Returns:
    `torch.Tensor`: hidden_states.
)r   rB   r   r&   Nr   r@  r   rA   )last_hidden_statepooler_output)r%   r  r$   r  r  r  r5   r   r   r   r   rE   r   r  r  r   rP   r   r  rk   rc   r  r   )r9   r?   r  r   r   r  
rotary_embrj  r  seqlensblkmerged_hidden_statess               r<   rL   Glm4vVisionModel.forward  s    /x9P9PY_`*8C
((700?((6
ii0b9"wwy#'')4QR.:cr?2A!!-"6"67A!!-"6"67
 ;;C%$7 	M  ++M:%**'')@)@-BUBUVXBY
 &--aAq96;;B@[@[\#{{=9)+.
 	
r>   )r  r  r   r  r  r  rz   r  r  r  r  rB  )rS   rT   rU   rV   r)   r   r  r  r6  r  _can_record_outputsr2   r  r"   r#   r   r5   rX   r   r   rO   r   rL   rY   rZ   r[   s   @r<   r  r    s    )+,)*
8,  3
"\\3
5:\\3
MSTfMg3
	+	+3
    3
r>   r  c                   "  ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
\\\      SS\R                  S-  S\R                   S-  S	\R                  S-  S
\S-  S\R$                  S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )Glm4vTextModeli  rk   )r  r  c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr8  rk   F)r1   r2   pad_token_idpadding_idx
vocab_sizer3   r   r:   embed_tokensr  r  num_hidden_layersr  layersr,   r9  normrD  r  r  r  r  s      r<   r2   Glm4vTextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammGLVMeMeGfgGf)"65Gfg
 !!3!39L9LM	2&A&+# hs   C?N	input_idsr   r   r  inputs_embedsr  r   r/   c           	      l   US L US L-  (       a  [        S5      eU(       a9  Uc6  [        R                  R                  5       (       d  [	        U R
                  S9nUc  U R                  U5      nUcv  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  SSS5      R                  SUR                  S   S5      nO3UR                  S:X  a#  US	   R                  SUR                  S   S5      nUR                  S:X  a  UR                  S   S
:X  a  US   n	USS  nOS n	U R
                  UUUU	S.n
[        S0 U
D6nUnU R                  XS9nU R                    H  nU" U4UU	UUS.UD6nUnM     U R#                  U5      n[%        UUS9$ )N:You must specify exactly one of input_ids or inputs_embedsr  r   r&   r   rB   r   rA   N.   )rk   r  r   r  r   )r   )r   r   r  r  )r  r  r  )
ValueErrorr5   jit
is_tracingr   rk   r  get_seq_lengthr   rP   r   r   r   ndimr   r  r  r  r   )r9   r  r   r   r  r  r  r   past_seen_tokenstext_position_idsmask_kwargscausal_maskr?   r  decoder_layerlayer_outputss                   r<   rL   Glm4vTextModel.forward,  s    -t";<YZZ 09M9M9O9O*$++>O  --i8M CRC^==?de <<(;(;A(>}G[G[\_ooL',,Q26==aATATUVAWY[\L!#'	299!\=O=OPQ=RTVWL !l&8&8&;q&@ ,Q'+L !% kk*,.-
 )7;7%"oomoW![[M)*. /$7 M *M ) 		-0&++
 	
r>   )r  r  r  r  r  r  r  )NNNNNN)rS   rT   rU   rV   r(   r   r  r  r  r  r2   r   r"   r#   r5   r  rX   r   r  rs   r   r   rO   r   rL   rY   rZ   r[   s   @r<   r  r    s     .(
    .2.204(,26!%J
##d*J
 t+J
 &&-	J

 J
 ((4/J
 $;J
 -.J
 
(	(J
    J
r>   r  c                   |  ^  \ rS rSrSrSrSS/rU 4S jr    S*S\S	\	\\\4   \
R                  -  S
\S\S\S\\
R                  -  S-  4S jjr   S+S\
R                  S\
R                   S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\
R                  \
R                  4   4S jjr\" SS9\\ S,S\
R,                  S\
R                  S-  S\\   S\\-  4S jj5       5       5       r\" SS9\\ S,S\
R,                  S\
R                  S-  S\\   S\\-  4S jj5       5       5       r  S-S\
R                  S\
R,                  S\
R,                  S-  S \
R,                  S-  4S! jjr     S.S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S"\
R                  S-  S\
R                   S-  S\
R                  S-  4S# jjr\" S$S%S&9\\          S/S\
R                  S-  S\
R                  S-  S'\
R                  S-  S"\S-  S\
R,                  S-  S\
R                  S-  S\
R,                  S-  S\
R                  S-  S\
R                  S-  S\
R                   S-  S\\   S\\ -  4S( jj5       5       5       r!S)r"U =r#$ )0
Glm4vModeli|  r  Fr  r6  c                    > [         TU ]  U5        [        R                  UR                  5      U l        [        R                  UR                  5      U l        S U l	        U R                  5         g rn   )r1   r2   r  _from_configvision_configvisualr  text_configlanguage_modelr  r  r   s     r<   r2   Glm4vModel.__init__  sU     &33F4H4HI,99&:L:LM 	r>   Nstart_positionr  temp_merge_sizer  time_intervalr   c                    US   R                  5       U-  US   R                  5       U-  US   R                  5       U-  pn[        R                  " XvS9U-  n
[        R                  " XS9U-   n[        R                  " XS9U-   nUR                  X-  5      nUR	                  U	5      R                  U5      nU
R	                  X-  5      U-   n
[        R
                  " XU/SS9nU$ )a  
Compute 3D positional indices for vision tokens derived from a single image or video input.

The positions are generated from the input grid defined by temporal (T), height (H), and
width (W) dimensions. Temporal and spatial dimensions can be downscaled according to the
merge sizes used in the vision backbone. The resulting positions are offset by `start_position`.

Args:
    start_position (`int`):
        Offset added to all computed positional indices.
    grid_thw (`Sequence[int]` or `torch.Tensor` of shape `(3,)`):
        The (T, H, W) grid representing the feature layout of the current image or video after patch embedding.
    temp_merge_size (`int`, *optional*):
        Factor by which the temporal dimension is reduced in the backbone. The temporal grid size is divided
        by this value. Defaults to 1.
    spatial_merge_size (`int`, *optional*):
        Factor by which the spatial dimensions (H and W) are reduced in the backbone. Both H and W are divided
        by this value. Defaults to 1.
    time_interval (`int`, *optional*):
        Spacing factor applied between consecutive temporal position indices.Defaults to 1.
    device (`str` or `torch.device`, *optional*):
        Device on which the resulting tensor is allocated. If `None`, uses the current default device.

Returns:
    torch.LongTensor of shape (3, sequence_length):
        Positional indices for temporal, height, and width dimensions,
        flattened into sequence form and offset by `start_position`.
r   r&   rA   r   r   )itemr5   r   repeatrz  r   )r9   r  r  r  r  r  r   
llm_grid_t
llm_grid_h
llm_grid_wposition_temporalposition_widthposition_heightvision_position_idss                 r<   r%   "Glm4vModel.get_vision_position_ids  s    L QK/1QK"44QK"44 !+
 "LLCmSj@>Q,,zANR (..z/FG);;JGNNzZ-??
@WX[ii#kk+<~*^def""r>   r  mm_token_type_idsimage_grid_thwvideo_grid_thwr   r/   c           
         Ub%  [         R                  " XDSS2S4   SS9nSUSS2S4'   U R                  R                  R                  n/ n[         R
                  " SUR                  S   UR                  S   UR                  UR                  S9n	Ub  [        U5      OSUb  [        U5      OSS.n
[        U5       GH  u  pX+   nUb*  XU   R                  5          nXU   R                  5          n/ n[        R                  " [        UR                  5       5      S 5       H8  u  nn[        U5      nUS   S   nUS	   S   S-   nUR!                  UUU45        M:     Sn/ nU H  u  nnnUS:X  a]  UU-
  nUR!                  [         R"                  " UUR                  S
9R%                  SS	5      R'                  SS	5      U-   5        UU-  nMj  [)        U
U   5      nU R+                  UUSXqR                  S
9nUR!                  U5        U[-        US   US   5      U-  -  nM     [         R.                  " USS9R1                  SS	5      nUb4  UR3                  U	R                  5      U	SS2XU   R                  5       4'   O"UR3                  U	R                  5      U	SS2U4'   UR!                  UR-                  5       S-   [5        U5      -
  5        GM     [         R6                  " XR                  S
9R9                  S5      nX4$ )a9  
Difference from Qwen2VL/Qwen2.5VL's get_rope_index:
- GLM4V uses timestamps to separate each video frame, so the video_grid_thw should also be split too.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
        it.
    mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`):
        Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
    image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
        The temporal, height and width of feature shape of each image in LLM.
    video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
        The temporal, height and width of feature shape of each video in LLM.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

Returns:
    position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
    mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
Nr   r   r&   r   rD   r   )r&   rA   c                     U S   $ )Nr&   r  )r   s    r<   <lambda>+Glm4vModel.get_rope_index.<locals>.<lambda>  s    `abc`dr>   rB   r   rA   )r5   rz  rk   r  r  zerosrP   rD   r   iterrm  rs   	itertoolsgroupbyr*  r   appendr   r   r   nextr%   r(  r   r   rE   lenr   r   )r9   r  r#  r$  r%  r   r   r  mrope_position_deltasr   
grid_iters	batch_idxcurrent_input_idsinput_token_typeinput_type_groupr   groupstart_index	end_indexcurrent_posllm_pos_ids_listmodality_type	start_idxend_idxtext_lenr  r!  llm_positionss                               r<   get_rope_indexGlm4vModel.get_rope_index  s)   F %"44^TUWXTXEY_`aN#$N1a4 ![[66II "{{OOAOOA//##
 (6'AtN#t'5'AtN#t


 -6i,@(I0;)$5Y6O6T6T6V$W!#394M4R4R4T#U !'//	:J:Q:Q:S0TVde
UU#Ahqk!"IaL1,	 ''k9(EF	 f K!5E1y' A%&2H$++Xi6F6FGLLQPRSZZ[\^`adoo  8+K  $J}$=>H*.*F*F#Xq2DM]M] +G +' %++,?@3x{HQK#@DV#VVK 6F  "II&6A>FFq"MM)O\O_O_`l`s`sOtQ	)+D+I+I+KKL-:-=-=l>Q>Q-RQ	\*!(():):)<q)@3GXCY)YZI -AJ !&-BK[K[ \ f fgh i22r>   r  )modalitypixel_values_videosr   c                    UR                  U R                  R                  5      nUSS2S4   nUSS2SS24   n[        R                  " XTSS9nUR                  UR                  S   S5      n[        R                  " Xv/SS9nU R                  " U4USS.UD6n	UR                  S5      U R                  R                  S-  -  R                  5       n
[        R                  " U	R                  U
5      nXl        U	$ )	3  
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input videos.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.
Nr   r&   r   T)r  return_dictrB   rA   )re  r  rD   r5   rz  new_onesrP   r   prodr  r*  r)  r  )r9   rE  r%  r   thwflattened_hwprefix_onesflattened_video_grid_thwvision_outputssplit_sizesvideo_embedss               r<   get_video_featuresGlm4vModel.get_video_features#  s     266t{{7H7HI1a4 AqrE"..r!<$--l.@.@.CQG#(99k-Ha#P 
*BPT
X^
 &**2.$++2P2PRS2SS[[]{{>#?#?M'3$r>   r  pixel_valuesc                 :   UR                  U R                  R                  5      nU R                  " U4SU0UD6nUR                  S5      U R                  R                  S-  -  R                  5       n[        R                  " UR                  U5      nXdl        U$ ),  
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
r  rB   rA   )	re  r  rD   rJ  r  r*  r5   r)  r  )r9   rU  r$  r   rP  rQ  image_embedss          r<   get_image_featuresGlm4vModel.get_image_featuresC  s     $(():):;\UNUfU%**2.$++2P2PRS2SS[[]{{>#?#?M'3$r>   r  image_featuresvideo_featuresc           	         Uc  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nX R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nO0XR                  R                  :H  nXR                  R                  :H  nUR                  5       nUR                  S5      R                  UR                  5      nUb?  [        XrR                  S   -  UR                  5       :H  SU SUR                  S    35        UR                  5       nUR                  S5      R                  UR                  5      nUb?  [        XR                  S   -  UR                  5       :H  SU SUR                  S    35        XV4$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r'  rB   z6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: )get_input_embeddingsr5   r   rk   image_token_idr   r   allvideo_token_idr   r   rE   r   rP   numel)	r9   r  r  r[  r\  special_image_maskspecial_video_maskn_image_tokensn_video_tokenss	            r<   get_placeholder_maskGlm4vModel.get_placeholder_maskZ  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!; "+kk.H.H!H!*kk.H.H!H+//1/99"=@@AUAUV%"!4!4R!88N<P<P<RRHHXXdeseyeyz{e|d}~
 ,//1/99"=@@AUAUV%"!4!4R!88N<P<P<RRHHXXdeseyeyz{e|d}~ "55r>   r  c                    Uc  SOUR                  5       nUS L=(       d    US Ln	U	(       a  Uc  Ub  [        S5      eUS L=(       a    US L=(       a    U	n
U
(       a0  U R                  b  US:X  a  U R                  UUUUUS9u  pXl        U$ U R                  Gb8  US:  d  UGc.  UR                  u  pnUbu  UR                  5       R                  S5      S-
  nUR                  US:H  S5      nUR                  SUS5      R                  SSS5      R                  UR                  5      nOV[        R                  " XU-   5      nUR                  SSS5      R                  SUS5      R                  UR                  5      nU R                  R                  XR                  R                  S   -  SS9nUUR                  UR                  S9-   nU$ S nU$ )	Nr   a  Multimodal data was passed (via `image_grid_thw` or `video_grid_thw`) but `mm_token_type_ids` is missing. Please pass `mm_token_type_ids` to the model so that multimodal RoPE (M-RoPE) can be computed correctly. `mm_token_type_ids` is returned by the processor alongside `input_ids`.)r$  r%  r   r#  rB   r&   r   r   r   )r  r   r  rB  rP   r   r   masked_fillr   r  rE   r   r5   r   r   rz  )r9   r  r  r$  r%  r   r  r#  past_key_values_lengthhas_multimodalcan_compute_mroper   r  
batch_sizer,  r0  deltas                    r<   compute_3d_position_ids"Glm4vModel.compute_3d_position_ids  s    '6&=?CaCaCc't3Q~T7Q/7I<Qn 
 &T1f6Gt6SfXf$"2"2":>TXY>Y(,(;(;---"3 )< )%L  +&  )/E/IYM^(5(;(;%JA)-224;;B?!C+77!8KQO+00JCJJ1aQRSVVWdWkWkl$||,B]gDgh+00Ar:AA!ZQSTWWXeXlXlm$$66zEUEUE[E[\]E^7^de6fE'%((-:N:N(*OOL   Lr>   r  r  r  r   c           
         USL USL-  (       a  [        S5      eUc  U R                  5       " U5      nUb{  U R                  " Xh4SS0UD6R                  n[        R
                  " USS9R                  UR                  UR                  5      nU R                  XUS9u  pUR                  X5      nUb}  U R                  " Xy4SS0UD6R                  n[        R
                  " USS9R                  UR                  UR                  5      nU R                  XUS9u  nnUR                  UU5      nUc  U R                  UUU	UUUU
S	9nU R                  " SSUUUUS
.UD6n[        S0 UDSU R                  0D6$ )a-  
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.
Nr  rH  Tr   r   )r[  )r\  )r  r$  r%  r  r   r  r#  )r  r   r   r  r  r  r  )r   r^  rY  r  r5   r   rE   r   rD   rg  masked_scatterrS  rp  r  r  r  )r9   r  r   r   r  r  rU  rE  r$  r%  r#  r   rX  
image_maskr0  rR  
video_maskoutputss                     r<   rL   Glm4vModel.forward  s   . -t";<YZZ  557	BM#22:>BHm  !99\q9<<]=Q=QS`SfSfgL 55i_k5lMJ)88RM*22#AEIOm  !99\q9<<]=Q=QS`SfSfgL 55i_k5lMAz)88\RM77#--+- /"3 8 L %% 
%)+'
 
 ( 

((
 	
r>   )r  r  r  )r&   r&   r&   Nrt  rn   )NN)NNNNN)
NNNNNNNNNN)$rS   rT   rU   rV   r  accepts_loss_kwargsr  r2   r   r   r5   rX   r   r   r%   r  	IntTensorrO   rB  r   r   r   r  r   r   r   rS  rY  rg  rp  r   r   r  rL   rY   rZ   r[   s   @r<   r  r  |  s>   02DE  !"#,08#8# sC}%48# 	8#
  8# 8# ell"T)8#| 3726.2[3##[3 !??[3 ((4/	[3
 ((4/[3 t+[3 
u||U\\)	*[3z  1 37".. ((4/ +,	
 
+	+   2:  1 37'' ((4/ +,	
 
+	+   20 4837(6##(6 (((6 ))D0	(6
 ))D0(6\ /3.2.2/348/<<$&/ ||d*/ t+	/
 t+/ t+/ ,/ !??T1/ 
	/b ]G4 .2.204(,26,08<262648A
##d*A
 t+A
 &&-	A

 A
 ((4/A
 llT)A
 #..5A
 ((4/A
 ((4/A
 !??T1A
 +,A
 
)	)A
   5A
r>   r  c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Glm4vCausalLMOutputWithPasti  r  Nr  r  r  r  r>   r<   r{  r{    r  r>   r{  c            !         ^  \ rS rSrSS0rSrU 4S jr\ S"S\R                  S\R                  S-  S	\\   S
\\-  4S jj5       r\ S"S\R                  S\R                  S-  S	\\   S
\\-  4S jj5       r\" SSS9\\            S#S\R                  S-  S\R&                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R&                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R*                  S-  S\\R&                  -  S	\\   S
\\-  4S jj5       5       5       r          S$U 4S jjrU 4S jr S"S\R                  S-  S\R&                  S-  S
\\R&                  \R&                  4   4S jjr   S%S\S\S\R                  S-  S
\\R                  \\\4   4   4S  jjr S!r!U =r"$ )&Glm4vForConditionalGenerationi  zlm_head.weightz(model.language_model.embed_tokens.weightFc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFrb   )r1   r2   r  r  r3   re   r  r:   r  lm_headr  r   s     r<   r2   &Glm4vForConditionalGeneration.__init__  sS     '
yy!3!3!?!?ASASA^A^ejkr>   NrE  r%  r   r/   c                 <    U R                   R                  " X40 UD6$ )rG  )r  rS  )r9   rE  r%  r   s       r<   rS  0Glm4vForConditionalGeneration.get_video_features  s      zz,,-@[TZ[[r>   rU  r$  c                 <    U R                   R                  " X40 UD6$ )rW  )r  rY  )r9   rU  r$  r   s       r<   rY  0Glm4vForConditionalGeneration.get_image_features#  s     zz,,\TVTTr>   r  r  r  r  r   r   r  r  labelsr#  logits_to_keepc                    U R                   " SUUUU	U
UUUUUS.
UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nSnUb.  U R                  UX`R                  R                  R                  S9n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
    The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
    The temporal, height and width of feature shape of each video in LLM.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Glm4vForConditionalGeneration

>>> model = Glm4vForConditionalGeneration.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")
>>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

>>> messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
>>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
```)
r  rU  rE  r$  r%  r#  r   r   r  r  r   N)logitsr  r  )lossr  r  r?   r  r  r  )r  r   r   slicer  loss_functionrk   r  r  r{  r  r?   r  r  )r9   r  r   r   r  r  r  rU  rE  r$  r%  r#  r  r   rv  r?   slice_indicesr  r  s                      r<   rL   %Glm4vForConditionalGeneration.forward2  s    z ** 
% 3))/%)+'
 
  
 9C>SV8W8W~ot4]kmA}a,?@A%%VF{{OfOfOqOq%rD*#33!//))++
 	
r>   c                 p   > [         TU ]  " U4UUUUUUU	U
UUS.
UD6nU(       d  U(       a
  S US'   S US'   U$ )N)
r  r   r  r   rU  rE  r$  r%  r  is_first_iterationrU  rE  )r1   prepare_inputs_for_generation)r9   r  r  r   r  r   r  rU  rE  r$  r%  r  r   model_inputsr;   s                 r<   r  ;Glm4vForConditionalGeneration.prepare_inputs_for_generation  sf    " w<
+)'%% 3))1
 
 "i+/L(26L./r>   c                   > [         TU ]  X5      nSnUR                  S5      =nb  UR                  5       nUS:w  a5  U R                  R
                  b  US   U R                  R
                  -   nU$ SU;   a  US   R                  S   S:  a  US   n[        UR                  5      S:H  =(       a-    UR                  [        R                  [        R                  4;   nU(       a  UR                  S5      b  UR                  S5      c  UR                  S	5      b\  UR                  5        VV	s0 s H  u  pUS:w  d  M  X_M     nnn	U R                  R                  " U40 UD6u  pXR                  l        OmUR                  S5      R                  S
SS5      n
[        R                   " UR                  S   S[        R                  UR"                  S9U R                  l        US   n[        R$                  " X:/SS9nU$ s  sn	nf )Nr   r  r  r  r&   rA   r#  r$  r%  r   rB   r'  r   )r1   $_prepare_position_ids_for_generationrS  r  r  r  rP   r1  rD   r5   r   r   itemsrB  r   r   r+  r   r   )r9   inputs_tensormodel_kwargstext_positionspast_lengthcacher   is_input_idsr   r2  vision_positionsr  r;   s               r<   r  BGlm4vForConditionalGeneration._prepare_position_ids_for_generation  s    Emb !%%&788EE..0K!

 6 6 B))4tzz7M7MML ,&<+D+J+J1+MPQ+Q(5M=../14g9L9LQVQZQZ\a\f\fPg9g  !45A!!"23?<CSCSTdCeCq-9-?-?-AV-ATQQ+EUDAD-ALV,0JJ,E,Em,dWc,d)%0JJ"-77:AA!RL%*[[##A&MDXDX&DJJ"
 (	2yy.!CK Ws   /H?Hc           	         UGb  UU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  S   nOHXR                  R                  :H  nXR                  R                  :H  nXR                  R                  :H  n[        R                  " UR                  5       UR                  5       -
  SS9nUS:  nX7) -  nUR                  SS9n	UR                  SS9n
X4$ )a  
Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

Returns:
    image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
    video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
r'  ).r   r&   r   r   )r^  r5   r   rk   image_start_token_idr   r   video_start_token_idvideo_end_token_idr   r   r   )r9   r  r  is_imageis_video_startis_video_endvideo_levelinside_videostandalone_imagesimage_countsvideo_countss              r<   _get_image_nums_and_video_nums<Glm4vForConditionalGeneration._get_image_nums_and_video_nums  s   $ $,,.LL!A!A\i\p\pq H ,,.LL!A!A\i\p\pq N ,,.LL!?!?uzzZgZnZno L !KK$D$DDH&++*J*JJN$(F(FFL ll>#5#5#7,:J:J:L#LRST"Q %6 ),,,3%))a)0))r>   expand_sizeis_encoder_decoderc                    ^ ^^^^ TS:X  a  TT4$ / SQmUUUU 4S jnUU4S jnU" T5      mTb  TR                  TSS9mU" T5      mU(       a+  TR                  S5      c  [        S5      eU" TS   5      TS'   TT4$ )	Nr&   )rU  r$  rE  r%  second_per_grid_tsc           	        > TR                  SS 5      nTR                  SS 5      nTR                  TTR                  SS 5      S9u  p4S nU  GH/  nUS:X  aa  [        R                  " U[	        U5      5      nU Vs/ s H&  n[        R
                  " USS9R                  5       PM(     n	nU" X   U	T
S	9X'   Mk  US:X  a  [	        U5      n	U" X   U	T
S	9X'   M  US
:X  aa  [        R                  " U[	        U5      5      nU Vs/ s H&  n[        R
                  " USS9R                  5       PM(     n	nU" X   U	T
S	9X'   M  US:X  a  [	        U5      n	U" X   U	T
S	9X'   GM  US:X  d  GM  U" X   [	        U5      T
S	9X'   GM2     U $ s  snf s  snf )Nr$  r%  r  )r  c                     [         R                  " X5      nU/S/U R                  5       S-
  -  -   n[         R                  " U Vs/ s H  oUR                  " U6 PM     snSS9nU$ s  snf )Nr&   r   r   )r5   r)  r   r   r  )r   r   repeat_timessamplesrepeat_argssamplerr  s          r<   _repeat_interleave_samplesڋGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>._repeat_interleave_samples,  s_    ++a1+nsaeegk/BBg#VgFMM;$?g#V\]^ $Ws   A&rU  r&   r   )r   r  rE  r  )rS  r  r5   r)  r   rJ  r   )dict_to_expandr$  r%  
image_nums
video_numsr  r   r  r  r   r  r  r  r9   s             r<   "_expand_dict_for_generation_visualgGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual%  s   )--.>EN)--.>EN%)%H%H)9)9/4)P &I &"J &.(#kk.$z:JKGMTUW6uzz&a8<<>WGU*D&+W;+N' ,,":.G*D&+W;+N' 11#kk.$z:JKGMTUW6uzz&a8<<>WGU*D&+W;+N' ,,":.G*D&+W;+N' 00*D&+T*5ET_+N'7 &< "!3 V Vs   ;-F-Fc                   > U  Hw  nUS:X  a(  X   R                   S:X  a  X   R                  TSS9X'   M1  X   c  M8  [        X   [        R                  5      (       d  M[  UT;  d  Mc  X   R                  TSS9X'   My     U $ )Nr   r   r&   r   r   )r  rz  r   r5   rX   )r  r   r  visual_keyss     r<   _expand_dict_for_generation`Glm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generationR  s    %.(^-@-E-E-J*8*=*O*OP[ab*O*cN'"'3">#6EE;.*8*=*O*OP[ab*O*cN' & "!r>   r   r   encoder_outputszMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)rz  rS  r   )r9   r  r  r  r  r  r  r  s   `` ``  @r<   _expand_inputs_for_generation;Glm4vForConditionalGeneration._expand_inputs_for_generation  s     !l**w+	" +	"Z
	" :,G !33KQ3GI2<@ 12: !pqq.I,WhJi.jL*+,&&r>   )r  r  rn   )NNNNNNNNNNNr   )
NNNNTNNNNF)r&   FN)#rS   rT   rU   rV   _tied_weights_keysrx  r2   r   r5   r  r  r   r   rO   r   rS  rY  r   r   rX   r   ry  r   r{  rL   r  r  r  rs   dictr   r   r  rY   rZ   r[   s   @r<   r}  r}    s(   *,VW  37\"..\ ((4/\ +,	\
 
+	+\ \  37U''U ((4/U +,	U
 
+	+U U ]G4 .2.204(,26*.,08<262648-.Y
##d*Y
 t+Y
 &&-	Y

 Y
 ((4/Y
   4'Y
 llT)Y
 #..5Y
 ((4/Y
 ((4/Y
 !??T1Y
 ell*Y
 +,Y
 
,	,Y
   5Y
|   $L$R .26*##d*6* ||d*6* 
u||U\\)	*	6*t #(-1	V'V' !V' ##d*	V' 
uc3h/	0V' V'r>   r}  )r}  r  r  r  r  )r   )r&   )br-  r  collections.abcr   dataclassesr   typingr   r   r5   torch.nnr3   torch.nn.functionalr  r   r    r	   r  activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r    r!   r"   utils.output_capturingr#   vision_utilsr$   r%   configuration_glm4vr'   r(   r)   Moduler,   r]   ru   r   r   r   r   rX   rO   r   r   r   rW   r  r  r6  rD  rx  r  r  r  r  r  r  r  r  r  r{  r}  __all__r  r>   r<   <module>r     s  (   $ !        & ! . ) 7 / B 9 k k K F & a a 0  6 J P P Y'J299 J (J(fBII fBII (G GfRYY f"GBII GT(||+0<<>Cll
5<<%&	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2P299 Pf1 >Jryy JZ6%PC) C)L)299 )$/6 /d 
06 0  0 2? 2 2(e
+ e
P e
) e
 e
P |
% |
 |
~ 
0"8 0  0b'$8/ b'J xr>   