
    3j !              
          S SK r S SKrS SKJr  \R
                  S\R                  S\R                  S\R                  4S j5       r\R
                  S 5       rSS\ R                  S\ R                  S	\ R                  S\
4S
 jjrS\ R                  S\ R                  S\ R                  4S jrS\ R                  S\ R                  S\ R                  S\\ R                  \ R                  4   4S jrS\ R                  S\ R                  S\ R                  4S jrS\ R                  S\ R                  S\ R                  S\\ R                  \ R                  4   4S jrg)    Ncompute_dtype
block_size
split_halfc                    [         R                  " S5      nUS-  nXV-  U-  U-  nUU-  nU[         R                  " SU5      -   nUU:  nUU-  nUU-  n U U-  n!U U-  n U U-  n"U U-  n#U(       a  Un$UU-   n%OUS-  n$US-  S-   n%U#U-  U"U-  -   U!U-  -   U$U-  -   n&U#U-  U"U-  -   U!U-  -   U%U-  -   n'[         R                  " U	S:H  SU#5      n([         R                  " U
S:H  SU"5      n)[         R                  " US:H  SU!5      n*U(U-  U)U-  -   U*U-  -   UU-  -   n+U+SU-  -   SU-  -   n,U+SU-  -   SU-  -   n-U+SU-  -   SU-  -   n.U+SU-  -   SU-  -   n/[         R                  " UU,-   USS9n0[         R                  " UU--   USS9n1[         R                  " UU.-   USS9n2[         R                  " UU/-   USS9n3[        XUU0U1U2U3U&U'U5
        Ub  [        XUU0U1U2U3U&U'U5
        gg)aH  Triton kernel for RoPE (Rotary Position Embedding) with flexible layout.

Args:
    xq_ptr: Query tensor pointer (batch, dim1, dim2, head_dim)
    xk_ptr: Key tensor pointer (batch, dim1, dim2, head_dim)
    freqs_ptr: Frequency tensor pointer (batch, dim1, dim2, head_dim//2, 2, 2)
    xq_out_ptr: Output query tensor pointer
    xk_out_ptr: Output key tensor pointer
    batch, dim1, dim2, head_dim: Tensor dimensions (dim1/dim2 can be heads/seq in any order)
    freqs_batch, freqs_dim1, freqs_dim2: Frequency tensor dimensions (1 for broadcasting)
    stride_*: Stride information for memory access (enables layout flexibility)
    compute_dtype: Data type for computation (from freqs_cis)
    block_size: Number of elements to process per block
    split_half: if True, pair k uses elements [k] and [k + n_pairs]; else uses [2k, 2k+1]
r                 maskotherN)tl
program_idarangewhereload_apply_freq_tile)4xq_ptrxk_ptr	freqs_ptr
xq_out_ptr
xk_out_ptrbatchdim1dim2head_dimfreqs_batch
freqs_dim1
freqs_dim2stride_x_batchstride_x_dim1stride_x_dim2stride_x_dimstride_freqs_batchstride_freqs_dim1stride_freqs_dim2stride_freqs_dimstride_freqs_rotstride_freqs_pairr   r   r   pidn_pairstotal_elementsblock_startoffsetsr   pair_idxtempdim2_idxdim1_idx	batch_idx	dim_idx_0	dim_idx_1
x_offset_0
x_offset_1freqs_batch_idxfreqs_dim1_idxfreqs_dim2_idx
freqs_basefreqs_00_offsetfreqs_01_offsetfreqs_10_offsetfreqs_11_offsetfreqs_00freqs_01freqs_10freqs_11s4                                                       \/home/wildlama/miniconda3/lib/python3.13/site-packages/comfy_kitchen/backends/triton/rope.pyapply_rope_kernelrD      s   X --
C !mG\D(72N 
"KBIIa44G^#D  HgDd{H4<Dd{HI 	w&	 qL	qL1$	 n,m+,m+, |+,J n,m+,m+, |+,J hh{a/I>OXXjAoq(;NXXjAoq(;N "$66 #445 #445 --.J !1'7#77!>O:OOO 1'7#77!>O:OOO 1'7#77!>O:OOO 1'7#77!>O:OOOwwy?2SIHwwy?2SIHwwy?2SIHwwy?2SIHVx8XWacmo|}T8XxQY[egq  tA  	B     c
                     [         R                  " X-   USS9R                  U	5      n
[         R                  " X-   USS9R                  U	5      nX:-  XK-  -   nXZ-  Xk-  -   n[         R                  " X-   XS9  [         R                  " X-   XS9  g )Nr	   r
   )r   )r   r   tostore)x_ptr	x_out_ptrr   r?   r@   rA   rB   r5   r6   r   x_0x_1xq_out_0xq_out_1s                 rC   r   r      s     ''%$4s
;
>
>}
MC
''%$4s
;
>
>}
MC ~.H~.H HHY#X9HHY#X9rE   x1	freqs_cisx2c                    U R                   u  pEpgUR                   S   UR                   S   UR                   S   pnU R                  5       (       d  U R                  5       n UR                  5       (       d  UR                  5       n[        R                  " U 5      nS nUS-  nXE-  U-  U-  nUS:  a  SnOUS:  a  SnOSn[
        R                  " X5      4nU R                  5       u  nnnnUR                  5       n[        R                  [        R                  [        R                  [        R                  [        R                  [        R                  0nUR                  UR                  [        R                  5      nUb;  UR                  5       (       d  UR                  5       n[        R                  " U5      n[        U   " U UUUUUUUUUU	U
UUUUUS   US   US   US	   US
   US   UUUS9  X4$ )Nr   r   r   i      i   i   i            )r   r   r   )shapeis_contiguous
contiguoustorch
empty_liketritoncdivstridefloat32r   float16bfloat16getdtyperD   )rO   rP   rQ   r   r   r   r   r   r   r   r   x1_outx2_outr*   r+   r   gridr   r    r!   r"   stride_freqs	dtype_mapr   s                           rC   _apply_roperi      s   "$((E*3//!*<iooa>PR[RaRabcRdZK ]]_""$$((*	b!FF !mG\D(72N 
	%	

 KK35D BD>NM=,##%L 	rzzrzzI
 MM)//2::>M	~!!B!!"%d

QQQQQQ#38 >rE   xreturnc                      [        X5      u  p#U$ Nri   rj   rP   x_out_s       rC   apply_rope1rr      s    1(HELrE   xqxkc                     [        XU5      $ rm   rn   rs   rt   rP   s      rC   
apply_roperw      s     rb))rE   c                     [        XSS9u  p#U$ NT)r   rn   ro   s       rC   apply_rope_split_half1rz      s    1D9HELrE   c                     [        XUSS9$ ry   rn   rv   s      rC   apply_rope_split_halfr|      s     rbT::rE   )NF)rZ   r\   triton.languagelanguager   jit	constexprrD   r   Tensorboolri   rr   tuplerw   rz   r|    rE   rC   <module>r      s{       jB. <</jB0 1jB2 3jB jBX : :IELL IU\\ Iu|| I`d IX5<< ELL U\\ 
**,,*38<<*
5<<%&*ell u||  
;;,,;38<<;
5<<%&;rE   