
    
3j04                     z    S SK r S SKrS SKrS SKJr  S SKJr  S SKJrJ	r	  S SK
Jr   " S S5      r " S S	\5      rg)
    N)SentencePieceProcessor)PreTrainedTokenizer)BatchEncodingEncodedInput)PaddingStrategyc            
           \ rS rSrS\4S jrSS\4S jjrSS\S\S\S\\	   4S	 jjr
S
\\	   S\4S jrS\\   S\4S jrS rS rSrg)SPTokenizer   
model_pathc                 .   [         R                  R                  U5      (       d   U5       e[        US9U l        U R                  R                  5       U l        U R                  R                  5       U l        U R                  R                  5       U l        U R                  R                  5       U l
        U R                  R                  5       U R                  R                  5       :X  d   e/ SQn/ SQU-   n0 U l        0 U l        U HI  nU R                  U R                  U'   X@R                  U R                  '   U =R                  S-  sl        MK     SR                  U Vs/ s H  n[        R                   " U5      PM     sn5      U l        g s  snf )N)
model_file)z
<|system|>z<|user|><|assistant|>z<|observation|>)z[MASK][gMASK]z[sMASK]sopeop   |)ospathisfiler   sp_model
vocab_sizen_wordsbos_ideos_idunk_idpad_idget_piece_sizespecial_tokensindex_special_tokensjoinreescaperole_special_token_expression)selfr   role_special_tokensr   tokens        ^/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/pipelines/kolors/tokenizer.py__init__SPTokenizer.__init__   s6   ww~~j))5:5).*E !MM446==//1==//1==//1}}'')T]]-I-I-KKKK\GJ]] $&!#E)-D&6;%%dll3LLAL $ .1XXUh6iUhEryy7GUh6i-j*6is   % Fsc           	      0   U(       a  Sn/ n[         R                  " U R                  U5       H  nX5R                  5       :  a:  UR	                  U R
                  R                  XUR                  5        5      5        UR                  XR                  5       UR                  5        5        UR                  5       nM     U[        U5      :  a,  UR	                  U R
                  R                  XS  5      5        U$ U R
                  R                  U5      $ )Nr   )
r"   finditerr$   startextendr   EncodeAsPiecesappendendlen)r%   r+   encode_special_tokens
last_indextmatchs         r(   tokenizeSPTokenizer.tokenize0   s     JAT%G%GK-HHT]]99!:WXY;;=599;78"YY[
	 L
 CF"55anEFH==//22    boseosreturnc                     [        U[        5      (       d   eU R                  R                  U5      nU(       a  U R                  /U-   nU(       a  X@R
                  /-   nU$ N)
isinstancestrr   encoder   r   )r%   r+   r;   r<   r6   s        r(   rB   SPTokenizer.encode?   sR    !S!!!!MM  #!A[[M!Ar:   r6   c                    S/ p2U H\  nX@R                   ;   a9  U(       a  X R                  R                  U5      -  n/ nX R                   U   -  nMK  UR                  U5        M^     U(       a  X R                  R                  U5      -  nU$ )N )r    r   decoder1   )r%   r6   textbufferr'   s        r(   rF   SPTokenizer.decodeH   s~    2fE111MM0088DF11%88e$  MM((00Dr:   tokensc                 <    U R                   R                  U5      nU$ r?   )r   DecodePieces)r%   rJ   rG   s      r(   decode_tokensSPTokenizer.decode_tokensV   s    }}))&1r:   c                 t    XR                   ;   a  U R                   U   $ U R                  R                  U5      $ z0Converts a token (str) in an id using the vocab.)r   r   	PieceToIdr%   r'   s     r(   convert_token_to_idSPTokenizer.convert_token_to_idZ   s4    '''&&u--}}&&u--r:   c                     XR                   ;   a  U R                   U   $ XR                  U R                  U R                  4;   d  US:  a  gU R                  R                  U5      $ )=Converts an index (integer) in a token (str) using the vocab.r   rE   )r    r   r   r   r   	IdToPiecer%   indexs     r(   convert_id_to_tokenSPTokenizer.convert_id_to_token`   sV    ---,,U33[[$++t{{;;uqy}}&&u--r:   )r   r   r    r   r   r$   r   r   N)F)FF)__name__
__module____qualname____firstlineno__rA   r)   r8   boollistintrB   rF   rM   rS   rZ   __static_attributes__ r:   r(   r	   r	      sy    k3 k,3# 3 $ T d3i S	 c DI # ..r:   r	   c                   8  ^  \ rS rSrSS0r/ SQr   S(U 4S jjrS r\S\	4S j5       r
\
R                  S	\	4S
 j5       r
\S\	4S j5       r\R                  S	\	4S j5       r\S 5       r\S\	4S j5       r\R                  S	\	4S j5       r\S 5       r\S 5       rS rS rS rS rS\\	   S\	4S jrS)S jrS rS rS*S jr S)S\\   S\\   S-  S\\   4S jjrS\R<                  SSS4S \\	\ 4   \!-  S!\S-  S"\S#\S-  S$\"S-  S%\"S-  S\4S& jjr#S'r$U =r%$ )+ChatGLMTokenizeri   
vocab_fileztokenizer.model)	input_idsattention_maskposition_idsc                   > SU l         Xl        [        U5      U l        U R                  R                  U R                  R
                  U R                  R                  S.U l        X@l        [        TU ](  " SUUUS.UD6  g )NGLMTokenizer)z<bos><eos><pad>)padding_sideclean_up_tokenization_spacesr4   rd   )namerh   r	   	tokenizerr   r   r   r   r4   superr)   )r%   rh   rp   rq   r4   kwargs	__class__s         r(   r)   ChatGLMTokenizer.__init__n   s{     #	$$Z0^^**^^**^^**

 &;" 	
%)E"7	
 		
r:   c                     XR                   ;   a  U R                   U   $ XR                  R                   ;   d   U SU R                   35       eU R                  R                   U   $ )Nz is not a special token for )r   rs   rr   rR   s     r(   get_commandChatGLMTokenizer.get_command   sc    '''&&u--555h%@\]a]f]f\g7hh5~~,,U33r:   r=   c                     gNz<unk>rd   r%   s    r(   	unk_tokenChatGLMTokenizer.unk_token       r:   valuec                     Xl         g r?   )
_unk_tokenr%   r   s     r(   r~   r          r:   c                     gr|   rd   r}   s    r(   	pad_tokenChatGLMTokenizer.pad_token   r   r:   c                     Xl         g r?   )
_pad_tokenr   s     r(   r   r      r   r:   c                 $    U R                  S5      $ )Nro   ry   r}   s    r(   pad_token_idChatGLMTokenizer.pad_token_id       ((r:   c                     g)Nz</s>rd   r}   s    r(   	eos_tokenChatGLMTokenizer.eos_token   s    r:   c                     Xl         g r?   )
_eos_tokenr   s     r(   r   r      r   r:   c                 $    U R                  S5      $ )Nrn   r   r}   s    r(   eos_token_idChatGLMTokenizer.eos_token_id   r   r:   c                 .    U R                   R                  $ r?   )rs   r   r}   s    r(   r   ChatGLMTokenizer.vocab_size   s    ~~%%%r:   c                     [        U R                  5       Vs0 s H  oR                  U5      U_M     nnUR                  U R                  5        U$ s  snf )zReturns vocab as a dict)ranger   _convert_id_to_tokenupdateadded_tokens_encoder)r%   ivocabs      r(   	get_vocabChatGLMTokenizer.get_vocab   sL    :?:PQ:PQ**1-q0:PQT../ Rs   Ac                 H    U R                   R                  XR                  S9$ )N)r4   )rs   r8   r4   )r%   rG   ru   s      r(   	_tokenizeChatGLMTokenizer._tokenize   s     ~~&&tC]C]&^^r:   c                 8    U R                   R                  U5      $ rP   )rs   rS   rR   s     r(   _convert_token_to_id%ChatGLMTokenizer._convert_token_to_id       ~~11%88r:   c                 8    U R                   R                  U5      $ )rV   )rs   rZ   rX   s     r(   r   %ChatGLMTokenizer._convert_id_to_token   r   r:   rJ   c                 8    U R                   R                  U5      $ r?   )rs   rM   )r%   rJ   s     r(   convert_tokens_to_string)ChatGLMTokenizer.convert_tokens_to_string   s    ~~++F33r:   Nc                    [         R                  R                  U5      (       a-  [         R                  R                  XR                  S   5      nOUn[        U R                  S5       nUR                  5       nSSS5        [        US5       nUR                  W5        SSS5        U4$ ! , (       d  f       N7= f! , (       d  f       U4$ = f)a9  
Save the vocabulary and special tokens file to a directory.

Args:
    save_directory (`str`):
        The directory in which to save the vocabulary.
    filename_prefix (`str`, *optional*):
        An optional prefix to add to the named of the saved files.

Returns:
    `tuple(str)`: Paths to the files saved.
rh   rbNwb)	r   r   isdirr!   vocab_files_namesopenrh   readwrite)r%   save_directoryfilename_prefixrh   fin	proto_strwriters          r(   save_vocabulary ChatGLMTokenizer.save_vocabulary   s     77==((n6L6L\6Z[J'J$//4(C
I ) *d#vLL# $ } )( $# }s   *B,B=,
B:=
Cc                 J    U R                  S5      U R                  S5      /nU$ )Nr   r   r   )r%   prefix_tokenss     r(   get_prefix_tokens"ChatGLMTokenizer.get_prefix_tokens   s(    )))4d6F6Fu6MNr:   c                     US;   d   U5       eU R                  SU S35      /U R                  R                  U S35      -   nU R                  R                  U5      nXE-   nU$ )N)systemuser	assistantobservationz<|z|>
)ry   rs   rB   )r%   rolemetadatamessagerole_tokensmessage_tokensrJ   s          r(   build_single_message%ChatGLMTokenizer.build_single_message   sp    EEKtKE''"TF"67$..:O:OS[R\\^P_:``..w7-r:   c           
         Uc  / n/ nU Hk  nUS   nUS   S:X  a%  SU;   a  US-   [         R                  " US   SSS9-   nUR                  U R                  US   UR	                  S	S
5      U5      5        Mm     UR                  U R                  US
U5      5        UR                  U R                  S5      /5        U R                  U/SSS9$ )Ncontentr   r   toolsr      F)indentensure_asciir   rE   r   ptT)return_tensorsis_split_into_words)jsondumpsr/   r   getry   batch_encode_plus)r%   queryhistoryr   ri   itemr   s          r(   build_chat_input!ChatGLMTokenizer.build_chat_input   s    ?G	D9oGF|x'GtO!D.4::d7mA\a+bbT66tF|TXXjZ\E]_fgh	 
 	224UCD$**?;<=%%yk$\`%aar:   token_ids_0token_ids_1c                 b    U R                  5       nX1-   nUb  X-   U R                  S5      /-   nU$ )a6  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`

Args:
    token_ids_0 (`list[int]`):
        list of IDs to which the special tokens will be added.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
rn   )r   ry   )r%   r   r   r   s       r(    build_inputs_with_special_tokens1ChatGLMTokenizer.build_inputs_with_special_tokens   sA    & ..0#1"%3t7G7G7P6QQKr:   encoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskrp   c                 :   U R                   S:X  d   eXR                  S      n[        U5      nU[        R                  :X  a  [        U5      nUb  Ub  X$-  S:w  a
  X$-  S-   U-  nU[        R
                  :g  =(       a    [        U5      U:g  n	SU;  a	  S/U-  US'   SU;  a  [        [        U5      5      US'   U	(       aZ  U[        U5      -
  n
SU;   a  S/U
-  US   -   US'   SU;   a  S/U
-  US   -   US'   U R                  /U
-  U-   XR                  S   '   U$ )a  
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

Args:
    encoded_inputs:
        Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
    max_length: maximum length of the returned list and optionally padding length (see below).
        Will truncate by taking into account the special tokens.
    padding_strategy: PaddingStrategy to use for padding.

        - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
        - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
        - PaddingStrategy.DO_NOT_PAD: Do not pad
        The tokenizer padding sides are defined in self.padding_side:

            - 'left': pads on the left of the sequences
            - 'right': pads on the right of the sequences
    pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
        This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
        `>= 7.5` (Volta).
    padding_side (`str`, *optional*):
        The side on which the model should have padding applied. Should be selected between ['right', 'left'].
        Default value is picked from the class attribute of the same name.
    return_attention_mask:
        (optional) Set to False to avoid returning attention mask (default: set to model specifics)
leftr   r   rj   rk   )	rp   model_input_namesr3   r   LONGEST
DO_NOT_PADra   r   r   )r%   r   r   r   r   r   rp   required_input
seq_lengthneeds_to_be_padded
differences              r(   _padChatGLMTokenizer._pad  s[   H   F***'(>(>q(AB(
666^,J!&8&D*JimnJn%;q@DVVJ-1K1KKqPSTbPcgqPq >101sZ/?N+,/-1%
2C-DN>*#c.&99J>14533CnUeFf3f/0/23z1ANSaDb1b~.9=9J9J8Kj8X[i8iN11!45r:   )r   r   r   r4   rr   r   rs   rh   )r   FFr?   )Nr   )&r\   r]   r^   r_   r   r   r)   ry   propertyrA   r~   setterr   r   r   r   r   r   r   r   r   ra   r   r   r   r   r   rb   r   r   r   dictr   r   r`   r   rc   __classcell__)rv   s   @r(   rf   rf   i   s   %'89G
 %*#
24 3    s     3    s     ) ) 3    s     ) ) & &_994tCy 4S 44b GK9379t3C	c8 "&,;,F,F)--1$(AS,./-?A $JA *	A
  $JA  $d{A TkA 
A Ar:   rf   )r   r   r"   sentencepiecer   transformersr   $transformers.tokenization_utils_baser   r   transformers.utilsr   r	   rf   rd   r:   r(   <module>r      s8     	 	 0 , L .M. M.`h* hr:   