
    0jF                     N    S SK Jr  S SKrS SKrS SKJr  SSKJr   " S S\5      r	g)    )defaultdictN)xrange   )NGramc                       \ rS rSrSrSr\R                  " S5      r\R                  " S5      r	SS jr
S rS	 rS
 rSrg)LangProfile
      i z
^[A-Za-z]$z.*[A-Za-z].*Nc                     [        [        5      U l        Ub  U R                  R                  U5        Uc  S/[        R
                  -  nXl        X0l        g )Nr   )r   intfrequpdater   N_GRAMnamen_words)selfr   r   r   s       W/home/wildlama/miniconda3/lib/python3.13/site-packages/langdetect/utils/lang_profile.py__init__LangProfile.__init__   sE    $	IIT"?cELL(G	    c                     U R                   b  Uc  g[        U5      nUS:  d  U[        R                  :  a  gU R                  US-
  ==   S-  ss'   U R
                  U==   S-  ss'   g)zAdd n-gram to profile.Nr   )r   lenr   r   r   r   )r   gramlengths      r   addLangProfile.add   s[    99TA:%,,.VaZ A% 		$1r   c                    U R                   c  g[        U R                  S   U R                  -  U R                  5      nSn[        [        R                  " U R                  5      5       Hb  u  p4XA::  a2  U R                  [        U5      S-
  ==   U-  ss'   U R                  U	 M<  U R                  R                  U5      (       d  M^  X$-  nMd     X R                  S   S-  :  a  [        [        R                  " U R                  5      5       HW  u  p4U R                  R                  U5      (       d  M'  U R                  [        U5      S-
  ==   U-  ss'   U R                  U	 MY     gg)zAEliminate below less frequency n-grams and noise Latin alphabets.Nr   r      )r   maxr   LESS_FREQ_RATIOMINIMUM_FREQlistsix	iteritemsr   r   ROMAN_CHAR_REmatchROMAN_SUBSTR_RE)r   	thresholdromankeycounts        r   omit_less_freqLangProfile.omit_less_freq&   s   99Q4+?+??ARARS	s}}TYY78JC!SXaZ(E1(IIcN##))#.. 9 <<?a''"3==#;<
''--c22LLS!,5,		# = (r   c                    Uc  g[         R                  " U5      n[        5       nU HX  nUR                  U5        [        S[         R                  S-   5       H#  nU R                  UR                  U5      5        M%     MZ     g)zxUpdate the language profile with (fragmented) text.
Extract n-grams from text and add their frequency into the profile.
Nr   )r   normalize_viadd_charr   r   r   get)r   textr   chns        r   r   LangProfile.update;   sf     <!!$'wBMM"Au||A~.!% / r   )r   r   r   )NNN)__name__
__module____qualname____firstlineno__r!   r    recompiler%   r'   r   r   r,   r   __static_attributes__ r   r   r   r   
   s<    LOJJ}-Mjj1O	'*&r   r   )
collectionsr   r:   r#   	six.movesr   ngramr   objectr   r=   r   r   <module>rB      s     # 	 
  <&& <&r   