o
    tBh?                  
   @   s>  d dl mZ d dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZ G dd dZG dd deZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdee dee de fddZ!eddd&d ed!e"d"e de"fd#d$Z#d%S )'    )	lru_cache)OptionalList)UNICODE_SECONDARY_RANGE_KEYWORD)is_punctuation	is_symbolunicode_rangeis_accentuatedis_latinremove_accentis_separatoris_cjkis_case_variable	is_hangulis_katakanais_hiraganais_asciiis_thaic                   @   sP   e Zd ZdZdedefddZdeddfddZdd	d
Ze	de
fddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C      t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r   l/var/www/html/riverr-enterprise-integrations-main/venv/lib/python3.10/site-packages/charset_normalizer/md.pyeligible      zMessDetectorPlugin.eligibleNc                 C   r   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r   r   r   r   feed   s   zMessDetectorPlugin.feedc                 C   r   )zB
        Permit to reset the plugin to the initial state.
        r   r   r   r   r   reset   r   zMessDetectorPlugin.resetc                 C   r   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r!   r   r   r   ratio"   s   zMessDetectorPlugin.ratior   N)__name__
__module____qualname____doc__strboolr   r    r"   propertyfloatr#   r   r   r   r   r   	   s    
r   c                   @   T   e Zd Zdd ZdedefddZdeddfdd	Zdd
dZe	de
fddZdS ) TooManySymbolOrPunctuationPluginc                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr!   r   r   r   __init__-   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   r   c                 C      |  S Nisprintabler   r   r   r   r   5      z)TooManySymbolOrPunctuationPlugin.eligibleNc                 C   sd   |  j d7  _ || jkr-|dvr-t|r|  jd7  _n| du r-t|r-|  jd7  _|| _d S )N   )<>=:/&;{}[],|"F   )r1   r2   r   r/   isdigitr   r0   r   r   r   r   r    8   s   
z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r/   r1   r0   r!   r   r   r   r"   C      
z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           333333?)r1   r/   r0   )r   ratio_of_punctuationr   r   r   r#   H   s   
z&TooManySymbolOrPunctuationPlugin.ratior$   r%   r&   r'   r4   r)   r*   r   r    r"   r+   r,   r#   r   r   r   r   r.   +   s    
r.   c                   @   r-   )TooManyAccentuatedPluginc                 C      d| _ d| _d S rK   r1   _accentuated_countr!   r   r   r   r4   T      
z!TooManyAccentuatedPlugin.__init__r   r   c                 C   r5   r6   )isalphar   r   r   r   r   X   r9   z!TooManyAccentuatedPlugin.eligibleNc                 C   s,   |  j d7  _ t|r|  jd7  _d S d S Nr:   )r1   r	   rT   r   r   r   r   r    [   s   zTooManyAccentuatedPlugin.feedc                 C   rR   rK   rS   r!   r   r   r   r"   a   rU   zTooManyAccentuatedPlugin.resetc                 C   s*   | j dkrdS | j| j  }|dkr|S dS )Nr   rM   gffffff?rS   )r   ratio_of_accentuationr   r   r   r#   e   s   
zTooManyAccentuatedPlugin.ratior$   rP   r   r   r   r   rQ   R   s    
rQ   c                   @   r-   )UnprintablePluginc                 C   rR   rK   )_unprintable_countr1   r!   r   r   r   r4   o   rU   zUnprintablePlugin.__init__r   r   c                 C      dS NTr   r   r   r   r   r   s      zUnprintablePlugin.eligibleNc                 C   s4   |dvr|  du r|  jd7  _|  jd7  _d S )N>   
	Fr:   )r8   rZ   r1   r   r   r   r   r    v   s   zUnprintablePlugin.feedc                 C   s
   d| _ d S rK   )rZ   r!   r   r   r   r"   {   s   
zUnprintablePlugin.resetc                 C      | j dkrdS | jd | j  S )Nr   rM      )r1   rZ   r!   r   r   r   r#   ~      
zUnprintablePlugin.ratior$   rP   r   r   r   r   rY   m   s    
rY   c                   @   r-   )SuspiciousDuplicateAccentPluginc                 C      d| _ d| _d | _d S rK   _successive_countr1   _last_latin_characterr!   r   r   r   r4      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   r   c                 C   s   |  ot|S r6   )rV   r
   r   r   r   r   r      s   z(SuspiciousDuplicateAccentPlugin.eligibleNc                 C   st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rW   )r1   ri   r	   isupperrh   r   r   r   r   r   r       s   

z$SuspiciousDuplicateAccentPlugin.feedc                 C   rf   rK   rg   r!   r   r   r   r"      rL   z%SuspiciousDuplicateAccentPlugin.resetc                 C   rb   )Nr   rM   rI   )r1   rh   r!   r   r   r   r#      rd   z%SuspiciousDuplicateAccentPlugin.ratior$   rP   r   r   r   r   re      s    
re   c                   @   r-   )SuspiciousRangec                 C   rf   rK   )"_suspicious_successive_range_countr1   _last_printable_seenr!   r   r   r   r4      rL   zSuspiciousRange.__init__r   r   c                 C   r5   r6   r7   r   r   r   r   r      r9   zSuspiciousRange.eligibleNc                 C   sp   |  j d7  _ | st|rd | _d S | jd u r|| _d S t| j}t|}t||r3|  jd7  _|| _d S rW   )r1   isspacer   rm   r    is_suspiciously_successive_rangerl   )r   r   unicode_range_aunicode_range_br   r   r   r       s   



zSuspiciousRange.feedc                 C   rf   rK   )r1   rl   rm   r!   r   r   r   r"      rL   zSuspiciousRange.resetc                 C   s.   | j dkrdS | jd | j  }|dk rdS |S )Nr   rM   rI   g?)r1   rl   )r   ratio_of_suspicious_range_usager   r   r   r#      s   
zSuspiciousRange.ratior$   rP   r   r   r   r   rk      s    
rk   c                   @   r-   )SuperWeirdWordPluginc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )_word_count_bad_word_count_is_current_word_bad_foreign_long_watchr1   _bad_character_count_buffer_buffer_accent_countr!   r   r   r   r4      s   
zSuperWeirdWordPlugin.__init__r   r   c                 C   r[   r\   r   r   r   r   r   r      r]   zSuperWeirdWordPlugin.eligibleNc                 C   s  |  rFd| j|g| _t|r|  jd7  _| jdu rDt|du rDt|du rDt|du rDt	|du rDt
|du rDt|du rDd| _d S | jsKd S | sWt|sWt|r| jr|  jd7  _t| j}|  j|7  _|dkr{| j| dkr{d| _|dkr| jrd| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d| _d S |d	vr| du rt|rd| _|  j|7  _d S d S d S d S )
Nrt   r:   FT   rN      r   >   -r;   r=   r<   )rV   joinrz   r	   r{   rx   r
   r   r   r   r   r   rn   r   r   ru   lenr1   rw   rv   ry   rJ   r   )r   r   buffer_lengthr   r   r   r       s8   R

zSuperWeirdWordPlugin.feedc                 C   s.   d| _ d| _d| _d| _d| _d| _d| _d S )Nrt   Fr   )rz   rw   rx   rv   ru   r1   ry   r!   r   r   r   r"   
     
zSuperWeirdWordPlugin.resetc                 C   s   | j dkrdS | j| j S )N
   rM   )ru   ry   r1   r!   r   r   r   r#        
zSuperWeirdWordPlugin.ratior$   rP   r   r   r   r   rs      s    
!	rs   c                   @   sX   e Zd ZdZdd ZdedefddZdeddfd	d
ZdddZ	e
defddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected.
    Searching for the overuse of '丅' and '丄'.
    c                 C   rR   rK   _wrong_stop_count_cjk_character_countr!   r   r   r   r4   !  rU   zCjkInvalidStopPlugin.__init__r   r   c                 C   r[   r\   r   r   r   r   r   r   %  r]   zCjkInvalidStopPlugin.eligibleNc                 C   s8   |dv r|  j d7  _ d S t|r|  jd7  _d S d S )N)u   丅u   丄r:   )r   r   r   r   r   r   r   r    (  s   zCjkInvalidStopPlugin.feedc                 C   rR   rK   r   r!   r   r   r   r"   /  rU   zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   rM   )r   r   r!   r   r   r   r#   3  s   
zCjkInvalidStopPlugin.ratior$   )r%   r&   r'   r(   r4   r)   r*   r   r    r"   r+   r,   r#   r   r   r   r   r     s    
r   c                   @   r-   )ArchaicUpperLowerPluginc                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr1   _last_alpha_seen_current_ascii_onlyr!   r   r   r   r4   <  s   
z ArchaicUpperLowerPlugin.__init__r   r   c                 C   r[   r\   r   r   r   r   r   r   I  r]   z ArchaicUpperLowerPlugin.eligibleNc                 C   s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQt
|du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r:   TrI   )rV   r   r   rJ   r   r   r   r   r   r1   r   rj   islower)r   r   is_concerned	chunk_sepr   r   r   r    L  s0    
$

zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r1   r   r   r   r   r   r   r!   r   r   r   r"   n  r   zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   rM   )r1   r   r!   r   r   r   r#   w  r   zArchaicUpperLowerPlugin.ratior$   rP   r   r   r   r   r   :  s    
"	r   rp   rq   r   c                 C   s<  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS |  d| d}}|D ]}|tv r8q1||v r? dS q1| dv rJ|dv rJdS | dv sR|dv r\d| v sZd|v r\dS d	| v sdd	|v rxd| v sld|v rndS | d
ksv|d
krxdS d| v sd|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons )KatakanaHiraganaCJKHangulzBasic LatinPunctuationForms)splitr   )rp   rq   keywords_range_akeywords_range_belr   r   r   ro     s>    ro   i   )maxsize皙?Fdecoded_sequencemaximum_thresholddebugc                 C   s   g }t  D ]}||  qt| }d}|dk rd}n	|dkr#d}nd}t| td|D ]2\}}	|D ]}
|
|r?|
| q3|	dkrJ|	| dksP|	|d kr_td	d
 |D }||kr_ nq-|rn|D ]	}t	|j
|j qdt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    rM   i       i   r      r   r:   c                 S   s   g | ]}|j qS r   )r#   ).0dtr   r   r   
<listcomp>  s    zmess_ratio.<locals>.<listcomp>   )r   __subclasses__appendr   zipranger   r    sumprint	__class__r#   round)r   r   r   	detectorsmd_classlengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorr   r   r   r   
mess_ratio  sH   

 r   N)r   F)$	functoolsr   typingr   r   charset_normalizer.constantr   charset_normalizer.utilsr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r.   rQ   rY   re   rk   rs   r   r   r)   r*   ro   r,   r   r   r   r   r   <module>   s     @"'#/CE0"