o
    tBh0                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZ G dd dZG dd dZe
eef Ze	e ZG dd dZ eZ!dS )    N)aliases)sha256)dumps)OptionalListTupleSet)Counter)subcompile)TOO_BIG_SEQUENCE)
mess_ratio)	iana_nameis_multi_byte_encodingunicode_rangec                   @   s
  e Zd Z	dEdededededddee fd	d
ZdefddZ	defddZ
edefddZedefddZedefddZdefddZdefddZdFddZedefddZedee fdd Zedefd!d"Zedefd#d$Zedee fd%d&Zedefd'd(Zedefd)d*Zedefd+d,Zedefd-d.Zedefd/d0Zedefd1d2Zeded  fd3d4Zedefd5d6Z edee fd7d8Z!edee fd9d:Z"dGd;d<Z#dGd=d>Z$dHd@edefdAdBZ%edefdCdDZ&dS )ICharsetMatchNpayloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesCoherenceMatchesdecoded_payloadc                 C   sF   || _ || _|| _|| _|| _d | _g | _d| _d | _d | _	|| _
d S )N        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leaves_mean_coherence_ratio_output_payload_output_encoding_string)selfr   r   r   r   r   r    r&   p/var/www/html/riverr-enterprise-integrations-main/venv/lib/python3.10/site-packages/charset_normalizer/models.py__init__   s   	
zCharsetMatch.__init__returnc                 C   s>   t |tstdt|jt| j| j|jko| j|jkS )Nz&__eq__ cannot be invoked on {} and {}.)
isinstancer   	TypeErrorformatstr	__class__encodingfingerprintr%   otherr&   r&   r'   __eq__(   s   
zCharsetMatch.__eq__c                 C   s>   t |tstt| j|j }|dk r| j|jkS | j|jk S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gz?)r*   r   
ValueErrorabschaos	coherence)r%   r2   chaos_differencer&   r&   r'   __lt__-   s   
zCharsetMatch.__lt__c                 C   s   t dt tt| dS )z
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        z=chaos_secondary_pass is deprecated and will be removed in 3.0g      ?)warningswarnDeprecationWarningr   r-   r%   r&   r&   r'   chaos_secondary_pass<   s
   z!CharsetMatch.chaos_secondary_passc                 C   s   t dt dS )zy
        Coherence ratio on the first non-latin language detected if ANY.
        Notice: Will be removed in 3.0
        z<coherence_non_latin is deprecated and will be removed in 3.0r   )r:   r;   r<   r=   r&   r&   r'   coherence_non_latinI   s   z CharsetMatch.coherence_non_latinc                 C   s4   t dt td}t|dt|  }t| S )z_
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        z2w_counter is deprecated and will be removed in 3.0z[0-9\W\n\r\t]+ )	r:   r;   r<   
re_compiler
   r-   lowerr	   split)r%   not_printable_patternstring_printable_onlyr&   r&   r'   	w_counterR   s   zCharsetMatch.w_counterc                 C   s"   | j d u rt| j| jd| _ | j S )Nstrict)r$   r-   r   r   r=   r&   r&   r'   __str__^   s   
zCharsetMatch.__str__c                 C   s   d | j| jS )Nz<CharsetMatch '{}' bytes({})>)r,   r/   r0   r=   r&   r&   r'   __repr__d   s   zCharsetMatch.__repr__r2   c                 C   s8   t |tr	|| krtd|jd |_| j| d S )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r*   r   r4   r,   r.   r$   r    appendr1   r&   r&   r'   add_submatchg   s   zCharsetMatch.add_submatchc                 C      | j S N)r   r=   r&   r&   r'   r/   n      zCharsetMatch.encodingc                 C   sD   g }t  D ]\}}| j|kr|| q| j|kr|| q|S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr/   rJ   )r%   also_known_asupr&   r&   r'   encoding_aliasesr   s   


zCharsetMatch.encoding_aliasesc                 C   rL   rM   r   r=   r&   r&   r'   bom   rN   zCharsetMatch.bomc                 C   rL   rM   rT   r=   r&   r&   r'   byte_order_mark   rN   zCharsetMatch.byte_order_markc                 C   s   dd | j D S )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        c                 S   s   g | ]}|d  qS )r   r&   ).0er&   r&   r'   
<listcomp>   s    z*CharsetMatch.languages.<locals>.<listcomp>r   r=   r&   r&   r'   r      s   zCharsetMatch.languagesc                 C   sp   | j s1d| jv r
dS ddlm}m} t| jr|| jn|| j}t|dks+d|v r-dS |d S | j d d S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiEnglishr   )mb_encoding_languagesencoding_languageszLatin BasedUnknown)r   could_be_from_charsetcharset_normalizer.cdr]   r^   r   r/   len)r%   r]   r^   r   r&   r&   r'   language   s   
zCharsetMatch.languagec                 C   rL   rM   )r   r=   r&   r&   r'   r6      rN   zCharsetMatch.chaosc                 C   s   | j sdS | j d d S )Nr   r      rZ   r=   r&   r&   r'   r7      s   zCharsetMatch.coherencec                 C      t | jd ddS Nd      )ndigits)roundr6   r=   r&   r&   r'   percent_chaos      zCharsetMatch.percent_chaosc                 C   re   rf   )rj   r7   r=   r&   r&   r'   percent_coherence   rl   zCharsetMatch.percent_coherencec                 C   rL   )z+
        Original untouched bytes.
        )r   r=   r&   r&   r'   raw   s   zCharsetMatch.rawc                 C   rL   rM   )r    r=   r&   r&   r'   submatch   rN   zCharsetMatch.submatchc                 C   s   t | jdkS )Nr   )rb   r    r=   r&   r&   r'   has_submatch   s   zCharsetMatch.has_submatchc                 C   sR   | j d ur| j S t }t| D ]}t|}|r|t| qtt|| _ | j S rM   )r   setr-   r   addsortedlist)r%   detected_ranges	characterdetected_ranger&   r&   r'   	alphabets   s   
zCharsetMatch.alphabetsc                 C   s   | j gdd | jD  S )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        c                 S   s   g | ]}|j qS r&   )r/   )rW   mr&   r&   r'   rY      s    z6CharsetMatch.could_be_from_charset.<locals>.<listcomp>)r   r    r=   r&   r&   r'   r`      s   z"CharsetMatch.could_be_from_charsetc                 C      | S z>
        Kept for BC reasons. Will be removed in 3.0.
        r&   r=   r&   r&   r'   first      zCharsetMatch.firstc                 C   rz   r{   r&   r=   r&   r&   r'   best   r}   zCharsetMatch.bestutf_8r/   c                 C   s2   | j du s
| j |kr|| _ t| |d| _| jS )z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        Nreplace)r#   r-   encoder"   )r%   r/   r&   r&   r'   output   s   zCharsetMatch.outputc                 C   s   t |   S )zw
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        )r   r   	hexdigestr=   r&   r&   r'   r0      s   zCharsetMatch.fingerprintrM   )r2   r   r)   N)r)   r   )r   )'__name__
__module____qualname__bytesr-   floatboolr   r(   r3   r9   propertyr>   r?   r	   rF   rH   rI   rK   r/   r   rS   rU   rV   r   rc   r6   r7   rk   rm   rn   ro   rp   rx   r`   r|   r~   r   r0   r&   r&   r&   r'   r      sz    



r   c                   @   s~   e Zd ZdZddee fddZdd Zdefd	d
Zde	fddZ
deddfddZded fddZded fddZdS )CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    Nresultsc                 C   s   |r	t || _d S g | _d S rM   )rs   _results)r%   r   r&   r&   r'   r(     s   zCharsetMatches.__init__c                 c   s    | j D ]}|V  qd S rM   r   )r%   resultr&   r&   r'   __iter__  s   
zCharsetMatches.__iter__r)   c                 C   sJ   t |tr
| j| S t |tr#t|d}| jD ]}||jv r"|  S qt)z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        F)r*   intr   r-   r   r`   KeyError)r%   itemr   r&   r&   r'   __getitem__
  s   





zCharsetMatches.__getitem__c                 C   s
   t | jS rM   )rb   r   r=   r&   r&   r'   __len__  s   
zCharsetMatches.__len__r   c                 C   s|   t |tstdt|jt|jtkr0| j	D ]}|j
|j
kr/|j|jkr/||  dS q| j	| t| j	| _	dS )z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r*   r   r4   r,   r-   r.   rb   rn   r   r   r0   r6   rK   rJ   rs   )r%   r   matchr&   r&   r'   rJ     s   


zCharsetMatches.appendr   c                 C   s   | j sdS | j d S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   r   r=   r&   r&   r'   r~   +  s   
zCharsetMatches.bestc                 C   s   |   S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )r~   r=   r&   r&   r'   r|   3  s   zCharsetMatches.firstrM   )r   r   r   __doc__r   r   r(   r   r   r   r   rJ   r   r~   r|   r&   r&   r&   r'   r      s    r   c                   @   sl   e Zd Zdededee dee dedee deded	ed
ee defddZe	dd Z
defddZdS )CliDetectionResultpathr/   rS   alternative_encodingsrc   rx   r   r6   r7   unicode_pathis_preferredc                 C   sF   || _ |
| _|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
d S rM   )r   r   r/   rS   r   rc   rx   r   r6   r7   r   )r%   r   r/   rS   r   rc   rx   r   r6   r7   r   r   r&   r&   r'   r(   @  s   
zCliDetectionResult.__init__c                 C   s2   | j | j| j| j| j| j| j| j| j| j	| j
dS )Nr   r/   rS   r   rc   rx   r   r6   r7   r   r   r   r=   r&   r&   r'   __dict__M  s   zCliDetectionResult.__dict__r)   c                 C   s   t | jdddS )NT   )ensure_asciiindent)r   r   r=   r&   r&   r'   to_json]  s
   zCliDetectionResult.to_jsonN)r   r   r   r-   r   r   r   r   r(   r   r   r   r&   r&   r&   r'   r   >  s
    F
r   )"r:   encodings.aliasesr   hashlibr   jsonr   typingr   r   r   r   collectionsr	   rer
   r   rA   charset_normalizer.constantr   charset_normalizer.mdr   charset_normalizer.utilsr   r   r   r   r   r-   r   CoherenceMatchr   r   CharsetNormalizerMatchr&   r&   r&   r'   <module>   s"     q<'