o
    tBhE=                     @   s  d dl mZmZ d dlmZmZmZmZmZ zd dl	m
Z
 W n ey-   eedf Z
Y nw d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d d	lZd d
lmZmZmZmZmZmZ d dl m!Z!m"Z"m#Z#m$Z$ e%dZ&e&'ej( e) Z*e*+e,d e&-e* 									d&de.de/de/de0dee dee de1de1defddZ2									d&dede/de/de0dee dee de1de1defdd Z3									d&d!e
de/de/de0dee dee de1de1defd"d#Z4d'd!e
de/de/de0dee dee de1defd$d%Z5d	S )(    )splitextbasename)ListBinaryIOOptionalSetUnion)PathLikezos.PathLike[str])TOO_SMALL_SEQUENCETOO_BIG_SEQUENCEIANA_SUPPORTED)
mess_ratio)CharsetMatchesCharsetMatch)warnN)any_specified_encodingis_multi_byte_encodingidentify_sig_or_bomshould_strip_sig_or_bomis_cp_similar	iana_name)coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratioscharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc           /      C   s6  |s	t tj nt tj t| }|dkr(t d tt| dddg dgS |dur=t dd		| d
d |D }ng }|durTt dd		| dd |D }ng }||| krht d||| d}|}|dkrx|| |k rxt
|| }t| tk }	t| tk}
|	rtd| g }|du rt| nd}|dur|| t d| t }g }g }d}d}d}d}d}t }t| \}}|dur|| t dt|| |d d|vr|d |t D ]`}|r||vrq|r||v rq||v rq|| d}||k}|ot|}|dv r|du rt d| qzt|}W n ttfy7   t d| Y qw z9|
r]|du r]t|du rO| dt
d n	| t|t
d |d nt|du re| n| t|d |d}W n= ty } zt d|t| || |s|d7 }W Y d}~qd}~w ty   || |s|d7 }Y qw d}|D ]}t||rd} nq|rt d|| qt|du rdnt||t
|| } |o|duot||k }!|!rt d| t
t| d }"|"dk rd}"d}#g }$g }%| D ]E}&| |&|&|  }'|r#|du r#||' }'|'j |dd }(|$|( |%t!|(| |%d! |krB|#d7 }#|#|"ksO|rQ|du rQ nq|%r_t"|%t|% })nd})|)|ksk|#|"kr|| |sw|d7 }t d"||#t#|)d# d$d% |dd|fv rt| ||dg |}*||kr|*}n
|dkr|*}n|*}qt d&|t#|)d# d$d% |st$|}+nt%|}+|+rt d'|t|+ g },|$D ]}(t&|(d(|+rd)	|+nd}-|,|- qt'|,}.|.rt d*|.| |t| ||)||.| ||ddfv r#|)d(k r#t d+| t|| g  S ||kr7t d,| t|| g  S |d! j(rGt d-||| j) qt|dkr|sX|sX|r]t d. |rnt d/|j* || |S |rv|du s|r|j+|j+krt d0 || |S |rt d1 || |S )2aD  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    r   zXGiven content is empty, stopping the process very early, returning empty utf_8 str matchutf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S      g | ]}t |d qS Fr   .0cp r0   m/var/www/html/riverr-enterprise-integrations-main/venv/lib/python3.10/site-packages/charset_normalizer/api.py
<listcomp>N       zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S   r*   r+   r,   r-   r0   r0   r1   r2   W   r3   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.   z>Trying to detect encoding from a tiny portion of ({}) byte(s).Tz@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      ignore)errorszc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z0%s is most likely the one. Stopping the process.z[%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.z:Using %s code page we detected the following languages: %szONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z#%s will be used as a fallback matchz&utf_8 will be used as a fallback matchz&ascii will be used as a fallback match),loggersetLevelloggingCRITICALINFOlenwarningr   r   joinintr
   r   r   formatr   appendinfosetr   r   addr   r   ModuleNotFoundErrorImportErrordebugstrUnicodeDecodeErrorLookupErrorr   rangedecoder   sumroundr   r   r   r   	languages
_languagesr8   fingerprint)/r   r    r!   r"   r#   r$   r%   r&   lengthis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_asciifallback_u8fallback_specifiedsingle_byte_hard_failure_countsingle_byte_soft_failure_countresultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_count	md_chunks	md_ratiosicut_sequencechunkmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergedr0   r0   r1   
from_bytes   s  





,











$



r   fpc              	   C   s   t |  |||||||S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r   read)r   r    r!   r"   r#   r$   r%   r&   r0   r0   r1   from_fpb  s   r   pathc           	   
   C   sD   t | d}t||||||||W  d   S 1 sw   Y  dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )	r   r    r!   r"   r#   r$   r%   r&   r   r0   r0   r1   	from_path|  s   $r   c              	   C   s   t | ||||||}t| }tt|}	t|dkr!td|| }
|	d  d|
j 7  < t	d| 
|d|	d}||
  W d   |
S 1 sRw   Y  |
S )zi
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r)   wbN)r   r   listr   rG   IOErrorrK   bestr8   r   replacerI   writeoutput)r   r    r!   r"   r#   r$   r%   rj   filenametarget_extensionsresultr   r0   r0   r1   	normalize  s.   
 
r   )r   r   r   NNTF)r   r   r   NNT)6os.pathr   r   typingr   r   r   r   r   osr	   rQ   rS   charset_normalizer.constantr
   r   r   charset_normalizer.mdr   charset_normalizer.modelsr   r   warningsr   rD   charset_normalizer.utilsr   r   r   r   r   r   charset_normalizer.cdr   r   r   r   	getLoggerrB   rC   DEBUGStreamHandlerhandlersetFormatter	Formatter
addHandlerbytesrJ   floatboolr   r   r   r   r0   r0   r0   r1   <module>   s     

	
  K	
	
8