a
    xd.                  	   @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ ed	ejZed
ejZedejZedejejB ZedejejB ZedejZdZd6eee e eedddZ!d7eee e dddZ"d8eeee edddZ#edejZ$d9eee edddZ%d:eee ee ee ed d!d"Z&d;eee ee ed#d$d%Z'd<eee eee ed'd(d)Z(d=eee e ee edd*d+Z)d>eeeed,d-d.Z*d?eeeee e
ee+ ee f d0d1d2Z,eed3d4d5Z-dS )@z(
Functions for dealing with markup text
    N)name2codepoint)IterableMatchAnyStrOptionalPatternTupleUnion)urljoin)
to_unicode)safe_url_string)
StrOrByteszI&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)z<[a-zA-Z\/!].*?>z5<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']z}<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)z<meta\s[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)[^>]*?\shttp-equiv\s*=[^>]*refreshz<((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))z 	
 Tutf-8)textkeepremove_illegalencodingreturnc                    s,   t t td fdd}t|t| |S )u  Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    'Price: \xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    mr   c              	      s   |   }d }|dr&t|d d}nZ|dr@t|d d}n@|dr|d }|  v rh| dS t|p~t| }|d urz8d|  krdkrn nt|fd	W S t|W S W n t	t
fy   Y n0 r|d
rdS | dS )Ndec
   hex   namedr         cp1252Z	semicolon )	groupdictgetintlowergroupr   bytesdecodechr
ValueErrorOverflowError)r   groupsnumberentity_namer   r   r   B/var/www/html/Ranjet/env/lib/python3.9/site-packages/w3lib/html.pyconvert_entityE   s*    



z(replace_entities.<locals>.convert_entity)r   str_ent_resubr   )r   r   r   r   r/   r   r-   r.   replace_entities$   s    !r3   )r   r   r   c                 C   s   t tt| |S N)boolr1   searchr   )r   r   r   r   r.   has_entitiesf   s    r7   r   )r   tokenr   r   c                 C   s   t |t| |S )ac  Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags('This text contains <a>some tag</a>')
    'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\xe7ais</b></p>', ' -- ', 'latin-1')
    ' -- Je ne parle pas  -- fran\xe7ais --  -- '
    >>>

    )_tag_rer2   r   )r   r8   r   r   r   r.   replace_tagsj   s    r:   z<!--.*?(?:-->|$)c                 C   s   t | |}td|S )zRemove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    'test  whatever'
    >>>

    r   )r   _REMOVECOMMENTS_REr2   )r   r   utextr   r   r.   remove_comments   s    

r=   )r   
which_onesr   r   r   c                    s   r rt ddd D dd  D  ttd fddtt tdfd	d
}d}t|tjtjB }||t	| |S )a;  Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    '<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    '<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    ValueError: Cannot use both which_ones and keep
    >>>

    z#Cannot use both which_ones and keepc                 S   s   h | ]}|  qS r   r#   .0tagr   r   r.   	<setcomp>       zremove_tags.<locals>.<setcomp>c                 S   s   h | ]}|  qS r   r?   r@   r   r   r.   rC      rD   )rB   r   c                    s    |   } r| v S |  vS d S r4   r?   )rB   )r   r>   r   r.   will_remove   s    z remove_tags.<locals>.will_remover   c                    s    |  d} |rdS |  dS )N   r   r   )r$   )r   rB   )rE   r   r.   
remove_tag   s    
zremove_tags.<locals>.remove_tagz</?([^ >/]+).*?>)
r(   r0   r5   r   recompileDOTALL
IGNORECASEr2   r   )r   r>   r   r   rG   regexretagsr   )r   r>   rE   r.   remove_tags   s    1rN   )r   r>   r   r   c                 C   sF   t | |}|rBddd |D }t|tjtjB }|d|}|S )a  Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    '<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    |c              	   S   s$   g | ]}d | d| d| dqS )<z\b.*?</z>|<z\s*/>r   r@   r   r   r.   
<listcomp>   rD   z,remove_tags_with_content.<locals>.<listcomp>r   )r   joinrH   rI   rJ   rK   r2   )r   r>   r   r<   tagsrM   r   r   r.   remove_tags_with_content   s    
rT   
	)r   r>   
replace_byr   r   c                 C   s*   t | |}|D ]}||t ||}q|S )a$  Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\n``, ``\t``, ``\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    )r   replace)r   r>   rY   r   r<   Zecr   r   r.   replace_escape_chars   s    
r[   c                 C   st   t tt  ttt tt  f  ddd}t| |}d}||tD ]0}t|t r`|t|||d7 }q>||	d7 }q>|S )a`  
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    )txtpatternr   c                 s   sL   d}| | D ]*}|d\}}| || V  |V  |}q| |d  V  d S )Nr   rF   )finditerspan)r\   r]   offsetmatchZmatch_sZmatch_er   r   r.   _get_fragments  s    z&unquote_markup.<locals>._get_fragmentsr   r-   Zcdata_d)
r0   r   r   r	   r   r   	_cdata_re
isinstancer3   r$   )r   r   r   r   rb   r<   Zret_textfragmentr   r   r.   unquote_markup  s    


rf   )r   baseurlr   r   c                 C   sB   t | |d}t|}|r6tt|t|d|dS t|S dS )zReturn the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    )r   rF   N)r=   _baseurl_rer6   r
   r   r$   )r   rg   r   r<   r   r   r   r.   get_base_url2  s    

ri   scriptnoscript)r   rg   r   ignore_tagsr   c                 C   s   zt | |}W n ty*   t|   Y n0 t||}tt|}t|pTt|}|rt	|
d}t|
dd|}t||}||fS dS dS )aY  Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    r"   urlz "')NNN)r   UnicodeDecodeErrorprintrT   r=   r3   _meta_refresh_rer6   _meta_refresh_re2floatr$   r   stripr
   )r   rg   r   rm   r<   r   intervalrn   r   r   r.   get_meta_refreshF  s    

rv   )r   r   c                 C   s
   |  tS )a  
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    )rt   HTML5_WHITESPACE)r   r   r   r.   strip_html5_whitespacef  s    rx   )r   Tr   )N)r   N)N)r   r   N)r   N)rU   r   N)r   TN)r   r   )r   r   rj   ).__doc__rH   html.entitiesr   typingr   r   r   r   r   r   r	   urllib.parser
   Z
w3lib.utilr   Z	w3lib.urlr   Zw3lib._typesr   rI   rK   r1   rJ   r9   Irh   rq   rr   rc   rw   r0   r5   r3   r7   r:   r;   r=   rN   rT   r[   rf   ri   rs   rv   rx   r   r   r   r.   <module>   s   $

   B   I       +     