
    dh"              	           S SK r S SKrS SKJrJrJrJrJrJrJ	r	J
r
Jr  S SKJr  S SKJr  S SKJr  S\S\4S jrS	\S
\S\4S jrS\S\S\\	\   SS4   4S jrS\S\\\4   4S jr " S S\5      rg)    N)	AnyCallableDict	GeneratorIterableIteratorListOptionalTuple)urlparse)Document)WebBaseLoadercontentreturnc                 4    [        U R                  5       5      $ N)strget_text)r   s    d/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/sitemap.py_default_parsing_functionr      s    w!""    meta_contentc                     SU S   0U E$ )Nsourceloc )r   r   s     r   _default_meta_functionr      s    d5k*T**r   iterablesizec              #      #    [        U 5      n[        [        R                  " X!5      5      =n(       a-  Uv   [        [        R                  " X!5      5      =n(       a  M,  g g 7fr   )iterlist	itertoolsislice)r   r    ititems       r   _batch_blockr(      sN     	hBy''12
2$
2
 y''12
2$
2
2s   AA" A"urlc                 H    [        U 5      nUR                  UR                  4$ )zExtract the scheme + domain from a given URL.

Args:
    url (str): The input URL.

Returns:
    return a 2-tuple of scheme and domain
)r   schemenetloc)r)   
parsed_uris     r   _extract_scheme_and_domainr.   #   s$     #Jj////r   c                      ^  \ rS rSrSr         SS\S\\\      S\\   S\\	   S\	S	\\   S
\
S\
S\
S\	S\4U 4S jjjrSS.S\S\	S\\   4S jjrS\\   4S jrSrU =r$ )SitemapLoader0   a  Load a sitemap and its URLs.

**Security Note**: This loader can be used to load all URLs specified in a sitemap.
    If a malicious actor gets access to the sitemap, they could force
    the server to load URLs from other domains by modifying the sitemap.
    This could lead to server-side request forgery (SSRF) attacks; e.g.,
    with the attacker forcing the server to load URLs from internal
    service endpoints that are not publicly accessible. While the attacker
    may not immediately gain access to this data, this data could leak
    into downstream systems (e.g., data loader is used to load data for indexing).

    This loader is a crawler and web crawlers should generally NOT be deployed
    with network access to any internal servers.

    Control access to who can submit crawling requests and what network access
    the crawler has.

    By default, the loader will only load URLs from the same domain as the sitemap
    if the site map is not a local file. This can be disabled by setting
    restrict_to_same_domain to False (not recommended).

    If the site map is a local file, no such risk mitigation is applied by default.

    Use the filter URLs argument to limit which URLs can be loaded.

    See https://python.langchain.com/docs/security
r   web_pathfilter_urlsparsing_function	blocksizeblocknummeta_functionis_localcontinue_on_failurerestrict_to_same_domain	max_depthkwargsc                 V  > Ub  US:  a  [        S5      eUS:  a  [        S5      e SSKn[        TU ]  " SSU/0UD6  X l        Xl        U=(       d    [        U l        U=(       d    [        U l
        X@l        XPl        Xpl        Xl        Xl        g! [         a    [        S5      ef = f)	a  Initialize with webpage path and optional filter URLs.

Args:
    web_path: url of the sitemap. can also be a local path
    filter_urls: a list of regexes. If specified, only
        URLS that match one of the filter URLs will be loaded.
        *WARNING* The filter URLs are interpreted as regular expressions.
        Remember to escape special characters if you do not want them to be
        interpreted as regular expression syntax. For example, `.` appears
        frequently in URLs and should be escaped if you want to match a literal
        `.` rather than any character.
        restrict_to_same_domain takes precedence over filter_urls when
        restrict_to_same_domain is True and the sitemap is not a local file.
    parsing_function: Function to parse bs4.Soup output
    blocksize: number of sitemap locations per block
    blocknum: the number of the block that should be loaded - zero indexed.
        Default: 0
    meta_function: Function to parse bs4.Soup output for metadata
        remember when setting this method to also copy metadata["loc"]
        to metadata["source"] if you are using this field
    is_local: whether the sitemap is a local file. Default: False
    continue_on_failure: whether to continue loading the sitemap if an error
        occurs loading a url, emitting a warning instead of raising an
        exception. Setting this to True makes the loader more robust, but also
        may result in missing data. Default: False
    restrict_to_same_domain: whether to restrict loading to URLs to the same
        domain as the sitemap. Attention: This is only applied if the sitemap
        is not a local file!
    max_depth: maximum depth to follow sitemap links. Default: 10
N   z&Sitemap blocksize should be at least 1r   z(Sitemap blocknum can not be lower then 0zAlxml package not found, please install it with `pip install lxml`	web_pathsr   )
ValueErrorlxmlImportErrorsuper__init__allow_url_patternsr:   r   r4   r   r7   r5   r6   r8   r9   r;   )selfr2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   rA   	__class__s                r   rD   SitemapLoader.__init__M   s    Z  Y]EFFa<GHH	 	8H:88 #.'>$ 0 M4M*D.D"  #6 "'  	S 	s   B B(depthsouprJ   r   c          	      Z  ^
 X R                   :  a  / $ / nUR                  S5       GH  nUR                  S5      nU(       d  M  UR                  R	                  5       m
U R
                  (       a5  U R                  (       d$  [        T
5      [        U R                  5      :w  a  M~  U R                  (       a&  [        U
4S jU R                   5       5      (       d  M  UR                  S Vs0 sH7  nUR                  U5      =n(       d  M  XgR                  R	                  5       _M9     sn5        GM     UR                  S5       H^  nUR                  S5      nU(       d  M  U R                  UR                  /S5      S   n	UR                  U R                  XS-   S	95        M`     U$ s  snf )
zParse sitemap xml and load into a list of dicts.

Args:
    soup: BeautifulSoup object.
    depth: current depth of the sitemap. Default: 0

Returns:
    List of dicts.
r)   r   c              3   P   >#    U H  n[         R                  " UT5      v   M     g 7fr   )rematch).0regexp_patternloc_texts     r   	<genexpr>.SitemapLoader.parse_sitemap.<locals>.<genexpr>   s&      3&=N 22&=s   #&)r   lastmod
changefreqprioritysitemapxmlr   r>   rI   )r;   find_allfindtextstripr:   r8   r.   r2   rE   anyappend
scrape_allextendparse_sitemap)rF   rK   rJ   elsr)   r   tagproprX   
soup_childrR   s             @r   rb   SitemapLoader.parse_sitemap   si    NN"I=='C((5/C xx~~'H++DMM-h7;UMM<  &&s 3&*&=&=3 0 0 JJ  LK #-- +C**K) (8 }}Y/G,,u%C#((U;A>JJJt))*AI)FG 0 
s   *F(
	F(
c              #     #    U R                   (       a-   SSKn[        U R                  5      nUR                  US5      nOU R                  U R                  SS9nU R                  U5      nU R                  bU  [        [        X@R                  5      5      n[        U5      nUS-
  U R                  :  a  [        S5      eXPR                     nU R                  U Vs/ sH  nSU;   d  M  US   R                  5       PM      sn5      n[!        U5       H2  u  p[#        U R%                  U
5      U R'                  XI   U
5      S	9v   M4     g! [         a    [        S5      ef = fs  snf 7f)
zLoad sitemap.r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`rY   )parserr>   zBSelected sitemap does not contain enough blocks for given blocknumr   )page_contentmetadata)r8   bs4rB   openr2   BeautifulSoup_scraperb   r5   r#   r(   lenr6   r@   r`   r]   	enumerater   r4   r7   )rF   rl   fprK   rc   elblocks
blockcountelresultsiresults              r   	lazy_loadSitemapLoader.lazy_load   sG    == dmm$B$$R/D<<e<<D  &>>%Lnn=>HXJA~- X  }}-//s"Rserk#42e9??#4s"RS"7+IA!226:++CFF;  ,1  !9 , #Ss/   E(E
 CE(	E#*E#A	E(
E  E()	rE   r6   r5   r9   r8   r;   r7   r4   r:   )	NNNr   NFFT
   )__name__
__module____qualname____firstlineno____doc__r   r
   r	   r   intboolr   rD   dictrb   r   r   ry   __static_attributes____classcell__)rG   s   @r   r0   r0   0   s    > ,0/3#',0$)(,H#H# d3i(H# #8,	H#
 C=H# H#  )H# H# "H# "&H# H# H# H#T 89 2# 2 2T$Z 2h!8H- ! !r   r0   )r$   rN   typingr   r   r   r   r   r   r	   r
   r   urllib.parser   langchain_core.documentsr   -langchain_community.document_loaders.web_baser   r   r   r   r   r   r(   r.   r0   r   r   r   <module>r      s     	
 
 
 " - G#s #s #+ + + +8 3 9T$Zt=S3T 
0C 
0E#s(O 
0|M |r   