
    dh=                         S SK r S SKJrJrJrJrJrJrJr  S SK	J
r
  S SKJr  S SKJr  S SKJr  S SKJr   " S S	\5      rg)
    N)AnyAsyncIteratorIteratorListOptionalSetUnion)urlparse)BeautifulSoup)Document)
BaseLoader)WebBaseLoaderc                      \ rS rSrSr     S#SSS.S\S\S\\   S\S	\S
\S\\   S\\\      4S jjjr	S\S\4S jr
 S$S\\   S\S\S\4S jjrS\\\\   4   S\4S jrS\S\4S jrS\S\\   4S jr S%S\S\\   S\\   S\\   4S jjr S%S\S\S\\   S\\   S\\   4
S jjrS\\   4S jrS\\   4S jr S%S\S\\   S\\   4S  jjrS\S\\   4S! jrS"rg)&GitbookLoader   a  Load `GitBook` data.

1. load from either a single page, or
2. load all (relative) paths in the sitemap, handling nested sitemap indexes.

When `load_all_paths=True`, the loader parses XML sitemaps and requires the
`lxml` package to be installed (`pip install lxml`).
N)sitemap_urlallowed_domainsweb_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressr   r   c                   U=(       d    UU l         U R                   R                  S5      (       a  U R                   SS U l         Xl        X l        X@l        XPl        X`l        Xl        U R                  c$  [        U5      R                  n	U	(       a  U	1U l        U(       a  U=(       d    U R                    S3U l
        OXl
        U R                  U R                  5      (       d%  [        SU R                   SU R                   35      eg)a  Initialize with web page and whether to load all paths.

Args:
    web_page: The web page to load or the starting point from where
        relative paths are discovered.
    load_all_paths: If set to True, all relative paths in the navbar
        are loaded instead of only `web_page`. Requires `lxml` package.
    base_url: If `load_all_paths` is True, the relative paths are
        appended to this base url. Defaults to `web_page`.
    content_selector: The CSS selector for the content to load.
        Defaults to "main".
    continue_on_failure: whether to continue loading the sitemap if an error
        occurs loading a url, emitting a warning instead of raising an
        exception. Setting this to True makes the loader more robust, but also
        may result in missing data. Default: False
    show_progress: whether to show a progress bar while loading. Default: True
    sitemap_url: Custom sitemap URL to use when load_all_paths is True.
        Defaults to "{base_url}/sitemap.xml".
    allowed_domains: Optional set of allowed domains to fetch from.
        If None (default), the loader will restrict crawling to the domain
        of the `web_page` URL to prevent potential SSRF vulnerabilities.
        Provide an explicit set (e.g., {"example.com", "docs.example.com"})
        to allow crawling across multiple domains. Use with caution in
        server environments where users might control the input URLs.
/Nz/sitemap.xmlz
Domain in z% is not in the allowed domains list: )r   endswithr   r   r   r   r   r   r
   netloc	start_url_is_url_allowed
ValueError)
selfr   r   r   r   r   r   r   r   initial_domains
             d/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/gitbook.py__init__GitbookLoader.__init__   s    J !,H==!!#&& MM#2.DM , 0#6 *. '%h/66N(6'7$ (Jt}}o\,JDN%N ##DNN33T^^,,Q''(*  4    urlreturnc                     U R                   c  g [        U5      nUR                  S;  a  gUR                  (       d  gUR                  U R                   ;   $ ! [         a     gf = f)z0Check if a URL has an allowed scheme and domain.F)httphttps)r   r
   schemer   	Exception)r"   r(   parseds      r$   r    GitbookLoader._is_url_allowedY   sf    
 '	c]F }}$55 ====D$8$888 		s   A A A 
A$#A$url_listurl_typec                     U R                  U5      (       a  UR                  U5        g[        R                  " SU SU 35        g)a
  Safely add a URL to a list if it's from an allowed domain.

Args:
    url_list: The list to add the URL to
    url: The URL to add
    url_type: Type of URL for warning message (e.g., "sitemap", "content")

Returns:
    bool: True if URL was added, False if skipped
TzSkipping disallowed z URL: F)r    appendwarningswarn)r"   r1   r(   r2   s       r$   _safe_add_urlGitbookLoader._safe_add_urlp   sA     $$OOC MM0
&FGr'   url_or_urlsc                 @    [        UU R                  U R                  S9$ )z|Create a new WebBaseLoader instance for the given URL(s).

This ensures each operation gets its own isolated WebBaseLoader.
)web_pathr   r   )r   r   r   )r"   r9   s     r$   _create_web_loader GitbookLoader._create_web_loader   s&    
   $ 8 8,,
 	
r'   soupc                 (    UR                  S5      SL$ )z+Check if the soup contains a sitemap index.sitemapindexN)find)r"   r>   s     r$   _is_sitemap_indexGitbookLoader._is_sitemap_index   s    yy(44r'   c                     UR                  S5      n/ nU HL  nUR                  S5      nU(       d  M  UR                  (       d  M0  U R                  X5R                  S5        MN     U$ )z*Extract sitemap URLs from a sitemap index.sitemaploc)find_allrA   textr7   )r"   r>   sitemap_tagsurlsrE   rF   s         r$   _extract_sitemap_urls#GitbookLoader._extract_sitemap_urls   sU    }}Y/#G,,u%Cssxxx""49= $ r'   processed_urls
web_loaderc                 P   Uc  U R                  U R                  5      nU R                  U5      (       a  U R                  U5      n/ nU H  nXb;   a  [        R
                  " SU 35        M#  UR                  U5         UR                  nU/Ul        UR                  SS9nXsl        U R                  XU5      n	UR                  U	5        M     U$ U R                  U5      $ ! [         a9  n
U R                  (       a"  [        R
                  " SU SU
 35         Sn
A
M  e Sn
A
ff = f)a  Process a sitemap, handling both direct content URLs and sitemap indexes.

Args:
    soup: The BeautifulSoup object of the sitemap
    processed_urls: Set of already processed URLs to avoid cycles
    web_loader: WebBaseLoader instance to reuse for all requests,
        created if None
Nz(Skipping already processed sitemap URL: lxml-xmlparserError processing sitemap : )r<   r   rB   rK   r5   r6   add	web_pathsscrape_process_sitemapextendr.   r   
_get_paths)r"   r>   rM   rN   sitemap_urlsall_content_urlsr   original_web_pathssitemap_soupcontent_urlses              r$   rX   GitbookLoader._process_sitemap   s5    00@J !!$''55d;L!+0MMB;-P "";/)3)=)=&,7=J( $.#4#4J#4#GL ,>( $(#8#8$j$L %++L91  ,> $# ??4(( ! // (A+bQRPS&TUU	s   ?AC""
D%,-D D  D%c                   #    Uc  U R                  U R                  5      nU R                  U5      (       a  U R                  U5      n/ nU Vs/ sH  owU;  d  M
  UPM     nnU(       d  / $ UR                  n	Xl        UR                  USS9I Sh  vN n
Xl        [        X5       HB  u  pUR                  U5         U R                  XX45      I Sh  vN nUR                  U5        MD     U$ U R                  U5      $ s  snf  Ns N3! [         a9  nU R                  (       a"  [        R                  " SU SU 35         SnAM  e SnAff = f7f)a&  Async version of _process_sitemap.

Args:
    soup: The BeautifulSoup object of the sitemap
    base_url: The base URL for relative paths
    processed_urls: Set of already processed URLs to avoid cycles
    web_loader: WebBaseLoader instance to reuse for all requests,
        created if None
NrP   rQ   rS   rT   )r<   r   rB   rK   rV   ascrape_allziprU   _aprocess_sitemaprY   r.   r   r5   r6   rZ   )r"   r>   r   rM   rN   r[   r\   r(   new_urlsr]   soupsr   r^   r_   r`   s                  r$   re   GitbookLoader._aprocess_sitemap   sZ    " 00@J !!$''55d;L! (4Q|.7P|HQ	 ",!5!5#+  %00*0MME $6 -0-A)"";/
)-)?)?$* $L %++L9 .B $# ??4((C R N$ ! // (A+bQRPS&TUU	sf   AEC?C? 0ED+E=DDD(ED
E-E?EEEEc              #     #    U R                   (       dS  U R                  U R                  5      nUR                  5       nU R	                  X R                  5      nU(       a  Uv   ggU R                  U R
                  5      nUR                  SS9n[        5       nU R                  XE5      nU(       d4  U R                  (       a#  [        R                  " SU R
                   35        / nU H  nU R                  XxS5        M     U(       d  gU R                  U5      n	U	R                  U5      n
[        X5       H#  u  pHU R	                  XH5      nU(       d  M  Uv   M%     g7f)zDFetch text from one single GitBook page or recursively from sitemap.rP   rQ   $No content URLs found in sitemap at contentN)r   r<   r   rW   _get_documentr   setrX   r   r5   r6   r7   
scrape_allrd   )r"   temp_loaderr>   doc	soup_inforM   relative_pathsrJ   r(   content_loader
soup_infoss              r$   	lazy_loadGitbookLoader.lazy_load  s/    ""11$--@K%%'D$$T==9C	  11$..AK#****=I (+uN!229MN!d&8&8 DT^^DTUV !D%""4i8 &  "44T:N (2248J"%j"7	((83I #8s   EE!	E!c                n  #    U R                   (       dm  U R                  U R                  5      nUR                  U R                  /5      I Sh  vN nUS   nU R	                  X0R                  5      nU(       a  U7v   ggU R                  U R
                  5      nUR                  U R
                  /SS9I Sh  vN nUS   n[        5       nU R                  X0R                  U5      I Sh  vN nU(       d4  U R                  (       a#  [        R                  " SU R
                   35        / nU H  nU R                  XxS5        M     U(       d  gU R                  U5      n	U	R                  U5      I Sh  vN n
[        X5       H   u  p8U R	                  X85      nUc  M  U7v   M"     g GN^ N N N;7f)z/Asynchronously fetch text from GitBook page(s).Nr   rP   rQ   rj   rk   )r   r<   r   rc   rl   r   rm   re   r   r   r5   r6   r7   rd   )r"   ro   rg   rq   rp   rM   rr   rJ   r(   rs   rt   	maybe_docs               r$   
alazy_loadGitbookLoader.alazy_load?  s    ""11$--@K%114==/BBEaI$$Y>C	  11$..AK%114>>2B:1VVEaI (+uN#'#9#9==.$ N "d&8&8 DT^^DTUV !D%""4i8 &  "44T:N  .99$??J"%j"7	 ..y>	(#O #8I C W
( @sP   AF5F,A+F5:F/;2F5-F1.B	F57F38&F5"F5/F51F53F5
custom_urlc                    UR                  U R                  5      nU(       d  gUR                  SS9R                  5       nUR                  S5      nU(       a  UR                  OSnU=(       d    U R
                  US.n[        XGS9$ )z,Fetch content from page and return Document.N
)	separatorh1 )sourcetitle)page_contentmetadata)rA   r   get_textstriprH   r   r   )r"   r>   r{   page_content_rawrk   title_if_existsr   r   s           r$   rl   GitbookLoader._get_documentm  sx      99T%:%:;"++d+;AAC*//5(7$$R(9DMMEJW@@r'   c                     / nUR                  S5       H1  nUR                  (       d  M  UR                  UR                  5        M3     U$ )zFetch all URLs in the sitemap.rF   )rG   rH   r4   )r"   r>   rJ   rF   s       r$   rZ   GitbookLoader._get_pathsz  s<    =='Cxxx CHH%	 (
 r'   )r   r   r   r   r   r   r   r   )FNmainFT)URL)N)__name__
__module____qualname____firstlineno____doc__strboolr   r   r%   r    r   r7   r	   r   r<   r   rB   rK   rX   re   r   r   ru   r   ry   r   rl   rZ   __static_attributes__ r'   r$   r   r      s     %"& &$)"A &*.2AA A 3-	A
 A "A A c]A "#c(+AF3 4 0 >CS	(+7:	(	
eCcN.C 	
 	
5m 5 5- DI  /3	9)9) C9) ]+	9)
 
c9)@ /3;);) ;) C	;)
 ]+;) 
c;)z(8H- (T,$-"9 ,$^ 6:AA%-c]A	(	As tCy r'   r   )r5   typingr   r   r   r   r   r   r	   urllib.parser
   bs4r   langchain_core.documentsr   )langchain_community.document_loaders.baser   -langchain_community.document_loaders.web_baser   r   r   r'   r$   <module>r      s.     K K K !  - @ GvJ vr'   