
    dh;                         S r SSKrSSKrSSKrSSKJrJrJrJrJ	r	J
r
JrJr  SSKrSSKrSSKJr  SSKJr  SSKJr  SSKJr  \R.                  " \5      r\" 5       SS	S
SSSS.rS\S\S\4S jr " S S\5      rg)zWeb base loader class.    N)AnyAsyncIteratorDictIteratorListOptionalSequenceUnion)
deprecated)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageRefererDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                    SU0nU R                  S5      =n(       a  UR                  5       US'   U R                  SSS0S9=n(       a  UR                  SS5      US'   U R                  S	5      =n(       a  UR                  S
S5      US'   U$ )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r    s         e/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/web_base.py_build_metadatar(      s    #H		'""u"!NN,iiv}.EiFF{F"-//)=T"Uyy  t #xx0DEO    c            &          \ rS rSrSr               S4SSS.S\\\\   4   S\\	   S	\
S
\\	   S\
S\
S\\   S\\   S\S\S\\\\4      S\
S\\\\4      S\\\\4      S\S\
S\
SS4$S jjjr\S\4S j5       r S5S\S\S\S\S\4
S jjrS\S \R(                  S\4S! jrS"\\   S\4S# jr\S$\SS4S% j5       r S6S&\S"\\   S$\\S4   S\\   4S' jjrS6S"\\   S$\\S4   S\\   4S( jjr S6S"\\   S$\\S4   S\\   4S) jjr  S7S\S$\\S4   S\\	   S\4S* jjrS6S$\\S4   S\4S+ jjrS\\    4S, jr!S\"\    4S- jr#\$" S.S/S0S19S\\    4S2 j5       r%S3r&g)8WebBaseLoader*   aI  
WebBaseLoader document loader integration

Setup:
    Install ``langchain_community``.

    .. code-block:: bash

        pip install -U langchain_community

Instantiate:
    .. code-block:: python

        from langchain_community.document_loaders import WebBaseLoader

        loader = WebBaseLoader(
            web_path = "https://www.espn.com/"
            # header_template = None,
            # verify_ssl = True,
            # proxies = None,
            # continue_on_failure = False,
            # autoset_encoding = True,
            # encoding = None,
            # web_paths = (),
            # requests_per_second = 2,
            # default_parser = "html.parser",
            # requests_kwargs = None,
            # raise_for_status = False,
            # bs_get_text_kwargs = None,
            # bs_kwargs = None,
            # session = None,
            # show_progress = True,
            # trust_env = False,
        )

Lazy load:
    .. code-block:: python

        docs = []
        for doc in loader.lazy_load():
            docs.append(doc)
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        ESPN - Serving Sports Fans. Anytime. Anywhere.

        {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}


Async load:
    .. code-block:: python

        docs = []
        async for doc in loader.alazy_load():
            docs.append(doc)
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        ESPN - Serving Sports Fans. Anytime. Anywhere.

        {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

.. versionchanged:: 0.3.14

    Deprecated ``aload`` (which was not async) and implemented a native async
    ``alazy_load``. Expand below for more details.

    .. dropdown:: How to update ``aload``

        Instead of using ``aload``, you can use ``load`` for synchronous loading or
        ``alazy_load`` for asynchronous lazy loading.

        Example using ``load`` (synchronous):

        .. code-block:: python

            docs: List[Document] = loader.load()

        Example using ``alazy_load`` (asynchronous):

        .. code-block:: python

            docs: List[Document] = []
            async for doc in loader.alazy_load():
                docs.append(doc)

        This is in preparation for accommodating an asynchronous ``aload`` in the
        future:

        .. code-block:: python

            docs: List[Document] = await loader.aload()

NTF)show_progress	trust_envweb_pathheader_template
verify_sslproxiescontinue_on_failureautoset_encodingencoding	web_pathsrequests_per_seconddefault_parserrequests_kwargsraise_for_statusbs_get_text_kwargs	bs_kwargssessionr-   r.   r   c                   U(       a  U(       a  [        S5      eU(       a  [        U5      U l        Oh[        U[        5      (       a	  U/U l        OJ[        U[
        5      (       a  [        U5      U l        O$[        S[        U5       S[        U5       S35      eXl        Xl	        U=(       d    0 U l
        Xl        UU l        U=(       d    0 U l        U=(       d    0 U l        U(       a  Xl        O[         R"                  " 5       nU=(       d    [$        R'                  5       nUR)                  S5      (       d   SSKJn  U" 5       R.                  US'   [7        U5      Ul        X?l        U(       a  UR<                  R?                  U5        Xl        XPl         X`l!        Xpl"        UU l#        g	! [0         a    [2        R5                  S5         Nyf = f)
ax  Initialize loader.

Args:
    web_paths: Web paths to load from.
    requests_per_second: Max number of concurrent requests to make.
    default_parser: Default parser to use for BeautifulSoup.
    requests_kwargs: kwargs for requests
    raise_for_status: Raise an exception if http status code denotes an error.
    bs_get_text_kwargs: kwargs for beatifulsoup4 get_text
    bs_kwargs: kwargs for beatifulsoup4 web page parsing
    show_progress: Show progress bar when loading pages.
    trust_env: set to True if using proxy to make web requests, for example
        using http(s)_proxy environment variables. Defaults to False.
zmReceived web_path and web_paths. Only one can be specified. web_path is deprecated, web_paths should be used.z+web_path must be str or Sequence[str] got (z*) or web_paths must be Sequence[str] got ()r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)$
ValueErrorlistr6   
isinstancestrr	   	TypeErrortyper7   r8   r9   r:   r-   r;   r<   r=   requestsSessiondefault_header_templatecopyr%   fake_useragentr@   randomImportErrorloggerinfodictheadersverifyr2   updater3   r4   r5   r.   )selfr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r-   r.   r@   s                      r'   __init__WebBaseLoader.__init__   s   H 	D  !)_DN#&&&ZDN(++!(^DN=d8n=M N99=i8IL  $7 ,.4" 0*"4":"b"L&&(G-O1H1M1M1OO"&&|44	84=K4F4FOL1 #?3GO'N&&w/"L#6  0 " # KK8s   :F, ,GGc                 h    [        U R                  5      S:  a  [        S5      eU R                  S   $ )N   zMultiple webpaths found.r   )lenr6   rA   )rT   s    r'   r/   WebBaseLoader.web_path   s.    t~~"788~~a  r)   r   retriescooldownbackoffc                   #    [         R                  " U R                  S9 IS h  vN n[        U5       H  n [	        U R
                  R                  U R
                  R                  R                  5       S9nU R
                  R                  (       d  SUS'   UR                  " U40 U R                  U-  D6 IS h  vN nU R                  (       a  UR                  5         UR                  5       I S h  vN sS S S 5      IS h  vN   s  sS S S 5      IS h  vN   $    S S S 5      IS h  vN   ['        S5      e GN N NJ N< N+! , IS h  vN  (       d  f       GM+  = f! [         R                   a^  n	XbS-
  :X  a  e [        R!                  SU SUS-    SU S	U	 S
3	5        ["        R$                  " X4U-  -  5      I S h  vN     S n	A	GM  S n	A	ff = f N! , IS h  vN  (       d  f       N= f7f)N)r.   )rQ   cookiesFsslrX   Error fetching z with attempt /z: z. Retrying...zretry count exceeded)aiohttpClientSessionr.   rangerP   r=   rQ   r_   get_dictrR   r%   r9   r:   textClientConnectionErrorrN   warningasynciosleeprA   )
rT   r   r[   r\   r]   r=   ikwargsresponsees
             r'   _fetchWebBaseLoader._fetch   s     ((4>>BBg7^C#' $ 4 4 $ 4 4 = = ?$F  <<..(-u&{{  $ 4 4v =   !00$557%-]]_4      CBB# CB2 /003 C 
  5  C        44 CaK'-cU. 1ugQwir!MC &mmHz,ABBBBC! CBBBs   #G4D<G4GBE#=D?
>E#5E6E
7E:E#E
E#GG4EG4G!G4,G-G4?E#EE#G4E EE E#G E##G7AGG
G	GGGG4G1 G#!G1-G4	semaphorec                   #    U IS h  vN    U R                  U5      I S h  vN sS S S 5      IS h  vN   $  N/ N N	! [         a`  nU R                  (       a/  [        R	                  SU S35         S nAS S S 5      IS h  vN    g[        R                  SU S35        UeS nAff = f! , IS h  vN  (       d  f       g = f7f)Nra   z*, skipping due to continue_on_failure=True za and aborting, use continue_on_failure=True to continue loading urls after encountering an error.)rp   	Exceptionr3   rN   ri   	exception)rT   r   rr   ro   s       r'   _fetch_with_rate_limit$WebBaseLoader._fetch_with_rate_limit  s      9![[-- 99-   ++NN)# /4 5  99   %cU +L L  99s   C9CB,?;?C=C?C
B)	*B$3B,7CBC	B$$B))B,,C2B53C?Curlsc                   #    [         R                  " U R                  5      n/ nU H9  n[         R                  " U R	                  XB5      5      nUR                  U5        M;      U R                  (       a"  SSKJn  UR                  " USSSS.6I Sh  vN $ [         R                  " U6 I Sh  vN $  N N! [         a5    [        R                  " S5        [         R                  " U6 I Sh  vN  s $ f = f7f)	z/Fetch all urls concurrently with rate limiting.r   )tqdm_asynciozFetching pagesTrX   )descasciiminintervalNz2For better logging of progress, `pip install tqdm`)rj   	Semaphorer7   ensure_futurerw   appendr-   tqdm.asyncior{   gatherrM   warningswarn)rT   ry   rr   tasksr   taskr{   s          r'   	fetch_allWebBaseLoader.fetch_all  s     %%d&>&>?	C(()D)DS)TUDLL 	0!!5)00!11   %^^U333	 4 	0MMNO ////	0sf   A"C9%.B7 B3B7 C9B7 .B5/B7 2C93B7 5B7 76C6-C0.C63C95C66C9parserc                 V    / SQnX;  a   [        SSR                  U5      -   S-   5      eg)z#Check that parser is valid for bs4.)html.parserlxmlxmlzlxml-xmlhtml5libz`parser` must be one of z, .N)rA   join)r   valid_parserss     r'   _check_parserWebBaseLoader._check_parser.  s8     O&*TYY}-EEK  'r)   resultsc                     SSK Jn  / n[        U5       Hd  u  pgX&   nUc6  UR                  S5      (       a  SnOU R                  nU R                  U5        UR                  U" Xs40 U R                  D65        Mf     U$ )z0Unpack fetch results into BeautifulSoup objects.r   BeautifulSoup.xmlr   )bs4r   	enumerateendswithr8   r   r   r<   )	rT   r   ry   r   r   final_resultsrl   resultr   s	            r'   _unpack_fetch_results#WebBaseLoader._unpack_fetch_results7  sz     	&"7+IA'C~<<''"F!00F""6*  v!P!PQ , r)   c                 l    [         R                  " U R                  U5      5      nU R                  X1US9$ )z2Fetch all urls, then return soups for all results.r   )rj   runr   r   rT   ry   r   r   s       r'   
scrape_allWebBaseLoader.scrape_allI  s/    ++dnnT23))')GGr)   c                 `   #    U R                  U5      I Sh  vN nU R                  X1US9$  N7f)z8Async fetch all urls, then return soups for all results.Nr   )r   r   r   s       r'   ascrape_allWebBaseLoader.ascrape_allN  s3      t,,))')GG -s   .,.c                    SSK Jn  Uc%  UR                  S5      (       a  SnOU R                  nU R	                  U5        U R
                  R                  " U40 U R                  D6nU R                  (       a  UR                  5         U R                  b  U R                  Ul	        O"U R                  (       a  UR                  Ul	        U" UR                  U40 U=(       d    0 D6$ )Nr   r   r   r   )r   r   r   r8   r   r=   r%   r9   r:   r5   r4   apparent_encodingrg   )rT   r   r   r<   r   html_docs         r'   _scrapeWebBaseLoader._scrapeU  s     	&>||F##,,6"<<##C@4+?+?@  %%'==$ $H"" ( : :HX]]FHyBHHr)   c                 J    U R                  U R                  XR                  S9$ )z?Scrape data from webpage and return it in BeautifulSoup format.)r   r<   )r   r/   r<   )rT   r   s     r'   scrapeWebBaseLoader.scrapeo  s     ||DMM&NN|SSr)   c              #      #    U R                    HN  nU R                  XR                  S9nUR                  " S0 U R                  D6n[        X!5      n[        X4S9v   MP     g7f)z+Lazy load text from the url(s) in web_path.)r<   page_contentr&   N )r6   r   r<   r$   r;   r(   r   )rT   pathr   rg   r&   s        r'   	lazy_loadWebBaseLoader.lazy_loadt  sS     NND<<<?D==;4#:#:;D&t2H@@	 #s   A A"c                   #    U R                  U R                  5      I Sh  vN n[        U R                  U5       H8  u  p#UR                  " S0 U R                  D6n[        X25      n[        XES97v   M:     g NW7f)z1Async lazy load text from the url(s) in web_path.Nr   r   )r   r6   zipr$   r;   r(   r   )rT   r   r   r   rg   r&   s         r'   
alazy_loadWebBaseLoader.alazy_load|  sd     ((88dnng6JD==;4#:#:;D&t2H@@ 7 9s   A;A9AA;z0.3.14z1.0zSee API reference for updated usage: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html)sinceremovalmessagec                     U R                  U R                  5      n/ n[        U R                  U5       HD  u  p4UR                  " S0 U R                  D6n[        XC5      nUR                  [        XVS95        MF     U$ )z9Load text from the urls in web_path async into Documents.r   r   )r   r6   r   r$   r;   r(   r   r   )rT   r   docsr   r   rg   r&   s          r'   aloadWebBaseLoader.aload  sk     //$..1dnng6JD==;4#:#:;D&t2HKKdFG 7
 r)   )r4   r;   r<   r3   r8   r5   r:   r9   r7   r=   r-   r.   r6   )rt   NTNFTNr      r   NFNNN)   r   g      ?)N)NN)'__name__
__module____qualname____firstlineno____doc__r
   rD   r	   r   rP   boolintr   r   rU   propertyr/   floatrp   rj   r   rw   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   __static_attributes__r   r)   r'   r+   r+   *   s_   aJ /1*."&$)!%"&#%#$+48!&7;.2!S#$ #'S#Xc]*+S# "$S# 	S#
 $S# "S# S# 3-S# C=S# !S# S# "$sCx.1S# S# %T#s(^4S# DcN+S#  !S#$ %S#& 'S#( 
)S#j !# ! ! OR11!$1471FK1	1<#*#4#4	&0DI 0# 0( c d   IM"&s)5:395E	c$HtCy H%T	2B HdSVi H ;?HIH',S$Y'7H	cH $($(	II c4i I D>	I
 
I4TU39- T T
A8H- AA-"9 A U	
tH~ 

r)   r+   )r   rj   loggingr   typingr   r   r   r   r   r   r	   r
   rc   rG   langchain_core._apir   langchain_core.documentsr   )langchain_community.document_loaders.baser   $langchain_community.utils.user_agentr   	getLoggerr   rN   rI   rD   rP   r(   r+   r   r)   r'   <module>r      s        V V V   * - @ ?			8	$ !"'(!$	 	# 	C 	D 	lJ lr)   