
    dh&                        S r SSKrSSKrSSKJrJr  SSKJrJrJ	r	J
r
JrJrJr  SSKJr  SSKJr  \(       a  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  \R4                  " \5      r " S S\5      r " S S\5      r " S S\5      rg)zQLoader that uses Playwright to load a page, then uses unstructured to parse html.    N)ABCabstractmethod)TYPE_CHECKINGAsyncIteratorDictIteratorListOptionalUnion)Document)
BaseLoader)Browser)Page)Response)r   r   r   c            	       `    \ rS rSrSr\SSSSSSS	\4S
 j5       r\SSSSSSS	\4S j5       rSr	g)PlaywrightEvaluator   zAbstract base class for all evaluators.

Each evaluator should take a page, a browser instance, and a response
object, process the page as necessary, and return the resulting text.
pager   browserr   responser   returnc                     g)zSynchronously process the page and return the resulting text.

Args:
    page: The page to process.
    browser: The browser instance.
    response: The response from page.goto().

Returns:
    text: The text content of the page.
N selfr   r   r   s       k/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/url_playwright.pyevaluatePlaywrightEvaluator.evaluate   s     	    	AsyncPageAsyncBrowserAsyncResponsec                    #    g7f)zAsynchronously process the page and return the resulting text.

Args:
    page: The page to process.
    browser: The browser instance.
    response: The response from page.goto().

Returns:
    text: The text content of the page.
Nr   r   s       r   evaluate_async"PlaywrightEvaluator.evaluate_async+   s
      	s   r   N)
__name__
__module____qualname____firstlineno____doc__r   strr   r$   __static_attributes__r   r   r   r   r      sl     V i : RU   *8DS	 r   r   c                   j    \ rS rSrSrSS\\\      4S jjrSSSS	S
SS\4S jr	SSSSS
SS\4S jr
Srg)UnstructuredHtmlEvaluator<   z@Evaluate the page HTML content using the `unstructured` library.Nremove_selectorsc                 L     SSK nXl        g! [         a    [        S5      ef = f)z%Initialize UnstructuredHtmlEvaluator.r   NzQunstructured package not found, please install it with `pip install unstructured`)unstructuredImportErrorr0   )r   r0   r2   s      r   __init__"UnstructuredHtmlEvaluator.__init__?   s5    	 !1  	- 	s    #r   r   r   r   r   r   r   c                 p   SSK Jn  U R                  =(       d    /  HS  nUR                  U5      R	                  5       nU H+  nUR                  5       (       d  M  UR                  S5        M-     MU     UR                  5       nU" US9nSR                  U V	s/ sH  n	[        U	5      PM     sn	5      $ s  sn	f )z3Synchronously process the HTML content of the page.r   partition_htmlelement => element.remove()text


unstructured.partition.htmlr8   r0   locatorall
is_visibler   contentjoinr+   
r   r   r   r   r8   selectorelementselementpage_sourceels
             r   r   "UnstructuredHtmlEvaluator.evaluateK   s    >--33H||H-113H#%%''$$%BC $ 4 lln!{3{{h7hCGh7887s   B3r    r!   r"   c                   #    SSK Jn  U R                  =(       d    /  Hk  nUR                  U5      R	                  5       I Sh  vN nU H;  nUR                  5       I Sh  vN (       d  M"  UR                  S5      I Sh  vN   M=     Mm     UR                  5       I Sh  vN nU" US9nSR                  U V	s/ sH  n	[        U	5      PM     sn	5      $  N Nv NX N:s  sn	f 7f)z4Asynchronously process the HTML content of the page.r   r7   Nr9   r:   r<   r=   rD   s
             r   r$   (UnstructuredHtmlEvaluator.evaluate_asyncY   s      	?--33H!\\(37799H# ++---!**+HIII $ 4 !LLN*!{3{{h7hCGh788 :-I*7s`   AC"CC" C!C",C" CC" C!C":CC"C"C"C"C")r0   )N)r&   r'   r(   r)   r*   r
   r	   r+   r4   r   r$   r,   r   r   r   r.   r.   <   sa    J
1$s))< 
19V 9i 9: 9RU 999*89DS9	9r   r.   c                       \ rS rSrSr      SS\\   S\S\S\\\      S\\	   S	\\
\\4      S
\\\\R                  \   4      4S jjrS\\   4S jrS\\   4S jrS\\   4S jrSrg)PlaywrightURLLoaderj   a  Load `HTML` pages with `Playwright` and parse with `Unstructured`.

This is useful for loading pages that require javascript to render.

Attributes:
    urls (List[str]): List of URLs to load.
    continue_on_failure (bool): If True, continue loading other URLs on failure.
    headless (bool): If True, the browser will run in headless mode.
    proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
        through the specified proxy.
    browser_session (Optional[Union[str, os.PathLike[str]]]): Path to a file with
        browser session data that can be used to restore the browser session.

Example:
    .. code-block:: python

        from langchain_community.document_loaders import PlaywrightURLLoader

        urls = ["https://api.ipify.org/?format=json",]
        proxy={
            "server": "https://xx.xx.xx:15818", # https://<host>:<port>
            "username": "username",
            "password": "password"
        }
        loader = PlaywrightURLLoader(urls, proxy=proxy)
        data = loader.load()
Nurlscontinue_on_failureheadlessr0   	evaluatorproxybrowser_sessionc                      SSK nXl        X l        X0l        X`l        Xpl        U(       a  U(       a  [        S5      eU=(       d    [        U5      U l	        g! [         a    [        S5      ef = f)z%Load a list of URLs using Playwright.r   NzMplaywright package not found, please install it with `pip install playwright`z:`remove_selectors` and `evaluator` cannot be both not None)

playwrightr3   rP   rQ   rR   rT   rU   
ValueErrorr.   rS   )	r   rP   rQ   rR   r0   rS   rT   rU   rW   s	            r   r4   PlaywrightURLLoader.__init__   st    	 	#6  
.	L 
 #Q&?@P&Q%  	+ 	s   A A-r   c           	   #     #    SSK Jn  U" 5        nUR                  R                  U R                  U R
                  S9nSnU R                  (       aj  [        R                  R                  U R                  5      (       a  UR                  U R                  S9nO"[        R                  SU R                   35        Uc  UR                  5       nU R                   H  n UR                  5       nUR                  U5      nUc  [!        SU 35      eUR#                  S5        U R$                  R'                  XcU5      nUR)                  5         S	U0n	[+        XS
9v   M     UR)                  5         SSS5        g! [,         a9  n
U R.                  (       a!  [        R1                  SU SU
 35         Sn
A
M  U
eSn
A
ff = f! , (       d  f       g= f7f)zLoad the specified URLs using Playwright and create Document instances.

Returns:
    A list of Document instances with loaded content.
r   )sync_playwrightrR   rT   Nstorage_stateSession file not found: "page.goto() returned None for url loadsourcepage_contentmetadataError fetching or processing , exception: )playwright.sync_apir[   chromiumlaunchrR   rT   rU   ospathexistsnew_contextloggerwarningrP   new_pagegotorX   wait_for_load_staterS   r   closer   	ExceptionrQ   error)r   r[   pr   contexturlr   r   r;   re   es              r   	lazy_loadPlaywrightURLLoader.lazy_load   s     	8!jj''djj'QGG##77>>$"6"677%11@T@T1UGNN%=d>R>R=S#TU!--/yy "++-D#yy~H'(+McU)STT,,V4>>224(KDJJL (#H"HH !( MMOC 4 !  //;C5aSQ    5 sN   GCG A>E:G 1	G:
F=,F80G 6F88F==G  
G
Gc                 `   #    U R                  5        Vs/ s Sh  vN oPM   N
 sn$ s  snf 7f)Load the specified URLs with Playwright and create Documents asynchronously.
Use this function when in a jupyter notebook environment.

Returns:
    A list of Document instances with loaded content.
N)
alazy_load)r   docs     r   aloadPlaywrightURLLoader.aload   s%      &*__%677c777s$   .)%#
%)%).c           	       #    SSK Jn  U" 5        ISh  vN nUR                  R                  U R                  U R
                  S9I Sh  vN nSnU R                  (       ar  [        R                  R                  U R                  5      (       a"  UR                  U R                  S9I Sh  vN nO"[        R                  SU R                   35        Uc  UR                  5       I Sh  vN nU R                   H  n UR                  5       I Sh  vN nUR                  U5      I Sh  vN nUc  [!        SU 35      eUR#                  S5      I Sh  vN   U R$                  R'                  XcU5      I Sh  vN nUR)                  5       I Sh  vN   S	U0n	[+        XS
97v   M     UR)                  5       I Sh  vN   SSS5      ISh  vN   g GN GN GN- N N N N Nj NT! [,         a:  n
U R.                  (       a"  [        R1                  SU SU
 35         Sn
A
GM*  U
eSn
A
ff = f Nq Nc! , ISh  vN  (       d  f       g= f7f)r~   r   )async_playwrightNr\   r]   r_   r`   ra   rb   rc   rf   rg   )playwright.async_apir   ri   rj   rR   rT   rU   rk   rl   rm   rn   ro   rp   rP   rq   rr   rX   rs   rS   r$   rt   r   ru   rQ   rv   )r   r   rw   r   rx   ry   r   r   r;   re   rz   s              r   r   PlaywrightURLLoader.alazy_load   s     	:#%%JJ--t}}DJJ-WWGG##77>>$"6"677$+$7$7&*&:&: %8 % G NN%=d>R>R=S#TU ' 3 3 55yy !(!1!1!33D%)YYs^3H'(+McU)STT226:::!%!>!>th!WWD**,&& (#H"HH !( --/!!G &%%W
 6 43 ;W& !  //;C5aSQ     "G &%%%s  IGI2H3
GA!H3,G-=H3*G+H3?G(GG(+G ,)G(G"#G(9G$:G(G&G(&H3=H/>H3IH1IH3H3H3G( G("G($G(&G((
H,2,H'H3%H''H,,H31I3I
9H<:I
I)rU   rQ   rS   rR   rT   rP   )TTNNNN)r&   r'   r(   r)   r*   r	   r+   boolr
   r   r   r   rk   PathLiker4   r   r   r{   r   r   r   r,   r   r   r   rN   rN   j   s    > %)0437*.BFR3iR "R 	R
 #49-R /0R S#X'R "%R[[-=(=">?RB)8H- )V8T(^ 8,"-"9 ,"r   rN   ) r*   loggingrk   abcr   r   typingr   r   r   r   r	   r
   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   r   r   r!   r   r    r   r"   rh   	getLoggerr&   ro   r   r.   rN   r   r   r   <module>r      sm    W  	 # V V V - @<6>;; 
		8	$## #L+9 3 +9\^"* ^"r   