o
    sâg $  ã                   @   s  d Z ddlZddlZddlZddlZddlZddlmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddiZG dd„ dƒZedkr‰eddZe ¡ ZdD ]Z e !e ¡Z"e#d $e"¡ƒ e#dƒ e#e ¡ e ƒ qndS dS )zselenium driver starteré    N)Údatetime)ÚComment)ÚPyPDFLoader)Ú	webdriver)ÚNoSuchElementExceptionÚTimeoutExceptionÚWebDriverException)ÚOptions)ÚBy)Úexpected_conditions)ÚWebDriverWaitz
User-AgentzŽMozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/101.0.4951.44 Mobile/15E148 Safari/604.1c                   @   sŠ   e Zd ZdZddeddfdd„Zdefdd	„Zd
d„ Zdd„ Z	ddd„Z
dd„ Zdd„ Zdefdd„Zdefdd„Zdedefdd„ZdS )ÚSeleniumDriverChromez;Class to start a specific version of the selenium webdriverTÚheadless_modeÚreturnNc                 C   s´   t ƒ }| d¡ |r| d¡ | d¡ | d¡ | d¡ | d¡ | d¡ | d¡ | d	¡ | d
¡ | d¡ | d¡ | dddt ¡ dddddœ¡ tj|d| _d S )Nz²user-agent==Mozilla/5.0
            (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like
            Gecko) CriOS/101.0.4951.44 Mobile/15E148 Safari/604.1z
--headlessz--remote-allow-origins=*z--disable-infobarsz--disable-blink-featuresz---disable-blink-features=AutomationControlledz--disable-loggingz--disable-login-animationsz--disable-notificationsz--disable-default-appsz--allow-insecure-localhostz--log-level=3Úprefsé   FT)z.profile.default_content_setting_values.cookiesz/profile.managed_default_content_settings.imageszdownload.default_directoryzdownload.prompt_for_downloadz:profile.default_content_setting_values.automatic_downloadszdownload.directory_upgradezsafebrowsing.enabled)Úoptions)r	   Úadd_argumentÚadd_experimental_optionÚosÚgetcwdr   ÚChromeÚdriver)Úselfr   Úchrome_options© r   ú8/var/www/html/XCapMarket/utils/selenium_driver_chrome.pyÚ__init__%   s8   ÿ










ùþzSeleniumDriverChrome.__init__c                 C   sx   z|   |¡}|W S  ty   t d|¡ Y dS  ty;   zt dd|¡}|   |¡}|W  Y S  ty:   Y Y dS w w )z Get the page source from the URLzTimeoutError: %sÚ z://www.z://)Úscrape_textr   ÚloggingÚerrorr   ÚreÚsub)r   ÚurlÚ	page_textr   r   r   Úget_urlR   s   


ÿúzSeleniumDriverChrome.get_urlc              
   C   sÌ   | j  |¡ z#t| j dƒ dd„ ¡ t d|t ¡  d¡¡ | j  	t
jd¡j}W |S  tjjttfyJ } zt d||¡ d}W Y d	}~|S d	}~w tye } zt d
||¡ d}W Y d	}~|S d	}~ww )zScrape the text from the URLé   c                 S   s   |   d¡dkS )Nzreturn document.readyStateÚcomplete)Úexecute_script)r   r   r   r   Ú<lambda>k   s    ÿz2SeleniumDriverChrome.scrape_text.<locals>.<lambda>z*Page loaded successfully for URL: %s at %sz%Y-%m-%d %H:%M:%Sú
/html/bodyz!Error retrieving text from %s: %sr   NzUnexpected error with %s: %s)r   Úgetr   Úuntilr    Úinfor   ÚnowÚstrftimeÚfind_elementr
   ÚXPATHÚtextÚurllib3Ú
exceptionsÚProtocolErrorr   r   r!   Ú	Exception)r   r$   r%   Úer   r   r   r   f   s6   ÿýöý€ý€ýz SeleniumDriverChrome.scrape_textc                 C   s  | j  |¡ t| j dƒ t tjdf¡¡ t 	dt
 ¡  d¡|¡ g }| j  tjd¡D ]%}z
| | d¡¡ W q+ tyP } zt d||¡ W Y d}~q+d}~ww tt|ƒƒ}z| j  tjd	¡j}|  ||¡}||fW S  ty€   tjd
|dd g |f Y S w )zScrape the images from the URLr   Úbodyz
%s URL: %sz%Y-%m-%D %H:%M:%SÚimgÚsrcúError with %s: %sNr+   zError with %sT)Úexc_info)r   r,   r   r-   ÚECÚvisibility_of_element_locatedr
   ÚTAG_NAMEr    r.   r   Útodayr0   Úfind_elementsÚappendÚget_attributer7   r!   ÚlistÚsetr1   r2   r3   Úclean_page_textr   )r   r$   Úpage_imagesr:   r8   r%   Úpage_text_listr   r   r   Úscrape_text_and_images€   s,   ÿ€ÿ
þz+SeleniumDriverChrome.scrape_text_and_imagesc                 C   s   | j  ¡  dS )zexit the driverN)r   Úquit)r   r   r   r   rK   ˜   s   zSeleniumDriverChrome.quitc                 C   s.   |j jdv rdS |j jrdS t|tƒrdS dS )z'remove the style, script, and head tags)ÚstyleÚscriptÚmetaz
[document]FT)ÚparentÚnameÚhiddenÚ
isinstancer   )r   Úelementr   r   r   Útag_visibleœ   s   
z SeleniumDriverChrome.tag_visiblec                 C   sx   t |tƒr
| d¡}g }|D ]}t dd|¡}| ¡ }|dks'|dks'|dkr(q| |¡ qt|ƒdkr:t 	d|¡ |S )zÆ
        Input
            page_texts (list): list of strings from the page
            url (str): url that we scrapped
        Output:
            page_texts (list): cleaned page text list
        Ú
z[^a-zA-Z0-9 \n\.]r   Ú r   z%s has no text for us to scrape)
rR   ÚstrÚsplitr"   r#   ÚstriprC   Úlenr    Úwarning)r   Ú
page_textsr$   Úpage_texts_non_emptyr3   r   r   r   rG   ¬   s   

z$SeleniumDriverChrome.clean_page_textc              
   C   sz   z!|  d¡r|  |¡}|  ||¡}|W S |  |¡}|  ||¡}|W S  ty< } zt d||¡ g W  Y d}~S d}~ww )zGet the text from the URLú.pdfr<   N)ÚendswithÚ
scrape_pdfrG   r&   ÚAttributeErrorr    r!   ©r   r$   r%   rI   r8   r   r   r   Úget_url_textÊ   s   


€þz!SeleniumDriverChrome.get_url_textc              
   Ã   s°   t  d¡I dH  z/| d¡r"t  | j|¡I dH }|  ||¡}|W S t  | j|¡I dH }|  ||¡}|  ¡  |W S  tyW } zt	 
d||¡ |  ¡  g W  Y d}~S d}~ww )z(Get the text from the URL asynchronouslygš™™™™™¹?Nr^   r<   )ÚasyncioÚsleepr_   Ú	to_threadr`   rG   r&   rK   ra   r    r!   rb   r   r   r   Úasync_get_url_textÚ   s"   €
€ýz'SeleniumDriverChrome.async_get_url_textr$   c              
   C   sš   zt |td}| ¡ }d}|D ]}||j7 }q|W S  ty3 } zt d||¡ W Y d}~dS d}~w tyL } zt d||¡ W Y d}~dS d}~ww )zScrape the text from the PDF)Úheadersr   r<   N)r   rh   ÚloadÚpage_contentÚ
ValueErrorr    r!   r7   )r   r$   Ú
pdf_loaderÚpagesÚpdf_textÚpager8   r   r   r   r`   í   s    €€þzSeleniumDriverChrome.scrape_pdf)T)r   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úboolr   rW   r&   r   rJ   rK   rT   rG   rc   rg   rE   r`   r   r   r   r   r   "   s    -
r   Ú__main__F)r   )zLhttps://en.wikipedia.org/wiki/List_of_Formula_One_World_Drivers%27_ChampionsrU   z


)%rs   rd   r    r   r"   Útimer   r4   Úbs4.elementr   Ú$langchain_community.document_loadersr   Úseleniumr   Úselenium.common.exceptionsr   r   r   Ú!selenium.webdriver.chrome.optionsr	   Úselenium.webdriver.common.byr
   Úselenium.webdriver.supportr   r>   Úselenium.webdriver.support.uir   rh   r   rp   Ú
sel_driverÚsÚirc   ÚaÚprintÚjoinr   r   r   r   Ú<module>   s<    ÿ b

ò