o
    |gr+                     @   s"  d dl Z d dlZd dlZd dlmZ d dlZejd d dlZd dlZd dl	m
Z
 d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZ d dlmZ d dlmZ d dl m!Z! e" Z#dZ$G dd dZ%e&dkrdZ'e%e'dddddZ(e)e(j* dS dS )    N)urlparse.)RobotFileParser)NodePreOrderIter)ChatPromptTemplate)
ChatOpenAI)CharacterTextSplitter)Selector)OPENAI_API_KEYOPENAI_MODEL_35OPENAI_MODEL_MINIOPENAI_TOKEN_LIMIT)SeleniumDriverChrome)
parsed_url)crawl_pagesd   c                   @   s.  e Zd ZdgZdZi Z				d2dedededed	ef
d
dZdd Z	dd Z
efddZefddZdedee fddZdedejfddZdee dee fddZdd ZdefddZd edefd!d"Zd#efd$d%Zd&ee dee fd'd(Zded)ee fd*d+Zded,efd-d.Zd/d0 Zd1S )3SitemapScrapez/robots.txt   TFurlscrape_sitemap
url_scrape	summarizeexclude_blogsc                 C   st   || _ || _|| _|| _|| _| t|j| _| | j| _	| j r8| 
 | _tt| j| _| | j| _d S d S N)sitemap_scraper   r   r   r   sitemaps_from_robotsr   domain_sitemapremove_duplicate_sitemaps_urlsconsolidate_sitemapsr   	site_urlslistsetrank_urls_by_path_depth)selfr   r   r   r   r    r%   :/var/www/html/XCapMarket/utils/webscrape/sitemap_scrape.py__init__0   s   	
zSitemapScrape.__init__c                    s$   |  | jI d H | _|  | _d S r   )scrape_urlsr    urls_scrapedconsolidate_images
image_urlsr$   r%   r%   r&   a_process_urlsJ   s   zSitemapScrape.a_process_urlsc                 C   s   t |   d S r   )asynciorunr-   r,   r%   r%   r&   process_urlsN   s   zSitemapScrape.process_urlsc                 C       dd }t ||d}|d | S )Nc                 S   "   t | j}tdd |dD S )Nc                 S      g | ]}|r|qS r%   r%   .0pr%   r%   r&   
<listcomp>U       MSitemapScrape.rank_urls_by_path_depth.<locals>.path_depth.<locals>.<listcomp>/r   pathlensplitr   r<   r%   r%   r&   
path_depthR      
9SitemapScrape.rank_urls_by_path_depth.<locals>.path_depthkeysortedr$   urlstop_nr@   ranked_urlsr%   r%   r&   r#   Q      z%SitemapScrape.rank_urls_by_path_depthc                 C   r1   )Nc                 S   r2   )Nc                 S   r3   r%   r%   r4   r%   r%   r&   r7   ^   r8   r9   r:   r;   r?   r%   r%   r&   r@   [   rA   rB   rC   rE   rG   r%   r%   r&   r#   Z   rK   returnc           	      C   s   |d g}| |d  | jD ]+}|| }t|}| |}|jdv r$q|jd}|| | }|r:|	| qt
t|S )zGet the sitemap for a urlz/sitemap.xmlz/sitemap_index.xml)i  i  
)appendrobots_locationsr   get_urlstatus_codetextr>   parse	site_mapsextendr!   r"   )	r$   r   sitemap_urlsrobots_location	robot_urlrprlinessitemapsr%   r%   r&   r   c   s   





z"SitemapScrape.sitemaps_from_robotsc              
   C   s   zt j|ddiddd}W |S  t jjy@ } z$tjd||dd td t j|d	d
ddiddd}W Y d}~|S d}~w t	jj
y] } ztjd||dd td |d}~ww )zGet the urlz
User-AgentzMozilla/5.0
   T)headerstimeoutallow_redirectszConnectionError with %s: %sexc_info   z//www.z//NzMaxRetryError with %s: %s      ?)requestsget
exceptionsConnectionError
XCM_loggererrortimesleepreplaceurllib3MaxRetryError)r$   r   rZ   er%   r%   r&   rP   v   s2   


zSitemapScrape.get_urlr   c                 C   s*   t  }|D ]}| |}||j q|S )zRemove duplicate sitemap urls)r"   rP   addr   )r$   r   r   sitemaprZ   r%   r%   r&   r      s
   
z,SitemapScrape.remove_duplicate_sitemaps_urlsc                 C   s   t jd| jdd g }| jrOtd | j }| |}| |j	}t
|}|d D ]}| j| | q.|d D ]
}|| | qA| jst dt|| j |S )zScrape the sitemapzScrape sitemap for %sTra   rd   z//sitemap/loc/text()z//url/loc/text()z#Scraped %s urls from sitemap for %s)ri   infor   r   rk   rl   poprP   _clean_cdatarR   r
   xpathgetallrq   clean_loc_textrN   r=   )r$   rH   sitemap_urlresponseresponse_textselectorlocr%   r%   r&   r      s   


zSitemapScrape.scrape_sitemaprR   c                 C   s   dd l }|dd|S )Nr   z<!\[CDATA\[(.*?)\]\]>z\1)resub)r$   rR   r~   r%   r%   r&   ru      s   zSitemapScrape._clean_cdataloc_textc                 C   s    |  ddd }|  S )zClean the loc url textr:    r   )striprstripr>   )r$   r   r%   r%   r&   rx      s   zSitemapScrape.clean_loc_textrH   c           
         s   i } fdd|D }t |ddI dH }g }| D ]\}}|dd}|r3 jr3| || qtj| I dH }t|D ]\}	}|| d || d  jrU||	 ndd	||< q@|S )
zScrape the URLs asynchronouslyc                    s<   g | ]}t t|jd  jkrd| v r js|qS )r:   blog)r=   r   r<   r>   max_depth_to_scrapelowerr   )r5   r   r,   r%   r&   r7      s    z-SitemapScrape.scrape_urls.<locals>.<listcomp>T)with_imagesNmarkdown images)rR   r   summary)	r   itemsrf   r   rN   summarize_websiter.   gather	enumerate)
r$   rH   r)   urls_to_scrapetasksr   	page_data	page_text	summariesir%   r,   r&   r(      s&   



zSitemapScrape.scrape_urlstextsc                 C   s0   g }|D ]}|| j vrd| j |< || q|S )zRemove duplicate textT)duplicate_page_text_dictrN   )r$   url_noder   unique_page_textrR   r%   r%   r&   remove_duplicate_text   s   


z#SitemapScrape.remove_duplicate_textpage_text_listc                    s^   t g d}tttddd}||B }| fdd|D I dH }dd	d |D }|S )
z Summarize the website text chunk))systema  You are a research analyst with 20 years of experience and you have been asked to summarize website pages.                 You will first understand the website text in detail and then summarize the relevant information.                 Only return the summary text of the website text. Make your summary verbose but factual. Extra information is better than truncating information that could be useful.                You will be provided prior summary of the website if the page had too much text. You want to only append to the summary.z2systemBring the relevant information from the text)humanz%The URL you are summarizing is: {url})r   z The website text is: {page_text}g?i  )
model_nameapi_keytemperature
max_tokensc                    s   g | ]} |d qS ))r   r   r%   )r5   rR   r   r%   r&   r7     s    z>SitemapScrape.summarize_website_text_chunk.<locals>.<listcomp>NrM   c                 S   s   g | ]}|j qS r%   )content)r5   resultr%   r%   r&   r7     s    )r   from_messagesr   r   r   abatchjoin)r$   r   r   promptllmchainoutput_resultsoutputr%   r   r&   summarize_website_text_chunk   s    
z*SitemapScrape.summarize_website_text_chunkr   c                    sB   t jdttd tdt d}||}| ||I dH }|S )zSummarize the websitecl100k_base   g?)encoding_name
chunk_sizechunk_overlapN)r	   from_tiktoken_encoderintr   
split_textr   )r$   r   r   text_splitterr   r   r%   r%   r&   r     s   


zSitemapScrape.summarize_websitec                 C   s   t d| j i }| jD ]&}| j| d D ]}|d }||vr'|dd||< q|| d  d7  < qqdd t| d	d
 ddD }|S )zConsolidate the imageszConsolidating images for %sr   srcrc   )r   countr   c                 S   s   i | ]\}}|d ur||qS r   r%   )r5   kvr%   r%   r&   
<dictcomp>3  s
    z4SitemapScrape.consolidate_images.<locals>.<dictcomp>c                 S   s   | d d S )Nrc   r   r%   )itemr%   r%   r&   <lambda>6  s    z2SitemapScrape.consolidate_images.<locals>.<lambda>T)rD   reverse)ri   rs   r   r)   rF   r   )r$   r+   siteimage	image_urlr%   r%   r&   r*   %  s   
z SitemapScrape.consolidate_imagesN)TTTF)__name__
__module____qualname__rO   r   r   strboolr'   r-   r0   MAX_SITES_TO_SCRAPEr#   r!   r   re   ResponserP   r"   r   r   ru   rx   r(   r   r   r   r*   r%   r%   r%   r&   r   (   sD    
		
#!r   __main__zhttps://munichmotorsport.com/TF)r   r   r   r   )+r.   loggingsysurllib.parser   rn   r<   rN   rk   urllib.robotparserr   re   anytreer   r   langchain.promptsr   langchain_openair   langchain_text_splittersr	   parselr
   configs.configr   r   r   r   utils.selenium_driver_chromer   utils.url_parserr   utils.webscrape.crawl_pagesr   	getLoggerri   r   r   r   URLr   printr    r%   r%   r%   r&   <module>   s>     
