import asyncio
# from utils.logger import ServiceLogger
import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import urllib3
from bs4 import BeautifulSoup as soup
from bs4.element import Comment
from langchain_community.document_loaders import PyPDFLoader
from playwright.async_api import async_playwright

# Module-wide logger. This is the root logger, so records follow whatever
# handlers and level the host application configures on it.
XCM_logger = logging.getLogger()

# HTTP headers handed to the PDF loader; the User-Agent is a mobile Chrome
# (CriOS) on iPhone string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "CriOS/101.0.4951.44 Mobile/15E148 Safari/604.1"
    )
}


class PlaywrightDriver:
    """Async wrapper around a Playwright Chromium session for scraping text.

    Typical use: construct, ``await init_browser()``, call a scrape method,
    then ``await quit()``.  The scrape methods log failures and return an
    empty value (``""`` or ``[]``) instead of raising.
    """

    # Everything except ASCII letters, digits, spaces, newlines and periods
    # is stripped from scraped text.  Compiled once instead of per line.
    _UNWANTED_CHARS = re.compile("[^a-zA-Z0-9 \n\\.]")

    def __init__(self, headless_mode: bool = True, page_timeout_ms: int = 10000) -> None:
        """Store configuration; no browser is started until init_browser().

        Args:
            headless_mode: Launch Chromium without a visible window when True.
            page_timeout_ms: Navigation timeout passed to ``page.goto``, in
                milliseconds.
        """
        self.headless_mode = headless_mode
        self.page_timeout_ms = page_timeout_ms
        # Populated by init_browser(); None means "not started / already closed".
        self.playwright = None
        self.browser = None
        self.context = None

    async def init_browser(self) -> None:
        """Start Playwright, launch Chromium and open a browser context.

        Raises:
            Exception: whatever Playwright raises on start-up; anything that
                was already started is shut down before re-raising.
        """
        # Keep the driver handle so quit() can stop it.
        self.playwright = await async_playwright().start()
        try:
            self.browser = await self.playwright.chromium.launch(
                headless=self.headless_mode
            )
            self.context = await self.browser.new_context()
        except Exception:
            # Do not leave a half-started session behind.
            await self.quit()
            raise

    async def get_url(self, url) -> str:
        """Return the body text of *url*, or ``""`` if it cannot be retrieved."""
        try:
            return await self.scrape_text(url)
        except Exception as e:
            XCM_logger.error("Error retrieving URL %s: %s", url, e)
            return ""

    async def _close_page(self, page) -> None:
        """Close *page* if one was opened, logging instead of raising on failure."""
        if page is None:
            return
        try:
            await page.close()
        except Exception as e:
            XCM_logger.error("Error closing page: %s", e)

    async def scrape_text(self, url) -> str:
        """Open *url* in a new page and return the inner text of ``<body>``.

        Returns ``""`` on any error.  The page is closed on every path.
        """
        page = None
        try:
            page = await self.context.new_page()
            await page.goto(url, timeout=self.page_timeout_ms)
            XCM_logger.info(
                "Page loaded successfully for URL: %s at %s",
                url,
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            )
            return await page.inner_text("body")
        except Exception as e:
            XCM_logger.error("Unexpected error with %s: %s", url, e)
            return ""
        finally:
            # Runs on success and failure alike so pages never accumulate.
            await self._close_page(page)

    async def scrape_text_and_images(self, url):
        """Return ``(cleaned_text_lines, image_urls)`` for *url*.

        Image URLs are the ``src`` of every ``<img>``, de-duplicated in
        document order.  Returns ``([], [])`` on any error.  The page is
        closed on every path.
        """
        page = None
        try:
            page = await self.context.new_page()
            await page.goto(url, timeout=self.page_timeout_ms)
            XCM_logger.info("Page loaded for URL: %s", url)

            # Extract images
            images = await page.eval_on_selector_all(
                "img", "elements => elements.map(img => img.src)"
            )
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # so the result is deterministic (a set is not).
            images = list(dict.fromkeys(images))

            # Extract text
            page_text = await page.inner_text("body")
            page_text_list = self.clean_page_text(page_text, url)

            return page_text_list, images
        except Exception as e:
            XCM_logger.error("Error retrieving content from %s: %s", url, e)
            return [], []
        finally:
            await self._close_page(page)

    async def quit(self) -> None:
        """Close the context, the browser and the Playwright driver.

        Safe to call repeatedly or before init_browser(): resources that are
        not open are skipped, and a failure releasing one resource is logged
        without preventing the others from being released.
        """
        for attr, method in (
            ("context", "close"),
            ("browser", "close"),
            ("playwright", "stop"),
        ):
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            # Clear the handle first so a second quit() is a no-op even if
            # the release below fails.
            setattr(self, attr, None)
            try:
                await getattr(resource, method)()
            except Exception as e:
                XCM_logger.error("Error closing %s: %s", attr, e)

    def clean_page_text(self, page_texts, url) -> list[str]:
        """Normalise scraped text into a list of non-empty lines.

        Args:
            page_texts: A string (split on newlines), a list of strings, or
                None (treated as no text).
            url: URL the text came from; only used in the warning message.

        Returns:
            The lines with every character other than ASCII letters, digits,
            spaces, newlines and periods removed, stripped of surrounding
            whitespace, and with empty lines dropped.
        """
        if page_texts is None:
            page_texts = []
        elif isinstance(page_texts, str):
            page_texts = page_texts.split("\n")

        stripped_lines = (
            self._UNWANTED_CHARS.sub("", text).strip() for text in page_texts
        )
        page_texts_non_empty = [line for line in stripped_lines if line]

        if not page_texts_non_empty:
            XCM_logger.warning("%s has no text for us to scrape", url)

        return page_texts_non_empty

    async def get_url_text(self, url) -> list[str]:
        """Return the cleaned text lines of *url* (web page or PDF).

        URLs ending in ``.pdf`` (case-insensitive) are read with the PDF
        loader in a worker thread; everything else goes through the browser.
        Returns ``[]`` on error.

        NOTE(review): the browser session is closed after a non-PDF URL and
        after any error, but not after a successful PDF read, so the driver
        is effectively single-use for web pages.  Kept as-is because callers
        may rely on it; callers that only read PDFs should call quit().
        """
        # Short pause before each request (presumably throttling; confirm).
        await asyncio.sleep(0.1)
        try:
            if url.lower().endswith(".pdf"):
                pdf_text = await asyncio.to_thread(self.scrape_pdf, url)
                return self.clean_page_text(pdf_text, url)

            page_text = await self.get_url(url)
            page_text_list = self.clean_page_text(page_text, url)
        except Exception as e:
            XCM_logger.error("Error retrieving text from %s: %s", url, e)
            page_text_list = []

        # quit() never raises, so it cannot mask the result or trigger a
        # second, unguarded shutdown attempt.
        await self.quit()
        return page_text_list

    def scrape_pdf(self, url: str) -> str:
        """Return the concatenated text of every page of the PDF at *url*.

        Blocking; run it in a thread from async code.  Returns ``""`` on
        any error.
        """
        try:
            pdf_loader = PyPDFLoader(url, headers=headers)
            pages = pdf_loader.load()
            return "".join(page.page_content for page in pages)
        except Exception as e:
            XCM_logger.error("Error scraping PDF %s: %s", url, e)
            return ""


if __name__ == "__main__":

    async def main():
        """Manual smoke test: scrape one Wikipedia page and print its cleaned lines."""
        target = "https://en.wikipedia.org/wiki/List_of_Formula_One_World_Drivers%27_Champions"

        # Headed mode so the browser window is visible while the demo runs.
        scraper = PlaywrightDriver(headless_mode=False)
        await scraper.init_browser()

        # get_url_text closes the browser session itself for non-PDF URLs.
        cleaned_lines = await scraper.get_url_text(target)
        print("\n".join(cleaned_lines))

    asyncio.run(main())
