"""selenium driver starter"""


import asyncio
import logging
import os
import re
import time
from datetime import datetime

import urllib3
from bs4.element import Comment
from langchain_community.document_loaders import PyPDFLoader
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Mobile (iOS Chrome) user agent, reused for the Selenium session and for
# downloading PDFs with PyPDFLoader.
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/101.0.4951.44 Mobile/15E148 Safari/604.1"
}


class SeleniumDriverChrome:
    "Class to start and drive a Chrome Selenium webdriver configured for scraping"

    def __init__(self, headless_mode: bool = True) -> None:
        chrome_options = Options()
        # Reuse the module-level user agent; note the single "=" and the
        # absence of embedded newlines, so Chrome receives the exact string.
        chrome_options.add_argument(f"user-agent={headers['User-Agent']}")
        if headless_mode:
            chrome_options.add_argument("--headless")  # Enable headless mode

        chrome_options.add_argument("--remote-allow-origins=*")
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--disable-blink-features")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--disable-login-animations")
        chrome_options.add_argument("--disable-notifications")
        chrome_options.add_argument("--disable-default-apps")
        chrome_options.add_argument("--allow-insecure-localhost")
        chrome_options.add_argument("--log-level=3")

        chrome_options.add_experimental_option(
            "prefs",
            {
                "profile.default_content_setting_values.cookies": 2,  # 2 = block cookies
                "profile.managed_default_content_settings.images": 2,  # 2 = skip image loading
                "download.default_directory": os.getcwd(),  # Download into the working directory
                "download.prompt_for_download": False,  # Download files without prompting
                "profile.default_content_setting_values.automatic_downloads": 2,  # 2 = block chained automatic downloads
                "download.directory_upgrade": True,
                "safebrowsing.enabled": True,
            },
        )

        self.driver: webdriver.Chrome = webdriver.Chrome(options=chrome_options)
        # stealth(self.driver,
        #     languages=["en-US", "en"],
        #     vendor="Google Inc.",
        #     platform="Win32",
        #     webgl_vendor="Intel Inc.",
        #     renderer="Intel Iris OpenGL Engine",
        #     fix_hairline=True,
        # )

    def get_url(self, url: str) -> str:
        "Get the visible page text from the URL, retrying without 'www.' on driver errors"

        try:
            page_text = self.scrape_text(url)
            return page_text

        except TimeoutException:
            logging.error("TimeoutError: %s", url)
            return ""

        except WebDriverException:
            try:
                url = re.sub("://www.", "://", url)
                page_text = self.scrape_text(url)
                return page_text

            except WebDriverException:
                return ""

    def scrape_text(self, url: str) -> str:
        "Scrape the visible text from the URL once the page has loaded"
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, 5).until(
                lambda driver: driver.execute_script("return document.readyState")
                == "complete"
            )
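            # document.readyState reports "complete" once the page and its
            # sub-resources have finished loading; we allow up to 5 seconds.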
            logging.info(
                "Page loaded successfully for URL: %s at %s",
                url,
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            )
            page_text = self.driver.find_element(By.XPATH, "/html/body").text
        except (
            urllib3.exceptions.ProtocolError,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            logging.error("Error retrieving text from %s: %s", url, e)
            page_text = ""
        except Exception as e:
            logging.error("Unexpected error with %s: %s", url, e)
            page_text = ""
        return page_text

    def scrape_text_and_images(self, url: str):
        "Scrape the visible text and image URLs from the page"
        self.driver.get(url)
        WebDriverWait(self.driver, 2).until(
            EC.visibility_of_element_located((By.TAG_NAME, "body"))
        )
        logging.info("%s URL: %s", datetime.today().strftime("%Y-%m-%D %H:%M:%S"), url)

        page_images = []
        for img in self.driver.find_elements(By.TAG_NAME, "img"):
            try:
                src = img.get_attribute("src")
                if src:  # skip <img> tags without a usable src
                    page_images.append(src)
            except WebDriverException as e:
                logging.error("Error with %s: %s", url, e)

        page_images = list(set(page_images))  # de-duplicate image URLs
        try:
            page_text = self.driver.find_element(By.XPATH, "/html/body").text
            page_text_list = self.clean_page_text(page_text, url)
            return page_text_list, page_images
        except NoSuchElementException:
            logging.error("Error with %s", url, exc_info=True)
            return [], page_images
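
    # Usage sketch: `texts, image_urls = drv.scrape_text_and_images(url)`,
    # where `drv` is an illustrative SeleniumDriverChrome instance.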

    def quit(self) -> None:
        "Quit the underlying Chrome driver"
        self.driver.quit()

    def tag_visible(self, element):
        "Return True if a BeautifulSoup text node would be visible on the page"
        if element.parent.name in [
            "style",
            "script",
            "meta",
            "[document]",
            # "link"
        ]:  # could add 'head', 'title', if needed
            return False
        if element.parent.hidden:
            return False
        if isinstance(element, Comment):
            return False
        return True
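
    # Sketch of how `tag_visible` can be used with BeautifulSoup (assumes
    # `page_source` is an HTML string, e.g. self.driver.page_source):
    #
    #   from bs4 import BeautifulSoup
    #   soup = BeautifulSoup(page_source, "html.parser")
    #   visible = [t for t in soup.find_all(string=True) if self.tag_visible(t)]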

    def clean_page_text(self, page_texts, url):
        """
        Input:
            page_texts (str | list): page text as a single string or a list
                of strings
            url (str): URL that was scraped (used for logging)
        Output:
            page_texts_non_empty (list): cleaned, non-empty text lines
        """
        if isinstance(page_texts, str):
            page_texts = page_texts.split("\n")

        page_texts_non_empty = []
        # strip punctuation noise from each line and drop anything left empty
        for text in page_texts:
            # keep only alphanumerics, spaces, newlines, and periods
            text = re.sub(r"[^a-zA-Z0-9 \n\.]", "", text)
            text = text.strip()

            # skip lines that are empty after cleaning
            if not text:
                continue

            page_texts_non_empty.append(text)

        if len(page_texts_non_empty) == 0:
            logging.warning("%s has no text for us to scrape", url)

        return page_texts_non_empty
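
    # Example: clean_page_text("Hello, world!\n\n  Done.  ", url) returns
    # ["Hello world", "Done."]; punctuation other than "." is stripped and
    # blank lines are dropped.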

    def get_url_text(self, url: str) -> list:
        "Get the cleaned text lines from a web page or PDF URL"
        try:
            if url.endswith(".pdf"):
                page_text = self.scrape_pdf(url)
                page_text_list = self.clean_page_text(page_text, url)
                return page_text_list

            page_text = self.get_url(url)
            page_text_list = self.clean_page_text(page_text, url)
            return page_text_list

        except AttributeError as e:
            logging.error("Error with %s: %s", url, e)
            return []

    async def async_get_url_text(self, url: str) -> list:
        "Get the text from the URL asynchronously"
        await asyncio.sleep(0.1)  # yield to the event loop before blocking work
        try:
            if url.endswith(".pdf"):
                page_text = await asyncio.to_thread(self.scrape_pdf, url)
                page_text_list = self.clean_page_text(page_text, url)
                return page_text_list

            page_text = await asyncio.to_thread(self.get_url, url)
            page_text_list = self.clean_page_text(page_text, url)
            self.quit()
            return page_text_list

        except AttributeError as e:
            logging.error("Error with %s: %s", url, e)
            self.quit()
            return []

    def scrape_pdf(self, url: str) -> str:
        "Scrape the text from a PDF at the given URL"
        try:
            pdf_loader = PyPDFLoader(url, headers=headers)
            pages = pdf_loader.load()
            pdf_text = ""
            for page in pages:
                pdf_text += page.page_content
            return pdf_text

        except Exception as e:  # pylint: disable=broad-except
            logging.error("Error with %s: %s", url, e)
            return ""

    # def __del__(self):
    #     self.quit()
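
# A minimal sketch of calling the async helper from an event loop. The wrapper
# name `fetch_text` is illustrative, not part of this module; note that
# `async_get_url_text` quits its driver after each fetch, so create a fresh
# SeleniumDriverChrome per URL:
#
#   async def fetch_text(url):
#       return await SeleniumDriverChrome().async_get_url_text(url)
#
#   lines = asyncio.run(fetch_text("https://example.com"))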


if __name__ == "__main__":
    sel_driver = SeleniumDriverChrome(headless_mode=False)

    start_time = time.time()

    for test_url in [
        "https://en.wikipedia.org/wiki/List_of_Formula_One_World_Drivers%27_Champions"
    ]:
        page_lines = sel_driver.get_url_text(test_url)
        print("\n".join(page_lines))

        print("\n\n\n")

        print(time.time() - start_time)

    sel_driver.quit()
