import asyncio
import logging
from datetime import datetime

import aiohttp
import feedparser
import urllib3
from bs4 import BeautifulSoup

from services.PressRelease.PRSources.data_models.PR_Model import (PRModel,
                                                                  RSSFeed)
from utils.url_parser import parsed_url

# Configure logging and silence urllib3's certificate warnings
logging.basicConfig(level=logging.INFO)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class PRScraper:
    """Class to parse press release data from RSS feeds"""

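    # Browser-style User-Agent; some press release sites reject requests sent
    # with default HTTP client user agents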
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    def __init__(self):
        pass
        # adapter = HTTPAdapter(max_retries=5)
        # self.session = requests.Session()
        # self.session.mount('https://', adapter)

    async def scrape_rss_feed(self, rss_feed: RSSFeed) -> list[PRModel]:
        """Fetch data from the RSS feed and return a list of dictionaries"""

        logging.info(f"Fetching data from RSS feed: {rss_feed.url}")
        async with aiohttp.ClientSession() as session:
            try:
                response = await session.get(
                    rss_feed.url, timeout=10, headers=self.request_headers
                )
                content = await response.text()
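                # feedparser.parse accepts a raw XML string, so the body fetched
                # by aiohttp can be handed to it directly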
                feed = feedparser.parse(content)
            except Exception as e:
                logging.error(f"Error parsing rss feed with url {rss_feed.url}: {e}")
                return []

        if feed.bozo:  # Check if the feed was parsed successfully
            logging.error(f"Error parsing feed: {feed.bozo_exception}")
            return []

        entries = feed.entries
        data = []
        for entry in entries:
            parsed_date = date_parser(
                entry.get("published", entry.get("updated", ""))
            ).isoformat()
            # Semi-constructed PR object: the press release body is filled in
            # separately by scrape_pr
            pr_entry = PRModel(
                title=entry.title,
                url=entry.link,
                date=parsed_date,
                company_name=getattr(entry, "author", None),
                company_url=entry.get("author_detail", {}).get("href"),
                text="",
                source=parsed_url(rss_feed.url).domain,
                rss_feed_type=rss_feed.name,
            )
            pr_entry.update_modified()
            data.append(pr_entry)

        logging.info(f"Fetched {len(data)} entries from RSS feed")
        return data

    async def scrape_pr(self, pr: PRModel) -> PRModel:
        """Fetch the text content of the press release from the URL"""
        rss_feed = RSSFeed[pr.rss_feed_type]
        try:
            logging.info(f"Fetching PR text from URL: {pr.url}")
            async with aiohttp.ClientSession() as session:
                response = await session.get(
                    pr.url, timeout=10, headers=self.request_headers
                )
                response.raise_for_status()
                content = await response.text()

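            # html.parser is the stdlib parser; lxml is a faster drop-in if installed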
            soup = BeautifulSoup(content, "html.parser")

            # Remove script and style elements
            for tag in soup(["script", "style"]):
                tag.decompose()
            # Get text from attribute if specified, else get text from entire page
            if rss_feed.attributes:
                feed_content = soup.find(rss_feed.tag_name, rss_feed.attributes)
                text = feed_content.text if feed_content else soup.get_text()
            else:
                text = soup.get_text()

            # Break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # Break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # Drop blank lines
            text = "\n".join(chunk for chunk in chunks if chunk)
            pr.text = text

            logging.info(f"Successfully extracted text from URL: {pr.url}")
        except (asyncio.TimeoutError, aiohttp.ClientError) as e:
            logging.error(f"Error fetching PR text from URL {pr.url}: {e}")
        except Exception as e:
            logging.error(
                f"Error fetching PR text from URL {pr.url} with unknown error: {e}"
            )
        return pr


def date_parser(date):
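    """Parse an RSS/Atom timestamp against a list of known formats.

    Falls back to the current time when the value is empty or matches no format.
    """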
    date_formats = [
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%a, %d %b %Y %H:%M:%S %Z",
        # RFC 822 pubDate with a numeric UTC offset (e.g. +0000), common in RSS feeds
        "%a, %d %b %Y %H:%M:%S %z",
    ]

    str_date = date
    if str_date:
        for date_format in date_formats:
            try:
                return datetime.strptime(str_date, date_format)
            except ValueError:
                continue
    return datetime.now()
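

# A minimal sketch (not part of the original flow) of how the async API composes:
# scrape the feed once, then fetch every press release body concurrently instead
# of running one event loop per entry as in the __main__ block below.
async def scrape_feed_with_texts(scraper: PRScraper, rss_feed: RSSFeed) -> list[PRModel]:
    entries = await scraper.scrape_rss_feed(rss_feed)
    return await asyncio.gather(*(scraper.scrape_pr(entry) for entry in entries))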


if __name__ == "__main__":
    scraper = PRScraper()
    feed = asyncio.run(scraper.scrape_rss_feed(RSSFeed.prlog))
    for entry in feed:
        pr = asyncio.run(scraper.scrape_pr(entry))
        print(pr)
