import asyncio
import json
import logging
import random
from decimal import Decimal
from itertools import chain
from typing import Optional

from boto3.dynamodb.conditions import Attr

from services.PressRelease.pr_scraper import PRScraper
from services.PressRelease.pr_summarizer import PRSummarizer
from services.PressRelease.PRSources.data_models.PR_Model import (PRModel,
                                                                  RSSFeed)
from utils.dynamo_connector import DynamoConnector

MAX_RETRIES = 10


class PRProcessor:
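    """Fetches press releases from RSS feeds, scrapes and summarizes new ones, and persists them to DynamoDB."""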
    table_name = "press_releases"

    def __init__(self):
        self.scraper = PRScraper()
        self.summarizer = PRSummarizer()
        self.db = DynamoConnector().dynamodb
        self.table = self.db.Table(self.table_name)

    def get_prs_to_summarize(self) -> list[PRModel]:
        """Scan the table for press releases that have not been processed yet."""
        items = []
        scan_kwargs = {"FilterExpression": Attr("processed").eq(False)}
        # Paginate: a single scan call returns at most 1 MB of data
        while True:
            response = self.table.scan(**scan_kwargs)
            items.extend(response.get("Items", []))
            if "LastEvaluatedKey" not in response:
                break
            scan_kwargs["ExclusiveStartKey"] = response["LastEvaluatedKey"]
        return [PRModel(**item) for item in items]

    def persist_data(self, items: list[PRModel]) -> None:
        """Store the press release data in DB"""
        item_set = set()

        with self.table.batch_writer() as writer:
            for pr_entry in items:
                if pr_entry.url in item_set:
                    logging.error(f"Duplicate item found: {pr_entry.url}")
                    continue
                item_set.add(pr_entry.url)
                logging.info(f"Writing data from {pr_entry.url} to ddb.")
                pr_entry_dict = pr_entry.model_dump()
                # Convert datetime to str before persisting
                pr_entry_dict["date"] = pr_entry.date.isoformat()
                pr_entry_dict["created_date"] = pr_entry.created_date.isoformat()
                pr_entry_dict["modified_date"] = pr_entry.modified_date.isoformat()
                # TODO: Fix this rather than brute forcing
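                # The json round-trip converts any float values to Decimal, which DynamoDB requires for numbers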
                item = json.loads(json.dumps(pr_entry_dict), parse_float=Decimal)
                writer.put_item(Item=item)

        logging.info(f"Stored {len(items)} entries in DynamoDB")

    async def get_prs_to_scrape(self, items: list[PRModel]) -> list[PRModel]:
        """Return the press releases whose URLs are not already stored in the database."""

        def chunker(seq, size):
            """Helper function to split a sequence into chunks of a given size"""
            for pos in range(0, len(seq), size):
                yield seq[pos : pos + size]

        item_urls = {item.url for item in items}
        urls = [{"url": url} for url in item_urls]

        max_batch_size = 99  # stay under DynamoDB's 100-key limit per batch_get_item request
        fetched_prs = []
        fetched_urls = set()

        for chunk in chunker(urls, max_batch_size):
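            # Note: batch_get_item is a synchronous boto3 call; any UnprocessedKeys returned under throttling are not retried here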
            response = self.db.batch_get_item(
                RequestItems={
                    self.table_name: {
                        "Keys": chunk,
                    }
                }
            )
            logging.info(f"Dynamodb response for chunk: {response}")

            if response.get("Responses") and response.get("Responses").get(
                self.table_name
            ):
                chunk_fetched_prs = response.get("Responses").get(self.table_name)
                fetched_prs.extend(chunk_fetched_prs)
                fetched_urls.update(pr["url"] for pr in chunk_fetched_prs)

        non_existent_prs = []
        # Ensure there are no duplicates in non-existent prs as well
        non_existent_urls = set()
        for item in items:
            if item.url not in fetched_urls and item.url not in non_existent_urls:
                non_existent_prs.append(item)
                non_existent_urls.add(item.url)
                logging.info(f"Press Release: {item.url} not found in db, will scrape")

        logging.info(
            f"Out of {len(items)} PRs, {len(non_existent_urls)} were not found in the db."
        )
        return non_existent_prs

    async def process(self):
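        """Run the full pipeline: fetch RSS feeds, scrape new press releases, summarize them, and persist the results."""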
        # Step 1: Fetch PRs from multiple RSS feeds concurrently
        prs_fetched_lists = await asyncio.gather(
            *[self.scraper.scrape_rss_feed(feed) for feed in RSSFeed]
        )

        # Flatten the list of lists into a single list of PRs
        prs_fetched = list(chain.from_iterable(prs_fetched_lists))
        # Shuffle so we don't hit the same site with back-to-back requests, which tends to cause failures
        random.shuffle(prs_fetched)

        # Step 2: Determine which PRs need to be scraped
        prs_to_scrape = await self.get_prs_to_scrape(prs_fetched)
        retry_count = 0

        while prs_to_scrape and retry_count < MAX_RETRIES:
            logging.info(f"Processing {len(prs_to_scrape)} PRs; retries: {retry_count}")
            # Step 3: Scrape and summarize PRs concurrently
            scrape_pr_result = await asyncio.gather(
                *[self.process_pr(pr) for pr in prs_to_scrape]
            )

            fully_scraped_prs = [pr for pr, ok in scrape_pr_result if ok]
            prs_to_scrape = [pr for pr, ok in scrape_pr_result if not ok]

            logging.info(f"Persisting {len(fully_scraped_prs)} PRs to DynamoDB")
            # Step 4: Persist the scraped and summarized PRs
            self.persist_data(fully_scraped_prs)

            if prs_to_scrape:
                await backoff(1, retry_count, MAX_RETRIES)

            retry_count += 1

    async def process_pr(self, pr: PRModel) -> tuple[Optional[PRModel], bool]:
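        """Scrape and summarize a single PR; the boolean indicates whether scraping yielded any text."""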
        # Scrape the PR
        fetched_pr = await self.scraper.scrape_pr(pr)
        if fetched_pr.text != "":
            # Summarize the fetched PR
            return await self.summarizer.summarize(fetched_pr), True
        return fetched_pr, False


async def backoff(base_delay: float, retry_count: int, max_retries: int) -> None:
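    """Sleep for an exponentially increasing delay with random jitter before the next retry."""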
    delay = base_delay * (2**retry_count) + random.uniform(0, 1)
    logging.info(
        f"Retry {retry_count + 1}/{max_retries}. Waiting for {delay:.2f} seconds before next attempt."
    )
    await asyncio.sleep(delay)


# For AWS Lambda
def handler(event, context):
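    """AWS Lambda entry point: runs the PR processing pipeline to completion."""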
    logging.info("Starting PR Processor")
    asyncio.run(PRProcessor().process())
    logging.info("Finished PR Processor")


if __name__ == "__main__":
    # Local execution doesn't require event or context
    handler(None, None)
