"Class to create the company profile"

import asyncio
import json
import logging
import os
import sys
import requests
from dotenv import load_dotenv
from langchain_core.documents import Document

sys.path.append(r".")
from configs.config import pic_file_extension
from services.company_profile import LLM_queries, pe_LLM_queries
from services.company_profile.data_classes.company_info import (
    CompanyInfo,
    FundInfo,
    Logo,
)
from services.company_profile.data_classes.llm_results import Leadership, LLMResults
from utils import load_sic_codes
from utils.chroma_db import ChromaDB
from utils.dynamo_db import DynamoDB
from utils.llm_company_profile import LLMChat
from utils.query.retriver import Compressor
from utils.search_google import GoogleSearch
from utils.selenium_driver_chrome import SeleniumDriverChrome
from utils.url_parser import parsed_url

load_dotenv()


OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
BRAND_FETCH_API = os.environ["BRAND_FETCH_API_KEY"]
sic_dict = load_sic_codes.load_sic_codes()
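# sic_dict maps SIC codes to {"industry": ..., "sector": ...}; consumed by product_to_sic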


class CompanyProfileMaker:
    "Create the company profile for a given company url"

    def __init__(
        self, company_url: str, stock_ticker: str = None, is_pe_shop: bool = False
    ) -> None:
        """Create the company profile for a given company url."""
        self.parsed_url = parsed_url(company_url)
        self.company_domain = self.parsed_url.domain
        self.company_url = self.parsed_url.url
        self.stock_ticker = stock_ticker
        self.pe_shop = is_pe_shop
        self.llm_chat = LLMChat()
        # Per-instance state; set here rather than as class attributes so the
        # mutable dict/list defaults are not shared between instances
        self.visited_websites = {}
        self.company_name = None
        self.logo = None
        self.company_linkedin = None
        # main() populates these
        self.llm_results = []
        self.industry = None
        self.sector = None
        self.leadership = None
        self.images = []

    async def main(self, sic_code=None):
        "Main function to create the company profile"

        logging.info("%s: Starting company profile creation", self.company_url)

        # Concurrently execute image retrieval, brand fetching, and SIC code mapping
        await asyncio.gather(
            self.get_images(), self.get_brand_from_url(), self.llm_runs()
        )

        # due to the reliance on LLM calls, these steps must run after llm_runs
        results_sic_leadership = await asyncio.gather(
            self.product_to_sic(sic_code), self.map_leadership()
        )

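        # asyncio.gather preserves argument order: index 0 is product_to_sic's
        # result, index 1 is map_leadership's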
        self.leadership = results_sic_leadership[1]

        # Concurrently upload data
        await asyncio.gather(self.upload_to_dynamodb(), self.upload_to_chromadb())

        logging.info("%s: Company profile created", self.company_url)

    async def llm_runs(self):
        "Run the llm query"
        llm_queries = (
            LLM_queries.queries if not self.pe_shop else pe_LLM_queries.queries
        )

        # Run all Google searches concurrently
        search_tasks = [
            self.search_google(llm_query, "search") for llm_query in llm_queries
        ]
        await asyncio.gather(*search_tasks)

        search_urls = await self.get_urls_from_results(llm_queries)

        # Scrape URLs concurrently
        url_text_map = await self.scrape_urls(search_urls)

        # Map each search result link to its scraped page text
        for llm_query in llm_queries:
            for search_result in llm_query["search_results"]:
                search_result["page_text"] = url_text_map.get(search_result["link"], "")

        # Ask LLM questions concurrently
        llm_queries = await asyncio.gather(
            *[self.ask_llm(query) for query in llm_queries]
        )

        for llm_query in llm_queries:
            llm_query.pop("search_results")

        self.llm_results = llm_queries

    async def get_urls_from_results(self, results):
        "Get the urls from the search results"
        return {url["link"] for result in results for url in result["search_results"]}

    async def search_google(self, llm_query, search_type):
        "Search google for a given query"
        search_results = await GoogleSearch().search_google(
            f"[{self.parsed_url.domain}] {llm_query['search']}",
            search_type,
            domain_check=llm_query.get("check_domain", False),
            domain_name_to_check=self.company_domain,
        )
        llm_query["search_results"] = search_results
        return llm_query

    async def ask_llm(self, llm_query: dict):
        "Ask the LLM the questions for the llm_query"
        if not any(
            search_result["page_text"] for search_result in llm_query["search_results"]
        ):
            if not llm_query.get("allow_empty", True):
                raise ValueError(
                    f"No search results for {self.company_url} - {llm_query['vector_db_query']}"
                )

        retriever = Compressor()
        # Create documents from search results
        documents = [
            Document(
                page_content=search_result["page_text"],
                metadata={
                    "source": search_result["link"],
                    "title": search_result["title"],
                },
            )
            for search_result in llm_query["search_results"]
            if search_result["page_text"]
        ]

        # Compress and retrieve relevant documents
        retriever.compress(documents)
        results = await retriever.aretrieve(llm_query["vector_db_query"], k=7)
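        # k=7: keep only the seven most relevant chunks for the prompt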

        # Prepare content for LLM request
        combined_content = "\n".join(doc.page_content for doc in results)

        # Request LLM response
        llm_response = await self.llm_chat.openai_request(
            combined_content, llm_query["LLM_query"]
        )

        llm_query["response"] = llm_response

        return llm_query

    def scrape_url(self, url_to_scrape):
        "Scrape data from a given url asynchronously"
        if url_to_scrape in self.visited_websites:
            return self.visited_websites[url_to_scrape]

        driver = SeleniumDriverChrome()
        page_text = driver.get_url_text(url_to_scrape)
        self.visited_websites[url_to_scrape] = page_text

        return page_text

    async def scrape_urls(self, urls_to_scrape):
        "Scrape data from a given url asynchronously"
        from utils.webscrape.crawl_pages import crawl_pages

        urls_to_scrape = set(urls_to_scrape)
        results = await crawl_pages(urls_to_scrape)

        return results

    async def get_brand_from_url(self):
        """
        Fetch brand information for the company url using the Brandfetch API
        and store it on the instance.

        Sets:
            company_name (str): The name of the company
            logo (Logo | None): The preferred logo (url, file extension, dark flag)
            company_linkedin (list): The linkedin url(s) of the company
        """

        request_header = {
            "accept": "application/json",
            "Authorization": "Bearer " + BRAND_FETCH_API,
        }

        request_url = (
            "https://api.brandfetch.io/v2/brands/" + self.company_url.split("//")[1]
        )
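        # split("//")[1] strips the scheme, leaving the bare domain for the Brandfetch endpoint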
        try:
            req = requests.get(request_url, headers=request_header, timeout=20)
            if req.status_code != 200:
                self.company_name = self.company_domain.split(".")[0].title()
                return None
        except requests.exceptions.ReadTimeout:
            # Fall back to a name derived from the domain, as for non-200 responses
            self.company_name = self.company_domain.split(".")[0].title()
            return None

        brand = req.json()

        self.company_name = brand.get("name", self.company_domain)
        self.logo = (
            await self.find_logo_from_logos(brand["logos"])
            if "logos" in brand
            else None
        )
        # .get avoids a KeyError when the response has no "links"
        self.company_linkedin = await self.find_linkedin_from_req(
            brand.get("links", [])
        )

        return None

    async def find_logo_from_logos(self, logos):
        "Find the best logo from the logos returned"
        if len(logos) == 0:
            logging.warning("No logo found")
            return None

        final_logo_url = ""
        final_logo_extension = 0
        final_logo_dark = False
        # Go through the array of logos, where each entry is a dictionary.
        # Lower scores win: the score is the file-type rank from
        # pic_file_extension, with a +10 penalty for non-dark themes so
        # dark logos are preferred
        for logo in logos:
            for f in logo["formats"]:
                logo_url = f["src"]
                logo_dark = logo["theme"] == "dark"
                logo_extension = pic_file_extension[f["format"]] + (
                    0 if logo_dark else 10
                )

                if len(final_logo_url) == 0 or logo_extension < final_logo_extension:
                    final_logo_url = logo_url
                    final_logo_extension = logo_extension
                    final_logo_dark = logo_dark

        return Logo(
            logo_url=final_logo_url,
            logo_extension=final_logo_extension,
            logo_dark=final_logo_dark,
        )

    async def find_linkedin_from_req(self, links):
        "Find the linkedin from the links returned"
        for link in links:
            if link["name"] == "linkedin":
                return [link["url"]]
        return []

    async def product_to_sic(self, sic_code=None, product_info=None):
        """Get the SIC code for the company,
        if running outside of the class, product_info is required
        """
        if self.pe_shop:
            return {
                "industry": "Fund",
                "sector": "Private Equity",
            }
        elif sic_code is None:
            product_info = (
                find_llm_response(self.llm_results, "Products")
                if product_info is None
                else product_info
            )
            sic_code = await self.llm_chat.product_to_sic(product_info)
        try:
            # int(float(...)) handles codes the LLM returns as strings like "7372.0"
            self.industry = sic_dict[int(float(sic_code))]["industry"]
            self.sector = sic_dict[int(float(sic_code))]["sector"]
        except (KeyError, ValueError, TypeError):  # unknown code or not numeric
            self.industry = None
            self.sector = None

        return {
            "industry": self.industry,
            "sector": self.sector,
        }

    async def get_images(self):
        "Get the images for the company"
        google_search = GoogleSearch()
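        # Positional args after the query: search type "images", domain_check
        # enabled for the company domain, and 50 (presumably the result cap)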
        search_results = await google_search.search_google(
            f"[{self.parsed_url.domain}] products | solutions | company",
            "images",
            True,
            self.company_domain,
            50,
        )

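        # Too few domain-restricted images: retry with the domain check disabled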
        if len(search_results) < 5:
            search_results = await google_search.search_google(
                f"[{self.parsed_url.domain}] products | solutions | company",
                "images",
                False,
                self.company_domain,
            )
            logging.warning(
                "%s: Not enough images found for %s",
                self.company_url,
                self.company_domain,
            )

        self.images = [
            {"count": 1000, "src": result["imageUrl"], "type": "img"}
            for result in search_results
        ]

    async def upload_to_dynamodb(self):
        "Upload the company profile to dynamodb"
        db = DynamoDB()

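        # PE shops are written to the fund table; other companies go to the
        # company and LLM tables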
        if self.pe_shop:
            llm_info = LLMResults(
                root_url=self.company_url,
                LLM_results=self.llm_results,
                # leadership=self.leadership if self.leadership is not None else None,
            )
            fund_info = FundInfo(
                root_url=self.company_url,
                company_name=self.company_name,
                images=self.images,
                linkedin=self.company_linkedin,
                logo=self.logo,
                business_model=None,
                industry=self.industry,
                sector=self.sector,
                stock_exchange=None,
                stock_ticker=self.stock_ticker,
                LLM_results=llm_info.LLM_results,
            )

            db.upload_to_dynamodb(db.fund_info_table, fund_info.model_dump())

        else:

            company_info = CompanyInfo(
                root_url=self.company_url,
                company_name=self.company_name,
                images=self.images,
                linkedin=self.company_linkedin,
                logo=self.logo,
                business_model=None,
                industry=self.industry,
                sector=self.sector,
                stock_exchange=None,
                stock_ticker=self.stock_ticker,
            )

            db.upload_to_dynamodb(db.company_info_table, company_info.model_dump())

            llm_info = LLMResults(
                root_url=self.company_url,
                LLM_results=self.llm_results,
                leadership=self.leadership,
            )

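            # Round-trip through JSON so nested models become plain dicts that
            # DynamoDB accepts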
            db.upload_to_dynamodb(db.llm_table, json.loads(llm_info.model_dump_json()))

            db.create_or_update_company_list(self.company_url, scrapped=True)

    async def upload_to_chromadb(self):
        "Upload the company profile to chromadb"

        chroma_db = ChromaDB()
        if self.pe_shop:  # if it is a PE shop
            chroma_item = {
                "root_url": self.company_url,
                # concatenate the Overview, Investment Thesis, and AUM
                # responses pulled from llm_results via find_llm_response
                "document": find_llm_response(self.llm_results, "Overview")
                + find_llm_response(self.llm_results, "Investment Thesis")
                + find_llm_response(self.llm_results, "AUM"),
                "industry": "Fund",
                "sector": "Private Equity",
                "public": False,
            }
            chroma_db.add_item(chroma_db.fund_collection, chroma_item, "document")

        else:
            chroma_item = {
                "root_url": self.company_url,
                # pull the relevant responses from llm_results via find_llm_response
                "product": find_llm_response(self.llm_results, "Products"),
                "overview": find_llm_response(self.llm_results, "Overview"),
                "industry": self.industry if self.industry is not None else "",
                "sector": self.sector if self.sector is not None else "",
                "public": self.stock_ticker is not None,
                "stock_ticker": self.stock_ticker,
            }

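            # Index the item twice: once embedded on "product", once on
            # "overview"; the third arg presumably names the field to embed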
            chroma_db.add_item(chroma_db.product_collection, chroma_item, "product")
            chroma_db.add_item(chroma_db.overview_collection, chroma_item, "overview")

    async def map_leadership(self, company_leadership: str = None):
        """
        Map the leadership to linkedin and convert to structured data.
        If running outside of the class, company_leadership is required.
        """

        if company_leadership is None:
            if self.llm_results is None:
                raise ValueError("LLM results are required to map leadership")

            company_leadership = find_llm_response(self.llm_results, "Leadership")

            if company_leadership is None or "not found" in company_leadership:
                # Short-circuit: nothing to map (assumes Leadership's other
                # fields are optional, matching the {"name": ...} placeholder)
                return {"leadership": [Leadership(name="not found")]}

        llm_chat = LLMChat()
        await asyncio.sleep(0.1)
        leadership = await llm_chat.map_leadership(company_leadership)
        if not leadership.get("leadership", False):
            # Retry once if the first pass returned no leadership entries
            leadership = await llm_chat.map_leadership(company_leadership)

        new_leadership = []

        for leader in leadership["leadership"]:
            new_leadership.append(Leadership(**leader))

        return {"leadership": new_leadership}

    def non_async_load_brand(self, supplied_company_info: CompanyInfo):
        """
        Synchronous wrapper around load_brand.
        Should be used only if the brand is not found in the company_info.
        The function needs a supplied company info input.
        """
        # get_brand_from_url is a coroutine, so it must be awaited; delegate
        # to the async implementation instead of duplicating it here
        return asyncio.run(self.load_brand(supplied_company_info))

    async def load_brand(self, supplied_company_info: CompanyInfo):
        """
        Should be used only if the brand is not found in the company_info.
        The function needs a supplied company info input.
        """

        await self.get_brand_from_url()
        supplied_company_info.logo = self.logo
        supplied_company_info.company_name = (
            self.company_name
            if self.company_name
            else supplied_company_info.company_name
        )
        # add the linkedin if it is not already in the company info
        if self.company_linkedin:
            if not supplied_company_info.linkedin:
                # company_linkedin is already a list, so copy it rather than nest it
                supplied_company_info.linkedin = list(self.company_linkedin)
            else:
                supplied_company_info.linkedin.extend(self.company_linkedin)

        # update the company info in the dynamodb
        supplied_company_info.update_modify()
        db = DynamoDB()
        db.upload_to_dynamodb(db.company_info_table, supplied_company_info.model_dump())

        return supplied_company_info


def find_llm_response(responses: list, category_to_find: str):
    "Get the specific item from the LLM responses, concatenating all matches"
    result_return = ""
    for response in responses:
        if response["category"] == category_to_find:
            result_return += response["response"] + "\n"

    return result_return


if __name__ == "__main__":
    import time

    start_time = time.time()
    for url in ["https://munichmotorsport.com/"]:
        COMPANY_STOCK_TICKER = None
        IS_PE_SHOP = False
        company_profile_maker = CompanyProfileMaker(
            url, stock_ticker=COMPANY_STOCK_TICKER, is_pe_shop=IS_PE_SHOP
        )

        asyncio.run(company_profile_maker.main())
        end_time = time.time()

        print(end_time - start_time)
