"research agents"

import sys

sys.path.append(r".")

import logging
import os
from concurrent import futures
from datetime import datetime

from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI

from services.ppt_generator.data_classes.project import Project, Slide
from utils import dynamo_db
from utils.researcher.compression import ContextCompressor
from utils.researcher.dataclass.research_llm_classes import (
    Questions, ResearchLLM, ResearchLLM_AnswerOnly)
from utils.search_google import GoogleSearch
from utils.selenium_driver_chrome import SeleniumDriverChrome


class Researcher:
    """LLM-driven web-research agent.

    Decomposes a parent question (or a presentation slide) into sub-questions,
    answers each by Google-searching, scraping the hits, compressing the page
    text, and summarizing it with an LLM, then joins the per-question
    summaries into a single research text.
    """

    # Low-temperature model for factual summarization and question rewriting.
    llm35 = ChatOpenAI(
        api_key=os.getenv("OPENAI_API_KEY"), model="gpt-3.5-turbo", temperature=0.1
    )

    # Slightly higher temperature for sub-question generation.
    llm35_sub_query = ChatOpenAI(
        api_key=os.getenv("OPENAI_API_KEY"), model="gpt-3.5-turbo", temperature=0.3
    )

    llm4_sub_query = ChatOpenAI(
        api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4-turbo", temperature=0.3
    )

    # Thread-pool size for parallel scraping and follow-up research.
    max_workers = 5

    # Cap on generated sub-questions / follow-up questions per query.
    max_number_of_questions = 5

    # Target token budget for each LLM summary.
    summary_token_limit = 1000
    max_follow_up_level = 0  # this is the depth you should go down

    COT_sub_instructions = """The questions should be chain of thought questions that can help provide context to a large-language model.\
Such as:
Parent question: What is the market share of Company X?
Sub-questions: what is market does Company X operate in? What is the revenue of Company X? What is the market size of the market Company X operates in?\

Parent question: What trends and risks are faced by companies in the automotive industry (specific industry)?
Sub-questions: Who are the public companies in the space? For each of those companies, what are the trends and risks they are facing?\
            """

    # Class-level URL -> scraped-page cache, shared across all instances.
    # NOTE(review): written from worker threads without a lock; CPython dict
    # assignment is GIL-atomic, but confirm before porting off CPython.
    links_researched = {}

    def __init__(
        self, slide: Slide = None, parent_question: str = None, primary_research=None
    ):
        """Configure the researcher for either a slide or a free-form question.

        Args:
            slide: slide to research; its questions become the parent question.
            parent_question: free-form question (used when no slide is given).
            primary_research: pre-existing research text injected into prompts.
        """
        self.slide = slide
        self.question = parent_question
        if self.slide:
            # Slide mode: sub-questions are derived from the slide contents.
            self.sub_query = self._get_slide_sub_questions
            self.sub_query_input = self.slide
            self.question = "\n".join(slide.questions)
        elif self.question:
            # Question mode: sub-questions come from the parent question alone.
            self.sub_query = self._get_parent_sub_questions
            self.sub_query_input = self.question
        self.primary_research = primary_research if primary_research else ""

    def researcher(self):
        """Run the full research loop and return the joined research text."""
        print("Starting researching")
        logging.info("Starting researching")
        sub_questions = Questions(**self.sub_query(self.sub_query_input))

        # Answer the parent question first, then each sub-question; earlier
        # answers are fed forward as context for the later ones.
        results = [
            self.search_scrape_summarize(self.question, prev_question_results=[])
        ]
        for question in sub_questions.questions:
            results.append(
                self.search_scrape_summarize(question, prev_question_results=results)
            )

        logging.info("ending researching")
        print("ending researching")

        # Join "<query>\n<summary>" chunks into one research document.
        return "\n\n\n".join(
            [entry["query"] + "\n" + entry["summary"] for entry in results]
        )

    def update_question(self, question: str, prev_question_results: list):
        """Rewrite `question` using earlier results, aiming at the parent question.

        Returns the (possibly unchanged) question text from the LLM.
        """

        chat_prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """You are an expert asking intelligent questions to Google.
                You will be provided a question. Update the question if needed with the results provided from the previous question.
                Your goal is to answer the parent question.
            """,
                ),
                ("human", "The parent question is: {parent_question}"),
                ("human", "The question is: {question}"),
                ("human", "The previous question results are: {prev_question_results}"),
                (
                    "system",
                    "Return the updated question only. Only update the question if you need to update it.",
                ),
            ]
        )

        chain = chat_prompt | self.llm35

        chain_output = chain.invoke(
            {
                "parent_question": self.question,
                "question": question,
                "prev_question_results": "\n".join(
                    [a["summary"] for a in prev_question_results]
                ),
            }
        )

        return chain_output.content

    def summarize_research(self, research_text):
        """Condense per-question research dicts into one answer dict.

        Returns the parsed LLM answer with "sub_questions" and "links" added.
        NOTE(review): currently unused by `researcher` (call is commented out);
        "links" is a `set`, which is not JSON-serializable — confirm consumers.
        """

        chat_prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                You are a research analyst. With the provided context, answer the question. Keep as much of the information as possible.
                You strive to provide an answer backed with data and quantifiable data-points.
                You are providing factual answers, not opinions.
                Do not provide any additional questions or follow-up questions.""",
                ),
                ("human", "The question is: {question}"),
                ("human", "The context is: ** \n{context} \n **"),
                (
                    "human",
                    "Provide a quantitative and verbose answer at least a minimum of 1000 tokens",
                ),
                ("human", "JSON output: {json_output}"),
            ]
        )

        parser = JsonOutputParser(pydantic_object=ResearchLLM_AnswerOnly)

        chain = chat_prompt | self.llm35 | parser

        chain_output = chain.invoke(
            {
                "question": self.question,
                "context": "\n".join([summary["summary"] for summary in research_text]),
                "json_output": parser.get_format_instructions(),
            }
        )

        # Collect the unique set of links that contributed to the research.
        links_researched = set()
        for summary in research_text:
            for link in summary["search_results_links"]:
                links_researched.add(link)

        chain_output["sub_questions"] = "\n".join(
            [summary["query"] for summary in research_text]
        )
        chain_output["links"] = links_researched

        return chain_output

    def _get_parent_sub_questions(self, question: str):
        """Ask the LLM for up to `max_number_of_questions` sub-questions.

        Returns the parsed `Questions`-shaped dict for the parent question.
        """

        chat_prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    f"""\
                You are an expert researcher skilled in finding information and reasoning.\
                You strive to provide data-points and quantitative answers. For any market or industry questions, you need to go higher level and then dig deep.
                You are not providing an opinion but rather a factual answer and current analysis.\
                {self.COT_sub_instructions}
                """,
                ),
                ("human", "The tasks you are getting an answer for is: {question}"),
                (
                    "system",
                    """\
                Give back a list of {number_of_questions} sub-questions which can help provide context to a large-language model. Each question should only cover one topic.
                Use today's date if needed {today_date}
                Provide your answer in a list format in such as JSON: {json_output}
                """,
                ),
            ]
        )

        parser = JsonOutputParser(pydantic_object=Questions)

        chain = chat_prompt | self.llm4_sub_query | parser

        # The template only uses these keys (a stray unused "questions" key
        # was removed).
        chain_output = chain.invoke(
            {
                "question": question,
                "today_date": datetime.now().strftime("%Y-%m-%d"),
                "number_of_questions": self.max_number_of_questions,
                "json_output": parser.get_format_instructions(),
            }
        )

        return chain_output

    def _get_slide_sub_questions(self, slide: Slide):
        """Ask the LLM for sub-questions tailored to a deck slide.

        Returns the parsed `Questions`-shaped dict for the slide.
        """

        # COT_sub_instructions is embedded once in the first system message;
        # a duplicated trailing system message was removed.
        chat_prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    f"""\
                You are an expert researcher skilled in finding information and reasoning.\
                You strive to provide data-points and quantitative answers. For any market or industry questions, you need to go higher level and then dig deep.
                You are not providing an opinion but rather a factual answer and current analysis.\
                {self.COT_sub_instructions}
                """,
                ),
                (
                    "system",
                    """\
                You are building a slide for an investment banking deck.\
                """,
                ),
                ("human", "The slide title is {slide_title}"),
                (
                    "human",
                    "The reader should have this takeaway for the slide: {slide_content}",
                ),
                ("human", "The data required for the slides: {data_required}"),
                (
                    "human",
                    "The information we have for the slide already is: {slide_primary_research}",
                ),
                ("human", "Your focus is to answer these questions: {questions}"),
                (
                    "system",
                    """\
                Give back a list of {number_of_questions} sub-questions which can help provide context to a large-language model. Each question should only cover one topic.
                Use today's date if needed {today_date}
                Provide your answer in a list format in such as JSON: {json_output}
                """,
                ),
            ]
        )

        parser = JsonOutputParser(pydantic_object=Questions)

        chain = chat_prompt | self.llm4_sub_query | parser

        chain_output = chain.invoke(
            {
                "slide_title": slide.title,
                "slide_content": slide.content,
                "data_required": slide.data,
                "questions": "\n".join(slide.questions),
                "today_date": datetime.now().strftime("%Y-%m-%d"),
                "number_of_questions": self.max_number_of_questions,
                "json_output": parser.get_format_instructions(),
                "slide_primary_research": "\n".join(
                    [a["summary"] for a in slide.primary_research_text]
                ),
            }
        )

        return chain_output

    def search_scrape_summarize(
        self, query, follow_up_level=0, prev_question_results=""
    ):
        """Search Google for `query`, scrape the hits, and summarize them.

        Args:
            query: Google-searchable question.
            follow_up_level: current recursion depth for follow-up questions.
            prev_question_results: earlier result dicts used as extra context
                (the legacy "" default is treated as an empty list).

        Returns:
            dict with "query", "summary", and "search_results_links".
        """
        logging.info("Searching for: %s", query)

        search = GoogleSearch()
        search_results = search._search_google(
            query=query,
            num_of_results=7,
            type_of_search="search",
        )

        # Scrape all result pages in parallel.
        with futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            search_results_text = list(executor.map(self._scrape_url, search_results))

        cc = ContextCompressor(
            documents=search_results_text,
            max_results=7,
        )
        compressed_context = cc.get_context(query=query)

        summary = self._summarize_results(
            query, compressed_context, follow_up_level, prev_question_results
        )["answer"]

        return {
            "query": query,
            "summary": summary,
            "search_results_links": [result["link"] for result in search_results],
        }

    def _summarize_results(
        self, query, compressed_context, follow_up_level=0, prev_question_results=""
    ):
        """Summarize `compressed_context` for `query`; recurse on follow-ups.

        Returns the parsed `ResearchLLM` dict ("answer" plus "questions").
        """
        # Normalize the legacy "" default so list operations below are safe.
        prev_question_results = prev_question_results or []

        research_parser = JsonOutputParser(pydantic_object=ResearchLLM)

        # (An accidental empty system message was removed from this prompt.)
        chat_template = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """You are a research analyst. Answer the question with the context provided and provided all the details supporting it. Get as close as possible to {summary_token_limit} tokens.
                    You are trained to provide an answer backed with data and quantifiable data-points when answering the question.
                    Your answer needs to be factual data-points and concise. Do not include any opinions.
                    Try to not ask follow-up questions, but if you need more information, you can ask for it in the form of a google searchable query. You have a limit of {max_questions} questions.
                    You MUST include all relevant source urls in your answer.
                    """,
                ),
                ("human", "The question is: {question}"),
                ("human", "The context is: {context}"),
                (
                    "human",
                    "Return a summary of the context and return a verbose answer and the follow-up questions. JSON output: {json_output}",
                ),
            ]
        )

        chain = chat_template | self.llm35 | research_parser

        chain_output = chain.invoke(
            {
                "context": "Past results"
                + "\n".join([result["summary"] for result in prev_question_results])
                + "\n\n\n New Context: \n\n\n"
                + compressed_context,
                "question": query,
                "summary_token_limit": self.summary_token_limit,
                "json_output": research_parser.get_format_instructions(),
                "max_questions": self.max_number_of_questions,
            }
        )

        # Recurse on follow-up questions only while under the depth limit.
        if (
            len(chain_output["questions"]["questions"]) > 0
            and follow_up_level < self.max_follow_up_level
        ):
            questions = chain_output["questions"]["questions"]
            with futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                follow_up_results = list(
                    executor.map(
                        self.search_scrape_summarize,
                        questions,
                        [follow_up_level + 1] * len(questions),
                        [prev_question_results] * len(questions),
                    )
                )
            # BUG FIX: the updated answer was previously computed and then
            # discarded, so follow-up research never affected the result.
            updated = self._update_answer_with_follow_up(
                parent_query=query,
                answer_to_update=chain_output["answer"],
                new_information=[result["summary"] for result in follow_up_results],
            )
            chain_output["answer"] = updated.get("answer", chain_output["answer"])

        return chain_output

    def _scrape_url(self, result):
        """Scrape one search-result page, using the class-level URL cache.

        Args:
            result: dict with "link" and "title" keys from the Google search.

        Returns:
            dict with "page_text", "title", and "link".
        """
        # Return the cached page if this link was already scraped.
        if result["link"] in self.links_researched:
            return self.links_researched[result["link"]]

        logging.info("Scraping: %s", result["link"])
        sel_driver = SeleniumDriverChrome()
        sel_results = sel_driver.get_url_text(result["link"])
        page_text = "\n".join(sel_results)

        return_data = {
            "page_text": page_text,
            "title": result["title"],
            "link": result["link"],
        }
        # Cache for later calls (shared across instances).
        self.links_researched[result["link"]] = return_data

        return return_data

    def _update_answer_with_follow_up(
        self, parent_query: str, answer_to_update: str, new_information: list[str]
    ):
        """Merge follow-up findings into an existing answer via the LLM.

        Returns the parsed `ResearchLLM` dict with the updated "answer".
        """

        chat_prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """You are a research analyst. Update the summary with context to get the answer to the parent question.
                    You are trained to provide an answer backed with data and quantifiable data-points when answering the question.
                    Think through the question step by step. Examples:
                    1. Market share analysis: If you have data about the entire market, you can start by finding the market size, then take the company's revenue to find the market share.
                    1.1. If you have competitors data, you can compare the company's revenue to the competitors to find the market share.
                    2. Competitor analysis: find the competitors' strengths and weaknesses, then compare them to the company.
                    3. Industry analysis: size of the industry, find the industry growth rates and trends, then compare the company to the industry.
                    Try to not ask follow-up questions, but if you need more information, you can ask for it in the form of a google searchable query. You have a limit of {max_questions} questions.""",
                ),
                (
                    "system",
                    """Limit your answer to {summary_token_limit} tokens. Only return the answer JSON format like: {json_output}.
                    Your answer needs to be factual and if possible include data-points. If the data-point is associated with a timeline, show that. Do not include any opinions or additional information.
                    """,
                ),
                (
                    "system",
                    """You MUST include all relevant source urls in your answer. Every url should be hyperlinked: [url website](url)""",
                ),
                ("human", "The parent question is: {question}"),
                (
                    "human",
                    f"This is information provided other research: {self.primary_research}",
                ),
                ("human", "The initial answer to be updated is: {initial_answer}"),
                ("human", "The new information to be added: {new_information}"),
            ]
        )

        parser = JsonOutputParser(pydantic_object=ResearchLLM)

        chain = chat_prompt | self.llm35 | parser
        chain_output = chain.invoke(
            {
                "question": parent_query,
                "initial_answer": answer_to_update,
                "new_information": "\n".join(new_information),
                "summary_token_limit": self.summary_token_limit,
                "max_questions": self.max_number_of_questions,
                "json_output": parser.get_format_instructions(),
            }
        )

        return chain_output


# TODO: create a tool callable by an LLM that takes two dictionaries and a CAGR
# and returns the future value of the first dictionary based on that CAGR.


if __name__ == "__main__":
    "main function"
    import json

    # from utils import dynamo_db
    # from services.ppt_generator.data_classes.project import Project
    # db = dynamo_db.DynamoDB()
    # project = db.get_item(db.projects, "sunbelt_demo_1")
    # project = Project(**project)
    # r = Researcher(project.sections[0].slides[0])
    # r = Researcher(
    #     parent_question="Who are the public competitors of Windowcrafters from Tucker Georgia? Once you have them, provide the stock tickers for them."
    # )
    r = Researcher(
        parent_question="What are the growth trends identified by Jeld-Wen - JELD? Give back a verbose answer with detailed data-points."
    )
    results = r.researcher()

    print(results)
