# import sys

# sys.path.append(
#     r"."
# )

import os
import re

from langchain_community.document_loaders import AmazonTextractPDFLoader

from services.ppt_generator.data_classes.project import Project
# from utils.selenium_driver_chrome import SeleniumDriverChrome
from utils.client_check import ClientConfig

# We loop through the pages returned by the loader and extract the text of each.


class LoadPDF:
    """Load a PDF with ``AmazonTextractPDFLoader`` and extract its text.

    The document is loaded eagerly: constructing an instance immediately
    calls :meth:`load_doc` and stores the result.

    Attributes:
        doc_path: Location of the PDF, passed unchanged to
            ``AmazonTextractPDFLoader``.
        file_name: Base name of ``doc_path`` (``os.path.basename``).
        delimiter_text: The string ``"_+_"``; not used inside this class.
        document_text_chunked: Value returned by :meth:`load_doc` at
            construction time.
    """

    def __init__(self, doc_path: str):
        """Store the path details and load the document right away.

        Args:
            doc_path: Location of the PDF to load.
        """
        self.doc_path = doc_path
        self.file_name = os.path.basename(doc_path)
        self.delimiter_text = "_+_"
        # Eager load: building the object already performs the extraction.
        self.document_text_chunked = self.load_doc()

    @staticmethod
    def _collapse_newlines(text: str) -> str:
        """Return *text* with each run of consecutive newlines cut to one."""
        # The previous split/join on "\n" rebuilt the same string and removed
        # nothing; a regex over newline runs does the intended clean-up.
        return re.sub(r"\n+", "\n", text)

    def load_doc(self) -> list:
        """Load the document and return one text chunk per loaded page.

        Returns:
            A list of dicts, one per item returned by the loader, each of
            the form ``{"text": <page text>, "type": "text"}`` where the
            page text has repeated newlines collapsed to a single one.
        """
        loader = AmazonTextractPDFLoader(self.doc_path)
        return [
            {"text": self._collapse_newlines(page.page_content), "type": "text"}
            for page in loader.load()
        ]


if __name__ == "__main__":
    # Manual smoke run: load a sample PDF and print the extracted chunks.
    doc_path = "s3://xcap-bucket/Sunbelt/Stairworx/2024 YTD comparison.pdf"
    doc_loader = LoadPDF(doc_path)
    # The constructor has already loaded the document; reuse that result
    # instead of calling load_doc() again and repeating the extraction.
    text_chunks = doc_loader.document_text_chunked
    print(text_chunks)
