o
    gY                     @   s  d dl Z e jd d dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- e. Z/e0dZ1ej2d Z3dZ4dZ5dZ6dZ7g dZ8G dd deZ9G dd deZ:G dd deZ:dd d!gd"d#ee9d$d%dd d!gddddZ;G d&d' d'Z<d(e=d)e=d*ed+e"fd,d-Z>	.d@d/e=d0e"d1e?d2e	e=ef fd3d4Z@d5e
e= d(e=d*ed2e	e=ef fd6d7ZAd5e
e= d(e=d*ed+e"d2df
d8d9ZBeCd:krAd;ZDe"d<E ZFe@eDeFd=d> ej2d Z3eFj0 d?eD d?ZGdS dS )A    N.)ThreadPoolExecutor)DictListOptionalUnionAny)ChatPromptTemplate)%SentenceTransformersTokenTextSplitter)JsonOutputParser)
ChatOpenAI)	BaseModelField)	OPENAI_EFOPENAI_MODEL_MINI)Project)ChromaDB)ClientConfig)LoadPDF)LoadWordDoc)DynamoDB)SitemapScrape)S3BucketStorageget_default_bucket_names3xcap_s3_storage).pdf.docxz.pptxi  皙?z_+_)comapny_infocompany_financialindustryotherwebsitec                   @   sN   e Zd ZU dZedddZeed< edddZeed< edddZ	eed	< d
S )FinancialInfoz)Model for financial document information..zDocument Typetitledocument_typezStart Date YYYY-MM-DD
start_datezEnd Date YYYY-MM-DDend_dateN)
__name__
__module____qualname____doc__r   r'   str__annotations__r(   r)    r0   r0   =/var/www/html/XCapMarket/utils/document_loader/DocUploader.pyr$   /   s
   
 r$   c                   @   &   e Zd ZU dZedddZeed< dS DocClassificationz"Model for document classification..zClassification of the documentr%   classificationNr*   r+   r,   r-   r   r5   r.   r/   r0   r0   r0   r1   r4   7      
 r4   c                   @   r2   r3   r6   r0   r0   r0   r1   r4   >   r7   z'What is the legal name of the business?company_alt_names)question	attributeaH  What type of financial document is this: tax, income_statement, balance_sheet, cash_flow, Other?                     If other, explain it. It can be multiple typesWhat is the date range of the document? Start date and end date                     return a JSON format: document_type, start_date: YYYY-MM-DD, end_date: YYYY-MM-DDcompany_financialspydantic_object)r9   r:   output_parserc                   @   s\  e Zd ZdZ	d,dededee fddZdedee	e
eef   fd	d
ZdedefddZde	e fddZ				d-de
eef dee dee dee dee de
eef fddZd.dededefddZedd Zd/ddZd e
eef ddfd!d"Zd#e	e defd$d%Zd#ee	e ef dee	e ef fd&d'Zd/d(d)Zd/d*d+ZdS )0DocUploaderz2Upload and process documents to various databases.Nprojectclientdoc_pathc                 C   s   t ttjd dd| _t| _t| jt | _	t
 | _d| _|| _|| _d| _g | _d| _g | _|| _d| _d| _d| _d| _d| _d| _d| _d| _|rQ| jj| _|ritj|| _| | j| _| || _dS dS )zInitialize the DocUploader.

        Args:
            project: Project object containing project information
            client: ClientConfig object with client-specific settings
            doc_path: Path to the document to be processed
        OPENAI_API_KEYr   )modelapi_keytemperatureTN )r   r   osenvironllm35DEFAULT_CHUNK_SIZE
chunk_sizeintDEFAULT_CHUNK_OVERLAP_RATIOchunk_overlapr   db	supportedr@   rA   customer_chroma_collectionchunksdoc_summarychunks_summary	file_pathdoc_type	file_namedoc_classificationdocument_text_chunked	chroma_dbpathbasenameclassify_documentget_document_text)selfr@   rA   rB   r0   r0   r1   __init__b   s>   
zDocUploader.__init__returnc                 C   s|   |  }|drt|}d| _|j	S |ds|dr)t|| j}d| _|j	S d| _td|  td|  dS )	zGet the document text in chunks.

        Args:
            doc_path: Path to the document

        Returns:
            List of document chunks or None if document type is not supported
        r   pdf_docr   z.docword_docFzUnsupported document type: N)
lowerendswithr   rW   r   r@   rQ   
XCM_loggerwarningrZ   )r`   rB   lower_doc_path
doc_loaderr0   r0   r1   r_      s   	

zDocUploader.get_document_textfilenamec                 C   sR   t g d}ttd}|| jB |B }|dtdt|| d}|d S )zClassify the document based on its filename.

        Args:
            filename: Name of the document file

        Returns:
            Classification of the document
        ))systemz2You are a financial analyst able to classify items)rl   zGYou will choose between the following classes: {classification_choices})rl   au  
            Here are some examples of the classification:
            "TTM through March 2024" -> "company_financial"
            "questionnaire" -> "company_info"
            "23611D Remodeling in the US Industry Report -IBIS" -> "industry"
            "2023 P and L-Bal Sheet" -> "company_financial"
            "2024 YTD comparison"" -> "company_financial"
            )humanz%The document file name is: {filename})rm   zJReturn the answer in JSON format as only the classification: {JSON_format}r<   
)classification_choicesro   rk   JSON_formatr5   )	r	   from_messagesr   r4   rJ   invokejoinDOCUMENT_CLASSIFICATIONSget_format_instructions)r`   rk   promptparserchainchain_outputr0   r0   r1   r^      s   	
	zDocUploader.classify_documentc                 C   s   t  }|jj| jtd}g }d}| jdu rg S g }g }g }| jD ]S}t| j| jd}	|		|d }
t
|
D ]-\}}| ||}| |}d|v rN|d |d< || || || || q5td|t| j| j |d7 }q |j|||d |S )	z_Upload the document to ChromaDB.

        Returns:
            List of document chunks
        embedding_function   N)rL   rO   text
table_namez.Uploaded chunk %s of %s 
 File: %s to ChromaDBids	documents	metadatas)r   chroma_clientget_or_create_collectionrR   r   rZ   r
   rL   rO   
split_text	enumerate	unique_idstandardize_metadataappendrg   infolenrX   upsert)r`   chromadb
collectionrS   chunk_number
unique_idsdocuments_chunksr   chunksplittertext_chunksi
text_chunkr   metadatar0   r0   r1   upload_to_chromadb   sL   






zDocUploader.upload_to_chromadbr   rX   rW   rY   
chunk_typec                 C   sN   |r|n| j }|r|n| j}|r|n| j}|r|n|d }| jj||||dS )aV  Standardize the metadata for document chunks.

        Args:
            chunk: Document chunk
            file_name: Name of the file
            doc_type: Type of document
            doc_classification: Classification of document
            chunk_type: Type of chunk

        Returns:
            Standardized metadata dictionary
        type)
project_idrX   r'   rY   r   )rX   rW   rY   r@   r   )r`   r   rX   rW   rY   r   r0   r0   r1   r     s   z DocUploader.standardize_metadatar   r   r   c                 C   s    t | jj| jt|t|gS )zCreate a unique ID for the document chunk.

        Args:
            chunk_number: Number of the chunk
            i: Index within the chunk

        Returns:
            Unique ID string
        )DELIMITER_TEXTrs   r@   r   rX   r.   )r`   r   r   r0   r0   r1   r   3  s   
zDocUploader.unique_idc                 C   s   d dd | D S )z5Remove non-printable characters from the text fields.rG   c                 s   s    | ]	}|  r|V  qd S N)isprintable).0cr0   r0   r1   	<genexpr>D  s    z-DocUploader._sanitize_text.<locals>.<genexpr>)rs   )r}   r0   r0   r1   _sanitize_textA  s   zDocUploader._sanitize_textc                    s   t d jj t jjddd} fdd|j D }t }|D ].}ddl}|	d  fd	d
| D }|
 jj| t d|d  t d|d  q" j
 jj jjdd|jd  |j dS )z,Upload the website to ChromaDB and DynamoDB.z Uploading website %s to ChromaDBT)	summarizeexclude_blogsc              
      s6   g | ]\}} j j| d |d |d dqS )rn   r}   summary)r   urlr}   r   )r@   r   r   rs   )r   r   itemr`   r0   r1   
<listcomp>M  s    z1DocUploader.vectorize_website.<locals>.<listcomp>r   Ng      ?c                    s*   i | ]\}}|t |tr |n|qS r0   )
isinstancer.   r   )r   kvr   r0   r1   
<dictcomp>\  s    z1DocUploader.vectorize_website.<locals>.<dictcomp>zUploaded website %s to DynamoDBr   images)r   r   r}   r   )rg   r   r@   company_urlr   urls_scrapeditemsr   timesleepupload_to_dynamodbrP   	web_pagesr   
image_urlsupload_website_to_vectorDB)r`   scrapeitems_to_uploadrP   r   r   
item_cleanr0   r   r1   vectorize_websiteF  s4   




	zDocUploader.vectorize_websitewebsite_scrapedc                 C   s   t  }|jj| jtd}d| _g }g }g }| D ]1\}}|d du s)|d dkr*q| |}	| j||dddd}
|	|	 |	|
 |	|d  q|j
|||d	 dS )
zzUpload the website to VectorDB.

        Args:
            website_scraped: Dictionary of scraped website content
        rz   Websiter   NrG   r#   r}   )rW   rY   r   r   )r   r   r   rR   r   rX   r   r   r   r   r   )r`   r   r   r   r   r   document_textr   r   r   r   r0   r0   r1   r   o  s6   



z&DocUploader.upload_website_to_vectorDBrS   c                    sV   |  |I dH }t|dkr|d }nd|}|  |I dH }|| _|| _||fS )zSummarize the document chunks.

        Args:
            chunks: List of document chunks

        Returns:
            Tuple of (chunks_summary, doc_summary)
        Nr|   r   rn   )summarize_chunkr   rs   rT   rU   )r`   rS   rU   rT   r0   r0   r1   summarize_chunks  s   	

zDocUploader.summarize_chunksc                    sn   t g d}|| jB }t|tr(|dd |D I dH }dd |D }|S |d|iI dH }|j}|S )zSummarize a chunk or list of chunks.

        Args:
            chunks: Single chunk or list of chunks to summarize

        Returns:
            Summarized content
        ))rl   za You are skilled investment banker and expert in summarizing information as a financial analyst. )rl   zSummarize the following text:)rm   zThe following text:)rm   z{chunk}c                 S   s   g | ]}d |iqS )r   r0   )r   r   r0   r0   r1   r     s    z/DocUploader.summarize_chunk.<locals>.<listcomp>Nc                 S   s   g | ]}|j qS r0   )content)r   chunk_outputr0   r0   r1   r     s    r   )r	   rq   rJ   r   listabatchainvoker   )r`   rS   rv   rx   ry   outputr0   r0   r1   r     s   

zDocUploader.summarize_chunkc                 C   s<   t  }||j| jj| j| j| j| j| j	| j
| jd dS )z Upload the document to DynamoDB.)r   rB   rY   rX   rW   rT   rU   rS   N)r   r   project_docsr@   r   rV   rY   rX   rW   rT   rU   rS   )r`   rP   r0   r0   r1   r     s   zDocUploader.upload_to_dynamodbc           
   
   C   st  | j du s
| j tvrdS t| j du rdS t| j D ]}g d}d}|ddur@|d t|}|| jB |dB }n
t|}|| jB }tdt	| j
dD ]B}| j
||d  }d|}|ddur|||d	 |||d d
}n||d	 ||d}t|dddu r|}qS|j}qSt| j|d }	|	du rg }	|	| t| j|d |	 q| j  dS )z-Process follow-up questions for the document.N))rl   zYYou are a skilled knowledge worker. Answer the questions asked and return only the answer)rm   zThe questions is {question})rm   zBYou have access to the following chunk of the document: {document})rm   z]You have access to the previous answer to the question as well: {previous_answer}. Update it.rG   r>   )rm   z>Your output should be in the following format: {output_format}r      z

r9   )r9   documentprevious_answeroutput_format)r9   r   r   r   r:   )rY   rt   follow_up_questions_to_askgetr   r	   rq   rJ   ranger   rS   rs   rr   ru   getattrr   r@   setattrupdate_project_in_db)
r`   r9   prompt_messagesr   rv   rx   r   r   ry   attribute_to_updater0   r0   r1   follow_up_questions  s^   







zDocUploader.follow_up_questionsr   )NNNN)r   )rb   N)r*   r+   r,   r-   r   r   r   r.   ra   r   r   r   r_   r^   r   r   rM   r   staticmethodr   r   r   tupler   r   r   r   r   r0   r0   r0   r1   r?   _   sV    
"/18


#

))

"r?   bucket_namerB   r@   client_configc                 C   s   zB	 t d| d d|  d| }t|||}td|j  | |_|  t	|
|j |  td| d W d S  tyV   tjd| d	d
 Y d S w )Nz
Uploading z to ChromaDBzs3:///zDocument classification: z	Uploaded z to DynamoDBzError processing document: T)exc_info)printr?   rg   r   rY   r   rS   r   asynciorunr   r   	Exceptionerror)r   rB   r@   r   doc_uploaderr0   r0   r1   r   8  s   
r   Fr   rA   process_on_threadsrb   c                 C   s`   |}|j  d|  d}tj| d}tt d}|t|}|r*t|t|| ddiS t|t|S )a  Upload documents to ChromaDB.

    Args:
        project_id: ID of the project
        client: ClientConfig object
        process_on_threads: Whether to process documents on threads

    Returns:
        Dictionary with task information or success message
    r   )r   )r   messagezFiles uploaded successfully.)	rA   r   check_project_in_dbr   r   list_files_in_folderr   process_docs_on_threadsdoc_upload_create_task)r   rA   r   r   folder_namer@   
s3_storagefilesr0   r0   r1   doc_uploader_callL  s   r   r   c           	      C   s^   ddl m} ddl m} g }| D ]}||j|||j}|| q||j|j}||dS )a  Create Celery tasks for document uploading.

    Args:
        files: List of file paths
        bucket_name: Name of the S3 bucket
        project: Project object
        client_config: ClientConfig object

    Returns:
        Dictionary with task IDs
    r   )process_uploaded_doc)r   )
file_tasksr#   )tasks.presentation_tasksr   r   delayr   rA   r   )	r   r   r@   r   celery_vectorize_websiter   file	file_taskwebsite_vector_taskr0   r0   r1   r   h  s   
r   c                    sh   t tdt d d fdd| D }|D ]}|  qW d   dS 1 s-w   Y  dS )zProcess documents on threads for concurrent uploads.

    Args:
        files: List of file paths
        bucket_name: Name of the S3 bucket
        project: Project object
        client_config: ClientConfig object
    r|   r   )max_workersc              	      s   g | ]} t |qS r0   )submitr   )r   rB   r   r   executorr@   r0   r1   r     s    z+process_docs_on_threads.<locals>.<listcomp>N)r   minrH   	cpu_countresult)r   r   r@   r   futuresfuturer0   r   r1   r     s   
"r   __main__z8east-cobb-lifestyle-8318100a-2160-4435-bdf9-ab16b055e2cesunbeltT)r   r   )F)Hsysr\   r   r   loggingrH   concurrent.futuresr   typingr   r   r   r   r   boto3langchain.promptsr	   langchain.text_splitterr
   langchain_core.output_parsersr   langchain_openair   pydanticr   r   configs.configr   r   +services.ppt_generator.data_classes.projectr   utils.chroma_dbr   utils.client_checkr    utils.document_loader.pdf_loaderr   !utils.document_loader.word_loaderr   utils.dynamo_dbr   utils.webscrape.sitemap_scraper   utils.s3_storager   r   	getLoggerrg   rA   r   rI   r   VALID_DOCUMENT_EXTENSIONSrK   rN   r   rt   r$   r4   r   r?   r.   r   boolr   r   r   r*   r   get_client_configr   r   r0   r0   r0   r1   <module>   s    

	
   \




 


