o
    sg                  
   @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ G d	d
 d
ZedkrddlZejd ddlZddlmZ ddlmZ ddlmZ ddlmZ dedefddZdZdZ ee j!Z!ej"d Z#e!j  de dZ$e Z%e%&e%j'eZ(ed(i e(Z(ee#e$Z)g Z*e)dd D ]2Z+e,de+ d de+ Z+ee(e!e+Z-e,e-j. e*ed/dd e-j0D e+ee e-j.d d! qed"d#Z1e1j2e*d$d% d&Z3e14e3Z5e,d'd e5D  dS dS ))z*this is a compression and retrieval module    N)EnsembleRetriever)TokenTextSplitter)Chroma)BM25Retriever)Document)OpenAIEmbeddingsc                   @   s   e Zd ZdZddedefddZ		dd	eeef d
edefddZ	dd Z
ddededefddZddededefddZdS )
CompressorzCompressor classdefaultNcollection_namepersist_directoryc                 C   sP   || _ ttdd| _t| _| j rt| j|| j d| _dS t| j|d| _dS )zO
        Class to compress documents to a contextual retriever format.
        OPENAI_API_KEY)openai_api_key)embedding_functionr
   r   )r   r
   N)	r   r   osgetenv
embeddingsr   bm25_retrieverr   vectorstore)selfr
   r    r   0/var/www/html/XCapMarket/utils/query/retriver.py__init__   s   zCompressor.__init__  	documents
chunk_sizechunk_overlapc                 K   sp   |  |}|du rt|d }|ddr|d ||d}nt||d}||}| j| | j|| _dS )a  
        Compress a list of documents into a contextual retriever format.

        Args:
        - documents (list[Document]): List of documents to compress.
        - chunk_size (int): The size of each chunk to split the document into.
        - chunk_overlap (int): The amount of overlap between each chunk.
        - **kwargs: Additional keyword arguments to pass to the text_splitter.

        Returns:
        - A contextual retriever.
        Ng?token_splitterF)r   r   )	validate_docsintgetr   split_documentsr   add_documentsr   from_documents)r   r   r   r   kwargstext_splittertextsr   r   r   compress$   s   

zCompressor.compressc                 C   s.   t |D ]\}}t|trt|d||< q|S )a  
        Validate that the input documents are Document objects, and convert any
        strings to Document objects.

        Args:
        - documents (list[Document]): The list of documents to validate.

        Returns:
        - The validated list of documents.
        page_content)	enumerate
isinstancestrr   )r   r   idocr   r   r   r   P   s
   
zCompressor.validate_docs         ?querykbm25_weightc                 C   sH   | j }||_| jjd|id}t||g|d| gd}||d| S )f  
        Retrieve a list of documents that match a query.

        Args:
        - query (str): The query to search for.
        - k (int): The number of results to return.
        - bm25_weight (float): The weight to give the BM25 retriever when combining
            with the vectorstore retriever.

        Returns:
        - A list of documents.
        r1   )search_kwargs   )
retrieversweightsr   )r   r1   r   as_retrieverr   invoke)r   r0   r1   r2   r   vectorstore_retrieverensemble_retrieverr   r   r   retrievea   s   
zCompressor.retrievec                    s(   t  }|d| j|||I dH }|S )r3   N)asyncioget_event_looprun_in_executorr<   )r   r0   r1   r2   loopresultsr   r   r   	aretrieve|   s   zCompressor.aretrieve)r	   N)r   N)r.   r/   )__name__
__module____qualname____doc__r+   r   listr   r   r&   r   floatr<   rB   r   r   r   r   r      s    

,r   __main__.)Project)ClientConfig)DocUploader)DynamoDBbucket_namefolder_namec                 C   sN   t d}|j| |d}g }|dg D ]}|d |krq||d  q|S )zList all the files in a folders3)BucketPrefixContentsKey)boto3clientlist_objects_v2r   append)rO   rP   rQ   responsefilesobjr   r   r   list_files_in_folder   s   
r]   	tequity_1tequityxcap_s3_storage/r5   z
Uploading z to ChromaDBzs3://xcap-storage-dev/z


c                 C   s   g | ]}|d  qS )textr   ).0pager   r   r   
<listcomp>   s    re   )source
project_idrW   doc_classification)r(   metadatazpath/to/persist/directory)r      )r   zWhen was the company found?c                 C   s   g | ]}|j qS r   r'   )rc   r-   r   r   r   re      s    r   )6rF   r=   r   langchain.retrieversr   langchain.text_splitterr   langchain_chroma.vectorstoresr   langchain_community.retrieversr   langchain_core.documentsr   langchain_openai.embeddingsr   r   rC   syspathrY   rV   +services.ppt_generator.data_classes.projectrK   utils.client_checkrL   !utils.document_loader.DocUploaderrM   utils.dynamo_dbrN   r+   r]   rg   rW   client_configenvironbucket_name_locproject_folder_namedbget_itemprojectsprojectproject_filestext_chunksdoc_pathprintdoc_uploaderrh   joindocument_text_chunked
compressorr&   questionr<   rA   r   r   r   r   <module>   sj     






