
    dhQ,                         S r SSKrSSKrSSKrSSKJr  SSKJrJrJ	r	J
r
JrJr  SSKJr  SSKJr  SSKJrJrJrJrJrJrJrJrJrJrJrJr  \R<                  " \5      r  " S S	\5      r! " S
 S\5      r"g)z:Pebblo's safe dataloader is a wrapper for document loaders    N)version)AnyDictIterableIteratorListOptional)Document)
BaseLoader)BATCH_SIZE_BYTESPLUGIN_VERSIONApp	FrameworkIndexedDocumentPebbloLoaderAPIWrappergenerate_size_based_batchesget_full_pathget_loader_full_pathget_loader_typeget_runtimeget_source_sizec                   6   \ rS rSr% SrSr\\S'        S!SSS.S\S	\	S
\	S\	S\
\	   S\S\
\	   S\	S\4S jjjrS\\   4S jrS"S jrS\\   4S jr\S"S j5       rS\4S jrS\\   4S jrS\S\\   4S jrS\\   4S jrS\S\S\4S jrS\SS4S jrS rg)#PebbloSafeLoader   zcPebblo Safe Loader class is a wrapper around document loaders enabling the data
to be scrutinized.
F_discover_sentNlocal)classifier_locationanonymize_snippetslangchain_loadernameownerdescriptionapi_keyload_semanticclassifier_urlr   r   c                   U(       a  [        U[        5      (       d  [        S5      eX l        [        [        R
                  " 5       5      U l        Xl        [        R                  R                  S5      =(       d    UU l        X0l        X@l        [        U R                  5      U l        / U l        / U l        [        [%        U R                  5      5      R'                  S5      S   R'                  S5      S   n
[)        U
5      U l        [-        U R                  5      U l        [0        U l        U
U R                  U R*                  S.U R.                  S:  a  S[        U R.                  5      0O0 EU l        U R7                  5       U l        [;        UUUU	S	9U l        U R<                  R?                  U R8                  5        g )
NzMust specify a valid name.PEBBLO_LOAD_SEMANTIC.'r   )loadersource_pathsource_typesource_path_size)r#   r   r%   r   ) 
isinstancestr	NameErrorapp_nameuuiduuid4load_idr+   osenvirongetr$   r!   r"   r   r,   docsdocs_with_idtypesplitr   r-   r   r.   r   
batch_sizeloader_details_get_app_detailsappr   	pb_clientsend_loader_discover)selfr   r    r!   r"   r#   r$   r%   r   r   loader_names              c/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/pebblo.py__init__PebbloSafeLoader.__init__%   si    :dC008994::<(&ZZ^^,BCT}
&/<$&	35$t{{+,2237;AA#FqI*;7 /0@0@ A*!++++	
 ((1, $S)>)>%?@	
 ((*/ 3)1	
 	++DHH5    returnc                 x    U R                   R                  5       U l        U R                  5         U R                  $ )z`Load Documents.

Returns:
    list: Documents fetched from load method of the wrapped `loader`.
)r+   loadr9   classify_in_batches)rC   s    rE   rK   PebbloSafeLoader.loadV   s.     KK$$&	  "yyrH   c                    [        U R                  U R                  5      n/ n[        U5      n[	        U5       H  u  pEXCS-
  :H  nXPl        U R                  5       U l        U R                  R                  U R                  U R                  U R                  US9nU R                  U5        U R                  (       a  U R                  U5      nOU R                  5       nUR                  U5        M     X l        g)z
Classify documents in batches.
This is to avoid API timeouts when sending large number of documents.
Batches are generated based on the page_content size.
   )loading_endN)r   r9   r=   len	enumerate_index_docsr:   rA   classify_documentsr@   r>   _add_pebblo_specific_metadatar$   _add_semantic_to_docs_unindex_docsextend)	rC   batchesprocessed_docstotal_batchesibatchis_last_batchclassified_docsbatch_processed_docss	            rE   rL   $PebbloSafeLoader.classify_in_batchesa   s     )DIIt)
 *,G!'*HA"#q'8"8MI $ 0 0 2D"nn??!!##)	 @ O ..?!!'+'A'A/'R$'+'9'9';$!!"67 +" #	rH   c              #     #     U R                   R                  5       n  [        U5      n[        U45      U l	        U R                  5       U l        U R                  R                  U R                  U R                  U R                   5      nU R#                  U5        U R$                  (       a  U R'                  U5      U l	        OU R)                  5       U l	        U R                  S   v   M  ! [         aI  nU R                   R                  R                   S3n[
        R                  U5        [        U5      UeSnAff = f! [         a
    / U l	         gf = f7f)zLoad documents in lazy fashion.

Raises:
    NotImplementedError: raised when lazy_load id not implemented
    within wrapped loader.

Yields:
    list: Documents from loader's lazy loading.
z does not implement lazy_load()Nr   )r+   	lazy_loadNotImplementedError	__class____name__loggererrornextStopIterationr9   listrS   r:   rA   rT   r@   r>   rU   r$   rV   rW   )rC   doc_iteratorexcerr_strdocclassified_docs         rE   rc   PebbloSafeLoader.lazy_load   s+    	8;;002L
 <( cVDI $ 0 0 2D!^^>>!!488T-@-@N ..~>!! 66~F	 ..0	))A,! 	 # 	8..7788WXGLL!%g.C7	8 ! 	sK   EC- EE CE-
E 7AD;;E  EEEEEc                     SU l         g )NT)r   )clss    rE   set_discover_sent"PebbloSafeLoader.set_discover_sent   s
    !rH   c                     [        5       u  p[        U R                  U R                  U R                  U R
                  UU[        [        S[        S5      S9S9nU$ )zDFetch app details. Internal method.

Returns:
    App: App details.
langchain_community)r    r   )r    r!   r"   r5   runtime	frameworkplugin_versionclient_version)	r   r   r2   r!   r"   r5   r   r   r   )rC   ry   rx   r@   s       rE   r?   !PebbloSafeLoader._get_app_details   sZ     )]	**((LL)$* 56
 
rH   c                     [        U R                  5       VVs/ sH)  u  p[        SS[        U5      0UR	                  5       D6PM+     nnnU$ s  snnf )z
Indexes the documents and returns a list of IndexedDocument objects.

Returns:
    List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
pb_id )rR   r9   r   r0   dict)rC   r\   ro   r:   s       rE   rS   PebbloSafeLoader._index_docs   sS     $DII.
. 7#a&7CHHJ7. 	 
 	
s   /Ar_   c           	      `   U R                    Vs0 sH,  nUR                  [        UR                  UR                  S9_M.     nnUR                  5        H/  nUR                  S5      nXS;   d  M  U R                  X5   U5        M1     UR                  5        Vs/ sH  o"PM     nnU$ s  snf s  snf )a  
Adds semantic metadata to the given list of documents.

Args:
    classified_docs (Dict): A dictionary of dictionaries containing the
        classified documents with pb_id as key.

Returns:
    List[Document]: A list of Document objects with added semantic metadata.
page_contentmetadatar~   )r:   r~   r
   r   r   valuesr8   _add_semantic_to_doc)rC   r_   ro   indexed_docsrp   doc_idsemantic_metadata_docss          rE   rV   &PebbloSafeLoader._add_semantic_to_docs   s     ((
( IIxS-=-=UU( 	 

 .446N#''0F%)),*>O 7
 2>1D1D1F!G1F##1F!G%%
 "Hs   2B&
B+c           	          [        U R                  5       VVs/ sH#  u  p[        UR                  UR                  S9PM%     nnnU$ s  snnf )z
Converts a list of IndexedDocument objects to a list of Document objects.

Returns:
    List[Document]: A list of Document objects.
r   )rR   r:   r
   r   r   )rC   r\   ro   r9   s       rE   rW   PebbloSafeLoader._unindex_docs   sN     $D$5$56
6 #"2"2S\\J6 	 
 	
s   )Aro   rp   c                     [        UR                  S0 5      R                  5       5      UR                  S'   [        UR                  S0 5      R                  5       5      UR                  S'   U$ )z
Adds semantic metadata to the given document in-place.

Args:
    doc (Document): A Document object.
    classified_doc (dict): A dictionary containing the classified document.

Returns:
    Document: The Document object with added semantic metadata.
entitiespebblo_semantic_entitiestopicspebblo_semantic_topics)rk   r8   keysr   )rC   ro   rp   s      rE   r   %PebbloSafeLoader._add_semantic_to_doc   sg     48z2.3354
/0 26x,1132
-. 
rH   c           
         U R                    H  nUR                  nU R                  R                  R                  S:X  a)  [        UR                  SU R                  5      5      US'   O8[        UR                  SUR                  SU R                  5      5      5      US'   UR                  UR                  0 5      R                  SS5      US'   M     g)z*Add Pebblo specific metadata to documents.SharePointLoadersource	full_pathpb_checksumN)	r:   r   r+   re   rf   r   r8   r,   r~   )rC   r_   ro   doc_metadatas       rE   rU   .PebbloSafeLoader._add_pebblo_specific_metadata  s    $$C<<L{{$$--1CC,9 $$Xt/?/?@-[) -: $$#\%5%5h@P@P%Q-[)
 +:*=*=cii*L*P*Pt+L' %rH   )r@   r2   r=   r"   r9   r:   r5   r$   r+   r>   r!   rA   r,   r.   r-   ) r   NFN)rI   N)rf   
__module____qualname____firstlineno____doc__r   bool__annotations__r   r0   r	   rF   r   r
   rK   rL   r   rc   classmethodrt   r   r?   r   rS   r   rV   rW   r   r   rU   __static_attributes__r   rH   rE   r   r      sD    !ND  !%#(,/6 $+#(/6$/6 /6 	/6
 /6 #/6 /6 !/6 !/6 !/6b	d8n 	#@ 8H-  D " "# ,T/2 &T &d8n &2tH~  $ 8 &T d rH   r   c                       \ rS rSrSrSSSSS.S\\   S\\   S\\\      S\\	\\
4      S	\\\	\\
4         S
S4S jjrS
\\   4S jrS
\\   4S jrSrg)PebbloTextLoaderi  z
Loader for text data.

Since PebbloSafeLoader is a wrapper around document loaders, this loader is
used to load text data directly into Documents.
N)r   idsr   	metadatastextsr   r   r   r   rI   c                @    Xl         X l        X0l        X@l        XPl        g)af  
Args:
    texts: Iterable of text data.
    source: Source of the text data.
        Optional. Defaults to None.
    ids: List of unique identifiers for each text.
        Optional. Defaults to None.
    metadata: Metadata for all texts.
        Optional. Defaults to None.
    metadatas: List of metadata for each text.
        Optional. Defaults to None.
N)r   r   r   r   r   )rC   r   r   r   r   r   s         rE   rF   PebbloTextLoader.__init__  s    * 
 "rH   c              #     #    [        U R                  5       H  u  pSnU R                  =(       d    0 nU R                  (       aK  U[	        U R                  5      :  a2  U R                  U   (       a  UR                  U R                  U   5        U R                  (       a(  U[	        U R                  5      :  a  U R                  U   n[        X2US9v   M     g7f)zI
Lazy load text data into Documents.

Returns:
    Iterator of Documents
N)idr   r   )rR   r   r   r   rQ   updater   r
   )rC   r\   text_idr   s        rE   rc   PebbloTextLoader.lazy_load9  s      !,GAC}}*H~~!c$..&9"9dnnQ>Oq 12xxADHH-hhqkcxHH -s   CCc                 Z    / nU R                  5        H  nUR                  U5        M     U$ )z@
Load text data into Documents.

Returns:
    List of Documents
)rc   append)rC   	documentsro   s      rE   rK   PebbloTextLoader.loadI  s.     	>>#CS! $rH   )r   r   r   r   r   )rf   r   r   r   r   r   r0   r	   r   r   r   rF   r   r
   rc   rK   r   r   rH   rE   r   r     s     !%#'-148#}# 	#
 d3i # 4S>*# Dc3h01# 
#6I8H- I 
d8n 
rH   r   )#r   loggingr6   r3   importlib.metadatar   typingr   r   r   r   r   r	   langchain_core.documentsr
   )langchain_community.document_loaders.baser   $langchain_community.utilities.pebblor   r   r   r   r   r   r   r   r   r   r   r   	getLoggerrf   rg   r   r   r   rH   rE   <module>r      sa    @  	  & @ @ - @    
		8	$uz up=z =rH   