
    dhc                    *   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKJr  S SK	J
r
  S SKJrJrJrJrJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJrJr  S SKJr  S SKJr  \R@                  " \!5      r"Sr#Sr$Sr%Sr&/ SQr'/ SQr(S/r)/ SQr*\'\(\)\*S.r+ " S S\,\5      r- " S S\5      r. " S S\5      r/ " S S\5      r0 " S S\5      r1 " S  S!\5      r2S-S" jr3S.S# jr4S/S$ jr5S0S% jr6S1S& jr7 S2     S3S' jjr8S4S( jr9S5S) jr:S6S* jr; " S+ S,\5      r<g)7    )annotationsN)Enum)
HTTPStatus)AnyDictListOptionalTuple)Document)get_runtime_environment)get_from_dict_or_env)	BaseModel)Responserequest)RequestException)
BaseLoaderz0.1.1zhttp://localhost:8000zhttps://api.daxa.ai  )
JSONLoaderS3FileLoaderUnstructuredMarkdownLoaderUnstructuredPDFLoaderUnstructuredFileLoaderUnstructuredJsonLoaderPyPDFLoaderGCSFileLoaderAmazonTextractPDFLoader	CSVLoaderUnstructuredExcelLoaderUnstructuredEmailLoader)DirectoryLoaderS3DirLoaderSlackDirectoryLoaderPyPDFDirectoryLoaderNotionDirectoryLoaderDataFrameLoader)NotionDBLoaderGoogleDriveLoaderSharePointLoader)filedir	in-memoryzcloud-folderc                       \ rS rSrSrSrSrSrg)RoutesC   z2Routes available for the Pebblo API as enumerator.z/v1/loader/docz/v1/app/discover N)__name__
__module____qualname____firstlineno____doc__
loader_docloader_app_discover__static_attributes__r/       \/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/utilities/pebblo.pyr-   r-   C   s    <!J,r8   r-   c                  $    \ rS rSr% SrS\S'   Srg)IndexedDocumentJ   zPebblo Indexed Document.strpb_idr/   Nr0   r1   r2   r3   r4   __annotations__r7   r/   r8   r9   r;   r;   J   s    "J$r8   r;   c                      \ rS rSr% SrSrS\S'    S\S'    S\S'    SrS	\S
'    S\S'    S\S'    S\S'    S\S'    S\S'    SrS\S'   Sr	g)RuntimeQ   zPebblo Runtime.localr=   typehostpath Optional[str]ipplatformos
os_versionlanguagelanguage_versionruntimer/   N)
r0   r1   r2   r3   r4   rE   r@   rJ   rP   r7   r/   r8   r9   rB   rB   Q   se    D#,
I#
I)B4M*GOM(GS4r8   rB   c                  0    \ rS rSr% SrS\S'    S\S'   Srg)	Frameworkj   zPebblo Framework instance.r=   nameversionr/   Nr?   r/   r8   r9   rR   rR   j   s    $
I L#r8   rR   c                  x    \ rS rSr% SrS\S'    S\S'    S\S'    S\S'    S	\S
'    S\S'    S\S'    S\S'   Srg)Apps   zPebblo AI application.r=   rT   ownerrI   descriptionload_idrB   rP   rR   	frameworkplugin_versionclient_versionr/   Nr?   r/   r8   r9   rW   rW   s   sJ     
IJ!L-%'**r8   rW   c                      \ rS rSr% SrS\S'    S\S'    S\S'    S\S'    S\S	'    S
\S'    S\S'    S\S'    S\S'    S\S'   Srg)Doc   zPebblo document.r=   rT   rY   listdocsr]   r[   dictloader_detailsboolloading_endsource_ownerclassifier_locationanonymize_snippetsr/   Nr?   r/   r8   r9   r`   r`      s[    
I0J
J.L-+7,%Ur8   r`   c                    U (       a  SU ;   d  SU S   :X  d  U S;   a  U $ [         R                  " U 5      nUR                  5       (       a  UR                  5       n[	        U5      $ )zReturn an absolute local path for a local file/directory,
for a network related path, return as is.

Args:
    path (str): Relative path to be resolved.

Returns:
    str: Resolved absolute path.
z:///r   )unknown-r+   )pathlibPathexistsresolver=   )rG   	full_paths     r9   get_full_pathrt      s\     TM47N11T"I%%'	y>r8   c                P    [         R                  5        H  u  pX;   d  M  Us  $    g)zReturn loader type among, file, dir or in-memory.

Args:
    loader (str): Name of the loader, whose type is to be resolved.

Returns:
    str: One of the loader type among, file/dir/in-memory.
unsupported)LOADER_TYPE_MAPPINGitems)loaderloader_typeloaderss      r9   get_loader_typer|      s+     !4 9 9 ; !< r8   c                   SSK JnJnJnJn  Sn[        U [        5      (       d  [        R                  S5        U$ U R                  n SU;   a\  [        X5      (       a  SU R                   SU R                   3nGO[        X5      (       a  SU R                   SU R                   3nGOS	U;   a'  US	   nU(       a  S
U;   a  US
   nU(       a  U SU 3nGOmSU;   a  US   nGO`SU;   a  US   nGOSSU;   a7  US   nU(       a)  [        U[        5      (       a  [        U5      S:  a  US   nGO[        X5      (       a  SnGO[        X5      (       a  SU R                   3nOU R                   R"                  S:X  a  UR%                  S5      (       a  UR%                  S5      n	SU	 3nOUR%                  S5      (       a8  UR%                  S/ 5      n
SR'                  U
 Vs/ sH	  nSU S3PM     sn5      nOMUR%                  S5      (       a7  UR%                  S/ 5      nSR'                  U Vs/ sH	  nSU S3PM     sn5      n[+        [-        U5      5      $ s  snf s  snf ! [(         a     N*f = f)zReturn an absolute source path of source of loader based on the
keys present in Document.

Args:
    loader (BaseLoader): Langchain document loader, derived from Baseloader.
r   )r%   r   r&   r   rn   zGloader is not derived from BaseLoader, source location will be unknown!bucketzgc://rl   zs3://sourcechannelrG   	file_path	web_pathsr+   znotiondb://r'   	folder_idz+https://drive.google.com/drive/u/2/folders/file_idsz, z https://drive.google.com/file/d/z/viewdocument_idsz#https://docs.google.com/document/d/z/edit)$langchain_community.document_loadersr%   r   r&   r   
isinstancer   loggererror__dict__r~   blobkeyrb   lendatabase_id	__class__r0   getjoin	Exceptionrt   r=   )ry   r%   r   r&   r   locationloader_dictr   r   r   r   file_idr   doc_ids                 r9   get_loader_full_pathr      sn     Hfj))U	
 //K/{"&00"6==/6;;-@F11"6==/6::,?$"8,HI4%i0"*1WI6H{""6*HK'";/HK'#K0IZ	488S^a=O$Q<00"H//$V%7%7$89H&&*=={++'OOK8	HT,,&??:r:99 (0'/G ;7)5I'/ 00*~rB99 '3&2F >fXUK&2 X''!  sm   2I" <,I" *+I" I" $I" 1;I" .I" I" "AI" )7I"  I/I" 77I" .I=I" 
I" "
I/.I/c                    [        5       n [        SU R                  SS5      S9n[        R                  " 5       n[        UR                  [        R                  S   U R                  SS5      UR                  UR                  [        5       U R                  SS5      U R                  S	S5      S
9nSUR                  ;   a  SUl        SUl        [        R                  SU 35        [        R                  SU 35        X4$ )zFetch the current Framework and Runtime details.

Returns:
    Tuple[Framework, Runtime]: Framework and Runtime for the current app instance.
	langchainlibrary_versionN)rT   rU   PWDrK   rm   rP   runtime_version)rF   rG   rK   rL   rM   rJ   rN   rO   DarwindesktopzMac OSXz
framework zruntime )r   rR   r   rK   unamerB   noderL   environsystemrU   get_iprE   rP   r   debug)runtime_envr\   r   rP   s       r9   get_runtimer     s     *+K+//2CT"JI NNEZZZZY7<<==8I6$):IF	G 7:: #
LL:i[)*
LL8G9%&r8   c                     SSK n U R                  5       n U R                  U5      nU$ ! [         a    U R                  S5      n U$ f = f)z>Fetch local runtime ip address.

Returns:
    str: IP address
r   N	localhost)socketgethostnamegethostbynamer   )r   rF   	public_ips      r9   r   r   .  sY     D6((.	   6((5	6s   ) AAc                2   / n/ nSnU  Hr  n[        UR                  R                  S5      5      nXa:  a  UR                  U/5        M@  XF-   U:  a  UR                  U5        / nSnUR                  U5        XF-  nMt     U(       a  UR                  U5        U$ )a  
Generate batches of documents based on page_content size.
Args:
    docs: List of documents to be batched.
    max_batch_size: Maximum size of each batch in bytes. Defaults to 100*1024(100KB)
Returns:
    List[List[Document]]: List of batches of documents
r   utf-8)r   page_contentencodeappend)rc   max_batch_sizebatchescurrent_batchcurrent_batch_sizedocdoc_sizes          r9   generate_size_based_batchesr   >  s     %'G$&MC,,33G<=$NNC5!!,~=}- "%&"   %*! & }%Nr8   c                     SSK n[        R                  " U 5      R                  nUR	                  U5      R
                  nU$ ! [         a    Sn U$ f = f)zoFetch owner of local file path.

Args:
    file_path (str): Local file path.

Returns:
    str: Name of owner.
r   Nrm   )pwdrL   statst_uidgetpwuidpw_namer   )r   r   file_owner_uidfile_owner_names       r9   get_file_owner_from_pathr   f  sV    $+22,,~6>>   $#$s   ?A AAc                   U (       d  gSn[         R                  R                  U 5      (       a!  [         R                  R                  U 5      nU$ [         R                  R	                  U 5      (       a  Sn[         R
                  " U 5       Hv  u  p4nU Hj  n[         R                  R                  X65      n[         R                  R                  U5      (       a  MH  U[         R                  R                  U5      -  nMl     Mx     UnU$ )zFetch size of source path. Source can be a directory or a file.

Args:
    source_path (str): Local path of data source.

Returns:
    int: Source size in bytes.
r   )rL   rG   isfilegetsizeisdirwalkr   islink)source_pathsize
total_sizedirpath_	filenamesffps           r9   get_source_sizer   y  s     D	ww~~k""ww{+ K 
{	#	#
%'WW[%9!G	WW\\'-ww~~b))"''//""55J  &:
 Kr8   c                >    U R                  S5      n[        U5      nU$ )zCalculate the content size in bytes:
- Encode the string to bytes using a specific encoding (e.g., UTF-8)
- Get the length of the encoded bytes.

Args:
    data (str): Data string.

Returns:
    int: Size of string in bytes.
r   )r   r   )dataencoded_contentr   s      r9   calculate_content_sizer     s!     kk'*ODKr8   c                  J  ^  \ rS rSr% SrS\S'    SrS\S'    S\S'    S\S	'    S
rS\S'    SU 4S jjrSS jr	 S         SS jjr
SS jrSSS jjr              SS jr\  S           SS jj5       r\        S S j5       r\S!S j5       rSrU =r$ )"PebbloLoaderAPIWrapperi  zWrapper for Pebblo Loader API.rI   api_keyrD   r=   ri   classifier_url	cloud_urlFrf   rj   c                   > [        USSS5      US'   [        USS[        5      US'   [        USS[        5      US'   [        TU ]  " S	0 UD6  g)
z%Validate that api key in environment.r   PEBBLO_API_KEYrH   r   PEBBLO_CLASSIFIER_URLr   PEBBLO_CLOUD_URLNr/   )r   _DEFAULT_CLASSIFIER_URL_DEFAULT_PEBBLO_CLOUD_URLsuper__init__)selfkwargsr   s     r9   r   PebbloLoaderAPIWrapper.__init__  sh    0I/
y $8$&=?V$
  3K!35N
{ 	"6"r8   c                T   SnUR                  SS9nU R                  S:X  aK  U R                  5       nU R                   [        R
                  R                   3nU R                  SXTU5      nU R                  (       a  U R                  SS9nU(       aB  [        R                  " UR                  5      R                  S5      nUR                  SU05        UR                  S[        05        U R                   [        R
                  R                   3nU R                  SXtU5      ngg)	zk
Send app discovery request to Pebblo server & cloud.

Args:
    app (App): App instance to be discovered.
NTexclude_unsetrD   POSTcloud_requestpebblo_server_versionpebblo_client_version)rd   ri   _make_headersr   r-   r6   valuemake_requestr   jsonloadstextr   updatePLUGIN_VERSIONr   )	r   apppebblo_resppayloadheadersapp_discover_urlr   pebblo_cloud_urlr   s	            r9   send_loader_discover+PebbloLoaderAPIWrapper.send_loader_discover  s    (((.##w.((*G&&'(B(B(H(H'IJ  ++F4DwWK<<((t(<G(,

;3C3C(D(H(H+)%  79NOPNN3^DE"&..!1&2L2L2R2R1ST!!&*:WMA r8   c                d   UR                  SS5      n[        U5      nU R                  XU5      u  pxU R                  X'X6X5      n	0 n
U R                  S:X  a  U R                  5       nU R                   [        R                  R                   3n U R                  SXU	S5      nU(       aM  [        R                  " UR                  5      R                  S/ 5       H  nU
R                  US   U05        M     U R$                  (       aJ  U R                  S:X  a  U R'                  U	S   U
5        U	R)                  S
S	5        U R+                  U	5        U
$ U R                  S:X  a   [         R#                  S5        [-        S5      eU
$ ! [         a   n[         R#                  SU5         S	nANS	nAff = f)ag  
Send documents to Pebblo server for classification.
Then send classified documents to Daxa cloud(If api_key is present).

Args:
    docs_with_id (List[IndexedDocument]): List of documents to be classified.
    app (App): App instance.
    loader_details (dict): Loader details.
    loading_end (bool): Boolean, indicating the halt of data loading by loader.
r   rH   rD   r   i,  rc   r>   z3An Exception caught in classify_documents: local %sNrj   zpebblo-cloudz4API key is missing for sending docs to Pebblo cloud.)r   r   prepare_docs_for_classificationbuild_classification_payloadri   r   r   r-   r5   r   r   r   r   r   r   r   r   warningr   update_doc_datapopsend_docs_to_pebblo_cloud	NameError)r   docs_with_idr   re   rg   r   rh   rc   source_aggregate_sizer   classified_docsr   load_doc_urlr   classified_doces                   r9   classify_documents)PebbloLoaderAPIWrapper.classify_documents  s   " %((;/<&*&J&J~'
# 33~5J
 ##w.((*G"11263D3D3J3J2KLLY"//L7C *.**[5E5E*F*J*J6SU*V'..+G4nE +W <<''72 $$WV_oFKK,d3**73
 	 %%7NNQRRSS!  YTVWXXYs   A(F 
F/F**F/c                    U R                  SS9nU R                   [        R                  R                   3n U R                  SX2U5      ng! [         a   n[        R                  SU5         SnAgSnAff = f)zi
Send documents to Pebblo cloud.

Args:
    payload (dict): The payload containing documents to be sent.
Tr   r   z3An Exception caught in classify_documents: cloud %sN)	r   r   r-   r5   r   r   r   r   r   )r   r   r   r   r   r
  s         r9   r  0PebbloLoaderAPIWrapper.send_docs_to_pebblo_cloud  sx     $$4$8"nn-f.?.?.E.E-FG	U!!&*:WMA 	UNNPRSTT	Us   A 
A7A22A7c                    SSS.nU(       aE  U R                   (       a  UR                  SU R                   05        U$ [        R                  S5        U$ )z
Generate headers for the request.

args:
    cloud_request (bool): flag indicating whether the request is for Pebblo
    cloud.
returns:
    dict: Headers for the request.

zapplication/json)AcceptzContent-Typez	x-api-keyz,API key is missing for Pebblo cloud request.)r   r   r   r   )r   r   r   s      r9   r   $PebbloLoaderAPIWrapper._make_headers(  sM     ).
 ||T\\:;  MNr8   c                    UR                   UR                  U[        UR                  USUU R                  U R
                  S.
nUSL a  SUS'   SU;   a  UUS   S'   [        S	0 UD6R                  SS9nU$ )
a  
Build the payload for document classification.

Args:
    app (App): App instance.
    docs (List[dict]): List of documents to be classified.
    loader_details (dict): Loader details.
    source_owner (str): Owner of the source.
    source_aggregate_size (int): Aggregate size of the source.
    loading_end (bool): Boolean indicating the halt of data loading by loader.

Returns:
    dict: Payload for document classification.
false)
rT   rY   rc   r]   r[   re   rg   rh   ri   rj   Ttruerg   re   r  r   r/   )rT   rY   r   r[   ri   rj   r`   rd   )r   r   rc   re   rh   r  rg   r   s           r9   r   3PebbloLoaderAPIWrapper.build_classification_payload?  s    0 HHYY,{{,"(#'#;#;"&"9"9#
 $%+GM"7*) ()*AB ..%%D%9r8   c           
     H    [        XX#US9n[        R                  SU UR                   R                  [	        [        UR                   R                  (       a  UR                   R                  O/ 5      5      [	        UR                  5      5        UR                  [        R                  :  a$  [        R                  SUR                   35        U$ UR                  [        R                  :  a$  [        R                  SUR                   35        U$ UR                  [        R                  :w  a"  [        R                  SUR                   35        U$ ! [         a    [        R                  SU5         g[         a   n[        R                  SU5         SnAgSnAff = f)	a  
Make a request to the Pebblo API

Args:
    method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
    url (str): URL for the request.
    headers (dict): Headers for the request.
    payload (Optional[dict]): Payload for the request (for POST, PUT, etc.).
    timeout (int): Timeout for the request in seconds.

Returns:
    Optional[Response]: Response object if the request is successful.
)methodurlr   r   timeoutz5Request: method %s, url %s, len %s response status %szPebblo Server: Error z$Pebblo received an invalid payload: z-Pebblo returned an unexpected response code: zUnable to reach server %sz'An Exception caught in make_request: %sN)r   r   r   r  r=   r   bodystatus_coder   INTERNAL_SERVER_ERRORr   BAD_REQUESTr   OKr   r   )r  r  r   r   r  responser
  s          r9   r   #PebbloLoaderAPIWrapper.make_requestk  s`   *	IwH LLG  $$C1A1A1F1F((--BOPH(() ##z'G'GG!6x7K7K6LMN O %%)?)??!Ehmm_UV O %%6C++,.
 O 	=NN6<   	INNDaHH	Is,   CE AE AE  F!8	F!FF!c           
     B   / nSnU  Vs/ sH  oUR                  5       PM     nnSnU GHm  nUR                  S0 5      nUR                  S/ 5      n	US   S:X  a  [        UR                  SUS   5      5      n
O+[        UR                  S	UR                  SU5      5      5      n
UR                  S
[        U
5      5      nUR                  S[	        U
5      5      n[        UR                  S5      5      n[        U5      nXN-  nUR                  SS5      =(       d    SnUR                  UU
UUR                  S0 5      R                  S5      US.U	(       a  SU	0O0 EUb  SU0O0 E5        US   S:X  d  GMM  U(       a  GMW  UR                  S5      US'   SnGMp     X44$ s  snf )a<  
Prepare documents for classification.

Args:
    docs_with_id (List[IndexedDocument]): List of documents to be classified.
    source_path (str): Source path of the documents.
    loader_details (dict): Contains loader info.

Returns:
    Tuple[List[dict], int]: Documents and the aggregate size
    of the source.
r   Fmetadataauthorized_identitiesry   r(   r   r   rs   rY   r   r   r>   Nlast_modified)r   r   r>   r$  
file_ownersource_path_sizesource_full_urlT)rd   r   rt   r   r   r=   r   r   )r  r   re   rc   r  r   doc_contentsource_path_updatedoc_metadatadoc_authorized_identitiesdoc_source_pathdoc_source_ownerdoc_source_sizer   page_content_sizer   s                   r9   r   6PebbloLoaderAPIWrapper.prepare_docs_for_classification  s   $  !-9:\cxxz\:"C77:r2L(4(8(89PRT(U%h'+=="/ $$X~m/LM# #0 $$#$((;?#  ,//1/B  +..v7WXOsww~67L 6| D!6!WWWd+0qFKK'#2#%(WWZ%<%@%@%Q"2 5 12KL +6 ,_=( x(,>>**0<0@0@AR0S}-%)"] ^ **c ;s   Fc           
     
   U  H}  nUR                  US   0 5      nUR                  UR                  S5      UR                  S5      UR                  S0 5      UR                  S0 5      S.5        UR                  S5        M     g)	z
Update the document data with classified information.

Args:
    docs (List[dict]): List of document data to be updated.
    classified_docs (dict): The dictionary containing classified documents.
r>   pb_checksumloader_source_pathentitiestopics)r2  r3  r4  r5  r   N)r   r   r  )rc   r  doc_dataclassified_datas       r9   r  &PebbloLoaderAPIWrapper.update_doc_data  s     H-11(72CRHOOO#2#6#6}#E*9*=*=>R*S / 3 3J C-11(B?	 LL r8   r/   )r   r   )r   rW   returnNone)F)
r  List[IndexedDocument]r   rW   re   rd   rg   rf   r9  rd   )r   rd   r9  r:  )r   rf   r9  rd   )r   rW   rc   
List[dict]re   rd   rh   r=   r  intrg   rf   r9  rd   )N   )r  r=   r  r=   r   rd   r   zOptional[dict]r  r=  r9  zOptional[Response])r  r;  r   r=   re   rd   r9  zTuple[List[dict], int])rc   r<  r  rd   r9  r:  )r0   r1   r2   r3   r4   r@   ri   rj   r   r   r  r  r   r   staticmethodr   r   r  r7   __classcell__)r   s   @r9   r   r     s   ("&&I!!&!$$U#NH "<+< < 	<
 < 
<|U.** * 	*
 *  #* * 
*X 
 #'/// /  	/
 / 
/ /b D++D+D+ D+ 
 	D+ D+L    r8   r   )rG   r=   r9  r=   )ry   r=   r9  r=   )ry   r   r9  r=   )r9  zTuple[Framework, Runtime])r9  r=   )r   )rc   zList[Document]r   r=  r9  zList[List[Document]])r   r=   r9  r=   )r   r=   r9  r=  )r   r=   r9  r=  )=
__future__r   r   loggingrL   ro   rK   enumr   httpr   typingr   r   r   r	   r
   langchain_core.documentsr   langchain_core.envr   langchain_core.utilsr   pydanticr   requestsr   r   requests.exceptionsr   )langchain_community.document_loaders.baser   	getLoggerr0   r   r   r   r   BATCH_SIZE_BYTESfile_loader
dir_loader	in_memorycloud_folderrw   r=   r-   r;   rB   rR   rW   r`   rt   r|   r   r   r   r   r   r   r   r   r/   r8   r9   <module>rS     s;   "   	     3 3 - 6 5  & 0 @			8	$1 1  
 	  	 -S$ -%h %5i 52$	 $+) +*V) V2.E(P>" 1;%
%*-%%P&2 W Y W r8   