
    dhM<                     $   S r SSKrSSKrSSKrSSKJr  SSKJrJrJ	r	J
r
Jr  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  \(       a  SSKJr  SSKJr  \R8                  " \5      r\ " S S5      5       r\" SSSS9 " S S\5      5       r g)zModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
    N)	dataclass)TYPE_CHECKINGIteratorListOptionalSequence)
deprecated)Document)batch_iterate)BaseBlobParser)Blob)get_client_info)	OperationDocumentProcessorServiceClientc                   .    \ rS rSr% Sr\\S'   \\S'   Srg)DocAIParsingResults   z/Dataclass to store Document AI parsing results.source_pathparsed_path N)__name__
__module____qualname____firstlineno____doc__str__annotations____static_attributes__r       j/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/parsers/docai.pyr   r      s    9r    r   z0.0.32z1.0z&langchain_google_community.DocAIParser)sinceremovalalternative_importc                      \ rS rSrSrSSSSS.S\S   S\\   S\\   S	\\   4S
 jjrS\S\	\
   4S jr   S$S\S\S\\   S\\\      S\	\
   4
S jjr   S%S\\   S\\   S\S\S\	\
   4
S jjrS\\   S\	\
   4S jrS\\   S\S   4S jrS\S   S\4S jrSSSSSS.S\\   S\\   S	\\   S \S\S\\   S\S   4S! jjrS\S   S\\   4S" jrS#rg)&DocAIParser&   z`Google Cloud Document AI` parser.

For a detailed explanation of Document AI, refer to the product documentation.
https://cloud.google.com/document-ai/docs/overview
N)clientlocationgcs_output_pathprocessor_namer(   r   r)   r*   r+   c                r   [        U5      [        U5      :X  a  [        S5      eSnU(       a*  [        R                  " XT5      (       d  [        SU S35      eX0l        X@l        U(       a  Xl        g	 SSKJn  SSK	J
n  U" U S
3S9n	U" U	[        SS9S9U l        g	! [         a  n[        S5      UeS	nAff = f)a  Initializes the parser.

Args:
    client: a DocumentProcessorServiceClient to use
    location: a Google Cloud location where a Document AI processor is located
    gcs_output_path: a path on Google Cloud Storage to store parsing results
    processor_name: full resource name of a Document AI processor or processor
        version

You should provide either a client or location (and then a client
    would be instantiated).
zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name z has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ClientOptionsr   Zdocumentai package not found, please install it with `pip install google-cloud-documentai`Nz-documentai.googleapis.com)api_endpointzdocument-ai)module)client_optionsclient_info)bool
ValueErrorre	fullmatch_gcs_output_path_processor_name_clientgoogle.api_core.client_optionsr-   google.cloud.documentair   ImportErrorr   )
selfr(   r)   r*   r+   patternr-   r   excoptionss
             r!   __init__DocAIParser.__init__2   s    * <4>) 
 U",,w"G"G!.!1 2   !0-!LHR $ (z)CDG :&+=ADL  != s   2B 
B6%B11B6blobreturnc              #   V   #    U R                  U/U R                  S9 Sh  vN   g N7f)zParses a blob lazily.

Args:
    blobs: a Blob to parse

This is a long-running operation. A recommended way is to batch
    documents together and use the `batch_parse()` method.
r*   N)batch_parser7   )r=   rC   s     r!   
lazy_parseDocAIParser.lazy_parsel   s(      ##TFD<Q<Q#RRRs   )')Tenable_native_pdf_parsing
field_mask
page_rangec              #     ^^^#     SSK Jn  SSKJnJnJn   SSKJm  U(       a  U" US9OSn
U(       a  U" US	9OSnU R                  R                  UR                  U R                  UR                  TR                  TR                  =(       d    S
S9U" U
US9SUS95      mUUU4S jTR                   R"                   5        Sh  vN   g! [         a  n	[        S5      U	eSn	A	ff = f! [         a  n	[        S5      U	eSn	A	ff = f NA7f)a  Parses a blob lazily using online processing.

Args:
    blob: a blob to parse.
    enable_native_pdf_parsing: enable pdf embedded text extraction
    field_mask: a comma-separated list of which fields to include in the
        Document AI response.
        suggested: "text,pages.pageNumber,pages.layout"
    page_range: list of page numbers to parse. If `None`,
        entire document will be parsed.
r   
documentai)IndividualPageSelector	OcrConfigProcessOptionsr.   N_text_from_layoutjdocumentai_toolbox package not found, please install it with `pip install google-cloud-documentai-toolbox`rJ   )pagesapplication/pdfgcs_uri	mime_type)
ocr_configindividual_page_selectorT)namegcs_documentprocess_optionsskip_human_reviewrK   c              3      >#    U HK  n[        T" UR                  TR                  R                  5      UR                  TR
                  S .S9v   MM     g7f)pagesource)page_contentmetadataN)r
   layoutdocumenttextpage_numberpath).0rd   rT   rC   responses     r!   	<genexpr>-DocAIParser.online_process.<locals>.<genexpr>   sP      	
 0 .t{{H<M<M<R<RS ,,"ii 0s   AA)google.cloudrO    google.cloud.documentai_v1.typesrP   rQ   rR   r<   -google.cloud.documentai_toolbox.wrappers.pagerT   r9   process_documentProcessRequestr8   GcsDocumentrl   mimetyperi   rW   )r=   rC   rJ   rK   rL   rO   rP   rQ   rR   r?   r\   r]   rT   rn   s    `          @@r!   online_processDocAIParser.online_processw   s;    $	/ 	W ) 0IJ 	 9C"4 	! <<00%%))'33 II"mm@/@ 4  !/)-E! #'% & 
	
 !))//	
 		
 		
I  	9 	  	A 	:		
sQ   DC C+ B)DD	D
C(C##C((D+
D5DDDblobstimeout_seccheck_in_interval_secc              #   0  #    U=(       d    U R                   nU(       d  [        S5      eU R                  XS9nU Vs/ sH  owR                  R                  PM     nn[
        R                  SU5        Sn	U R                  U5      (       a[  [        R                  " U5        X-  n	X:  a  [        SU S35      e[
        R                  S5        U R                  U5      (       a  M[  U R                  US9n
U R                  U
5       S	h  vN   g	s  snf  N
7f)
a  Parses a list of blobs lazily.

Args:
    blobs: a list of blobs to parse.
    gcs_output_path: a path on Google Cloud Storage to store parsing results.
    timeout_sec: a timeout to wait for Document AI to complete, in seconds.
    check_in_interval_sec: an interval to wait until next check
        whether parsing operations have been completed, in seconds
This is a long-running operation. A recommended way is to decouple
    parsing from creating LangChain Documents:
    >>> operations = parser.docai_parse(blobs, gcs_path)
    >>> parser.is_running(operations)
    You can get operations names and save them:
    >>> names = [op.operation.name for op in operations]
    And when all operations are finished, you can use their results:
    >>> operations = parser.operations_from_names(operation_names)
    >>> results = parser.get_results(operations)
    >>> docs = parser.parse_from_results(results)
:An output path on Google Cloud Storage should be provided.rF   z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!.)
operationsN)r7   r4   docai_parse	operationr^   loggerdebug
is_runningtimesleepTimeoutErrorget_resultsparse_from_results)r=   rz   r*   r{   r|   output_pathr   opoperation_namestime_elapsedresultss              r!   rG   DocAIParser.batch_parse   s
    4 &>)>)>L  %%e%I
7ABz<<,,zBG	
 ooj))JJ,-1L)"9/9J'R  LL ooj)) ""j"9**7333 C 	4s#   ;DDB	D&#D	D
Dr   c              #      ^^	#     SSK Jn  SSKJn  SSKJm  U H4  m	U" T	R                  5      u  pVU" XV5      nUU	4S jU 5        S h  vN   M6     g ! [         a  n[        S5      UeS nAff = f N'7f)Nr   )split_gcs_uri)_get_shardsrS   rU   c              3      >#    U HT  nUR                    HA  n[        T" UR                  UR                  5      UR                  TR
                  S .S9v   MC     MV     g7frc   )rW   r
   rh   rj   rk   r   )rm   shardrd   rT   results      r!   ro   1DocAIParser.parse_from_results.<locals>.<genexpr>  sZ      
 $E!KKD !24;;

!K&*&6&6&BTBTU
 ( $s   AA)7google.cloud.documentai_toolbox.utilities.gcs_utilitiesr   1google.cloud.documentai_toolbox.wrappers.documentr   rs   rT   r<   r   )
r=   r   r   r   r?   gcs_bucket_name
gcs_prefixshardsrT   r   s
           @@r!   r   DocAIParser.parse_from_results   s     
	 VW F*78J8J*K'O =F
 $     	A 	s3   A3A 1A3	A1
	A3
A.A))A..A3r   r   c           	           SSK Jn  U Vs/ sH   nU R                  R	                  U" US9S9PM"     sn$ ! [         a  n[        S5      UeSnAff = fs  snf )z5Initializes Long-Running Operations from their names.r   )GetOperationRequestzhlong running operations package not found, please install it with `pip install gapic-google-longrunning`N)r^   )request)!google.longrunning.operations_pb2r   r<   r9   get_operation)r=   r   r   r?   r^   s        r!   operations_from_names!DocAIParser.operations_from_names
  ss    	 (
' LL&&/B/M&N'
 	
  	: 	
s   5 &A
AAAr   c                 &    [        S U 5       5      $ )Nc              3   H   #    U H  oR                  5       (       + v   M     g 7f)N)done)rm   r   s     r!   ro   )DocAIParser.is_running.<locals>.<genexpr>  s     6:Rwwy==:s    ")any)r=   r   s     r!   r   DocAIParser.is_running  s    6:666r    i  )r*   r+   
batch_sizerJ   rK   r   c                    SSK Jn  SSKJnJn	  U=(       d    U R                  nUc  [        S5      eU=(       d    U R                  nUc  [        S5      e/ n[        XAS9 H  nUR                  UR                  U Vs/ sH0  nUR                  UR                  UR                  =(       d    S	S
9PM2     snS9S9nUR                  UR                  R!                  XS9S9nU(       a
  U	" U" US9S9OSnUR#                  U R$                  R'                  UR)                  UUUUSS95      5        M     U$ ! [
         a  n
[        S5      U
eSn
A
ff = fs  snf )a  Runs Google Document AI PDF Batch Processing on a list of blobs.

Args:
    blobs: a list of blobs to be parsed
    gcs_output_path: a path (folder) on GCS to store results
    processor_name: name of a Document AI processor.
    batch_size: amount of documents per batch
    enable_native_pdf_parsing: a config option for the parser
    field_mask: a comma-separated list of which fields to include in the
        Document AI response.
        suggested: "text,pages.pageNumber,pages.layout"

Document AI has a 1000 file limit per batch, so batches larger than that need
to be split into multiple requests.
Batch processing is an async long-running operation
and results are stored in a output GCS bucket.
r   rN   )rQ   rR   r.   Nr~   z0A Document AI processor name should be provided.)sizeiterablerX   rY   )	documents)gcs_documents)rZ   rK   )gcs_output_configrV   )r\   T)r^   input_documentsdocument_output_configr`   ra   )rq   rO   rr   rQ   rR   r<   r7   r4   r8   r   BatchDocumentsInputConfigGcsDocumentsrv   rl   rw   DocumentOutputConfigGcsOutputConfigappendr9   batch_process_documentsBatchProcessRequest)r=   rz   r*   r+   r   rJ   rK   rO   rQ   rR   r?   r   r   batchrC   input_configoutput_configr`   s                     r!   r   DocAIParser.docai_parse  s   6	/R &>)>)>L  (?4+?+?!OPP
"
CE%??(55 %*
 %*D	 #..$(II&*mm&H7H /  %* 6  @ 
L ';;","A"A"Q"Q' #R # < M - (2K    4422+(4/<(7*. 3 
9 DN o  	9 	&s   D5 >6E5
E?EEc           
          SSK Jn  U VVs/ sH  n[        UR                  U5      (       a  UR                  R
                  O.UR                  UR                  R                  5      R
                   H!  n[        UR                  UR                  S9PM#     M     snn$ ! [         a  n[        S5      UeS nAff = fs  snnf )Nr   )BatchProcessMetadatar.   )r   r   )google.cloud.documentai_v1r   r<   
isinstancerg   individual_process_statusesdeserializevaluer   input_gcs_sourceoutput_gcs_destination)r=   r   r   r?   r   statuss         r!   r   DocAIParser.get_resultsu  s    	G !

 ! bkk+?@@ 77)55KK%%--	.   "33"99. !
 	
  	9 	
s   B BB=
B:)B55B:)r9   r7   r8   )TNN)Ni  <   )r   r   r   r   r   r   r   rA   r   r   r
   rH   r3   r   intrx   r   rG   r   r   r   r   r   r   r   r   r    r!   r&   r&   &   s    >B"&)-(,8 9:8 3-	8
 "#8 !8t	St 	S(: 	S +/$(*.F
F
 $(F
 SM	F

 T#Y'F
 
(	F
V *.%'/4~/4 "#/4 	/4
  #/4 
(	/4b/0	(	4
T#Y 
4CT 
"7T+%6 74 7 *.(,*.$(U~U "#	U
 !U U $(U SMU 
k	Un
d;&7 
DAT<U 
r    r&   )!r   loggingr5   r   dataclassesr   typingr   r   r   r   r   langchain_core._api.deprecationr	   langchain_core.documentsr
   langchain_core.utils.iterr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   &langchain_community.utilities.vertexair   google.api_core.operationr   r;   r   	getLoggerr   r   r   r&   r   r    r!   <module>r      s     	  ! D D 6 - 3 D B B3F 
		8	$    
?
`
. `

`
r    