
    dh                    F   S r SSKJr  SSKrSSKrSSKrSSKrSSKrSSKJr  SSK	J
r
  SSKJr  SSKJrJrJrJrJrJrJrJrJrJrJr  SSKJr  SSKrSSKrSS	KJr  SS
KJ r   SSK!J"r"  SSK#J$r$J%r%  \(       a  SSK&r&SSK'r'SSK(r(SSK)r)SSK*J+r+  / SQr,/ SQr-    S)S jr.\R^                  " \05      r1Sr2Sr3Sr4Sr51 Skr6S*S jr7S+S jr8S+S jr9SS/r:S,S jr; " S S\ 5      r< " S S\ 5      r= " S S \ 5      r> " S! S"\ 5      r? " S# S$\ 5      r@ " S% S&\ 5      rA " S' S(\ 5      rBg)-z(Module contains common parsers for PDFs.    )annotationsN)datetime)Path)TemporaryDirectory)TYPE_CHECKINGAnyBinaryIOIterableIteratorLiteralMappingOptionalSequenceUnioncast)urlparse)Document)BaseBlobParser)Blob)BaseImageBlobParserRapidOCRBlobParser)TextLinearizationConfig)	DCTDecodeDCT	JPXDecode)	LZWDecodeLZWFlateDecodeFlASCII85DecodeA85ASCIIHexDecodeAHxRunLengthDecodeRLCCITTFaxDecodeCCFJBIG2Decodec                     SSK Jn  U" 5       nSnU  H=  nU" U5      u  pVU(       d  M  U Vs/ sH  o3S   PM	     nnWSR                  U5      -  nM?     U$ ! [         a    [        S5      ef = fs  snf )zExtract text from images with RapidOCR.

Args:
    images: Images to extract text from.

Returns:
    Text extracted from images.

Raises:
    ImportError: If `rapidocr-onnxruntime` package is not installed.
r   )RapidOCRzc`rapidocr-onnxruntime` package not found, please install it with `pip install rapidocr-onnxruntime`    
)rapidocr_onnxruntimer*   ImportErrorjoin)imagesr*   ocrtextimgresult_s          h/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/parsers/pdf.py!extract_from_images_with_rapidocrr8   @   s    
1 *CDH	6*01&$1g&F1DIIf%%D	 
 K  
1
 	

 2s   A A/A,z

{image_text}

r-   z
>   sourcecreatorproducertotal_pagescreationdatec                    U(       aZ  U R                   =(       d    SnUS:X  a  UR                  SS5      nSU SU S3nU$ US:X  a  S	[        R                  " US
S9 SU S3nU$ )a  Format the content of the image with the source of the blob.

blob: The blob containing the image.
format::
  The format for the parsed output.
  - "text" = return the content as is
  - "markdown-img" = wrap the content into an image markdown link, w/ link
  pointing to (`![body)(#)`]
  - "html-img" = wrap the content as the `alt` text of an tag and link to
  (`<img alt="{body}" src="#"/>`)
#zmarkdown-img]z\\]z![z]()zhtml-imgz
<img alt="T)quotez src="z" />)r9   replacehtmlescape)blobcontentformatr9   s       r7   _format_inner_imagerI   i   sx     #^#ooc62G7)2fXQ/G N z!"4;;wd#C"DF6(RVWGN    c                    [         R                  U R                  5       5      (       d  [        S5      e[	        U R                  SS5      [        5      (       d  [        S5      eU $ )zValidate that the metadata has all the standard keys and the page is an integer.

The standard keys are:
- source
- total_page
- creationdate
- creator
- producer

Validate that page is an integer if it is present.
z3The PDF parser must valorize the standard metadata.pager   z(The PDF metadata page must be a integer.)_STD_METADATA_KEYSissubsetkeys
ValueError
isinstancegetint)metadatas    r7   _validate_metadatarU      sP     &&x}}77NOOhll61-s33CDDOrJ   c                <   0 nSSS.nU R                  5        H  u  p4[        U5      [        [        4;  a  [        U5      nUR	                  S5      (       a  USS nUR                  5       nUS;   a;   [        R                  " UR                  SS	5      S
5      R                  S5      X'   M  X2;   a  XAX#   '   XAU'   M  [        U[        5      (       a  UR                  5       X'   M  [        U[        5      (       d  M  XAU'   M     U$ ! [         a	    XAU'    GM  f = f)zPurge metadata from unwanted keys and normalize key names.

Args:
    metadata: The original metadata dictionary.

Returns:
    The cleaned and normalized the key format of metadata dictionary.
r<   r9   )
page_count	file_path/r,   N)r=   moddate'r+   zD:%Y%m%d%H%M%S%zT)itemstypestrrS   
startswithlowerr   strptimerC   	isoformatrP   rQ   strip)rT   new_metadatamap_keykvs        r7   _purge_metadatari      s    $&L#G  73*$AA<<!"AGGI++$"*"3"3IIc2&(:#)C. 
 \'($O3ggiLO3O) !*   $"#Q$s   38DDDz




c                   ^         SU4S jjmT" XS5      nU(       d5  SnSR                  [        S U 5      5      nU(       a  [        S   U-   nX-   nU$ )a  Insert extras such as image/table in a text between two paragraphs if possible,
else at the end of the text.

Args:
    extras: List of extra content (images/tables) to insert.
    text_from_page: The text content from the page.

Returns:
    The merged text with extras inserted.
c                &  > U (       a  [          Hw  nUR                  U5      nUS:w  d  M  S nU(       a  T	" XS U S5      nU(       a  XQUS  -   nO6SnSR                  [        S U 5      5      nU(       a  X8-   nUS U U-   XS  -   n  U$    S n U$ UnU$ )NFr+   rj   c                    U $ N xs    r7   <lambda>O_merge_text_and_extras.<locals>._recurs_merge_text_and_extras.<locals>.<lambda>   s    !rJ   )_PARAGRAPH_DELIMITERrfindr0   filter)
extrastext_from_pagerecursdelimposprevious_textall_text
all_extras
str_extras_recurs_merge_text_and_extrass
            r7   r   =_merge_text_and_extras.<locals>._recurs_merge_text_and_extras   s     -$**51"9$(M(E"4C$8%) %#0#$3G#G%'
%+[[V1L%M
%).);J*4C0:=t@TT ! 
 1 .*    &HrJ   Tr+   rj   c                    U $ ro   rp   rq   s    r7   rs   (_merge_text_and_extras.<locals>.<lambda>   s    !rJ   rm   )rx   	list[str]ry   r_   rz   boolreturnOptional[str])r0   rw   ru   )rx   ry   r~   r   r   r   s        @r7   _merge_text_and_extrasr      sr    +.8<	< -VTJH
[[V!<=
-b1J>J!.OrJ   c                  |   ^  \ rS rSrSr  SS\SSSSS.               SU 4S jjjjrSS	 jrSS
 jrSr	U =r
$ )PyPDFParser   a  Parse a blob from a PDF using `pypdf` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images.
    It integrates the 'pypdf' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFParser

            parser = PyPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NrL   r3   plain)modepages_delimiterimages_parserimages_inner_formatextraction_modeextraction_kwargsc                  > [         T	U ]  5         US;  a  [        S5      eX l        U(       a  U(       d
  [	        5       nXPl        X`l        Xl        X0l        X@l	        Xpl
        U=(       d    0 U l        g)uM  Initialize a parser based on PyPDF.

Args:
    password: Optional password for opening encrypted PDFs.
    extract_images: Whether to extract images from the PDF.
    mode: The extraction mode, either "single" for the entire document or "page"
        for page-wise extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    images_parser: Optional image blob parser.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)
    extraction_mode: “plain” for legacy functionality, “layout” extract text
        in a fixed width format that closely adheres to the rendered layout in
        the source pdf.
    extraction_kwargs: Optional additional parameters for the extraction
        process.

Raises:
    ValueError: If the `mode` is not "single" or "page".
singlerL   mode must be single or pageN)super__init__rP   extract_imagesr   r   r   passwordr   r   r   r   )
selfr   r   r   r   r   r   r   r   	__class__s
            r7   r   PyPDFParser.__init__$  sh    J 	)):;;,-.0M*#6  	..!2!8brJ   c              #  J  ^ ^#     SSK mSUU 4S jjnUR                  5        nTR                  " UT R                  S9n[        SSSS	.[        [        UR                  =(       d    0 5      -  UR                  [        UR                  5      S
.-  5      n/ n[        UR                  5       H  u  pxU" US9n	T R                  U5      n
[        U
/U	5      R                  5       nT R                   S:X  a*  [#        U[%        UUUR&                  U   S.-  5      S9v   Mq  UR)                  U5        M     T R                   S:X  a.  [#        T R*                  R-                  U5      [%        U5      S9v   SSS5        g! [         a    [        S5      ef = f! , (       d  f       g= f7f)  
Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.

Args:
    blob: The blob to parse.

Raises:
    ImportError: If the `pypdf` package is not found.

Yield:
    An iterator over the parsed documents.
r   NzE`pypdf` package not found, please install it with `pip install pypdf`rL   c                   > TR                   R                  S5      (       a  U R                  5       $ U R                  " SSTR                  0TR                  D6$ )z
Extract text from image given the version of pypdf.

Args:
    page: The page object to extract text from.

Returns:
    str: The extracted text.
3r   rp   )__version__r`   extract_textr   r   )rL   pypdfr   s    r7   _extract_text_from_page7PyPDFParser.lazy_parse.<locals>._extract_text_from_pagem  sZ       ++C00((**(( $($8$8,, rJ   r   PyPDFr+   r;   r:   r=   )r9   r<   )rL   )rL   
page_labelpage_contentrT   r   )rL   zpypdf.PageObjectr   r_   )r   r/   as_bytes_io	PdfReaderr   ri   r   dictrT   r9   lenpages	enumerateextract_images_from_pager   rd   r   r   rU   page_labelsappendr   r0   )r   rF   r   pdf_file_obj
pdf_readerdoc_metadatasingle_textspage_numberrL   ry   images_from_pager~   r   s   `           @r7   
lazy_parsePyPDFParser.lazy_parseW  s    		 	$ <NJ*$"MtZ006B78 #kk#&z'7'7#8L L%.z/?/?%@!!8d!C#'#@#@#F 1%&%'  99&"%-!3((3.8.D.D[.Q"	 	 !''1% &A& yyH$!%!5!5!:!:<!H/= A  /  	W 	.  s3   F#E9 F#EF0	F#9FF#
F F#c           	        U R                   (       d  gSSKnSSKJn  S[	        [
        US   5      R                  5       ;  a  gUS   S   R                  5       n/ nU GH  nSnXF   S   S:X  d  M  [        XF   S	   5      UR                  R                  R                  L a
  XF   S	   S
S OXF   S	   S   S
S nU[        ;   aQ  XF   S   XF   S   p[        R                  " XF   R                  5       [        R                   S9R#                  XS5      nOiU[$        ;   aJ  [        R&                  " UR)                  [*        R,                  " XF   R                  5       5      5      5      nO[.        R1                  S5        Uc  GM%  [*        R,                  " 5       nUR3                  5       R4                  S:X  a  GM[  UR7                  U5      R9                  USS9  [:        R<                  " UR?                  5       SS9n[A        U R                   RC                  U5      5      RD                  nURG                  [I        XU RJ                  5      5        GM     [L        RO                  [P        RS                  [U        SU5      5      S9$ )Extract images from a PDF page and get the text using images_to_text.

Args:
    page: The page object from which to extract images.

Returns:
    str: The extracted text from the images on the page.
r+   r   NImagez/XObjectz
/Resourcesz/Subtypez/Imagez/Filterr,   z/Heightz/Widthdtyperm   Unknown PDF Filter!PNG)rH   z	image/png	mime_type
image_text)+r   r   PILr   r   r   rO   
get_objectr^   generic_base
NameObject_PDF_FILTER_WITHOUT_LOSSnp
frombufferget_datauint8reshape_PDF_FILTER_WITH_LOSSarrayopenioBytesIOloggerwarning	getbuffernbytes	fromarraysaver   	from_datagetvaluenextr   r   r   rI   r   _FORMAT_IMAGE_STRrH   _JOIN_IMAGESr0   rw   )r   rL   r   r   xObjectr1   objnp_image
img_filterheightwidthimage_bytesrF   r   s                 r7   r   $PyPDFParser.extract_images_from_page  s'    !!T$\(:;@@BB|$Z0;;=C H|J'83 GL348K8K8V8VV L+AB/ i03AB7 
 !99$+L$;W\(=SE!}}--/rxx gfR0   #88!xx

2::gl>S>S>U3V(WXH NN#89'"$**,K",,.55: OOH-22;u2M>>+*>*>*@KXD!%d&8&8&C&CD&I!J!W!WJMM+Dd>V>VW9 > !''#((f)=> ( 
 	
rJ   )r   r   r   r   r   r   r   r   NF)r   zOptional[Union[str, bytes]]r   r   r   Literal['single', 'page']r   r_   r   Optional[BaseImageBlobParser]r   +Literal['text', 'markdown-img', 'html-img']r   zLiteral['plain', 'layout']r   Optional[dict[str, Any]]rF   r   r   Iterator[Document])rL   zpypdf._page.PageObjectr   r_   )__name__
__module____qualname____firstlineno____doc___DEFAULT_PAGES_DELIMITERr   r   r   __static_attributes____classcell__r   s   @r7   r   r      s    .d 15$19
 +177;KQ6=6:19-19 19
 (19 19 519 I19 419 419 19fKZ4
 4
rJ   r   c                     ^  \ rS rSrSrSr SSS\SSSS.             SU 4S jjjjr\SS	 j5       r	\SS
 j5       r
  S       SS jjrSS jrSrU =r$ )PDFMinerParseri  a  Parse a blob from a PDF using `pdfminer.six` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pdfminer.six pillow

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PDFMinerParser

            parser = PDFMinerParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    FNr   r3   )r   r   r   r   r   concatenate_pagesc               N  > [         TU ]  5         US;  a  [        S5      eU(       a  U(       d
  [        5       nXl        XPl        X`l        X l        X0l        X@l	        UbF  [        R                  (       d   S[        l        [        R                  S5        U(       a  SOSU l        gg)a  Initialize a parser based on PDFMiner.

Args:
    password: Optional password for opening encrypted PDFs.
    mode: Extraction mode to use. Either "single" or "page" for page-wise
        extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    extract_images: Whether to extract images from PDF.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)
    concatenate_pages: Deprecated. If True, concatenate all PDF pages
        into one a single document. Otherwise, return one document per page.

Returns:
    This method does not directly return data. Use the `parse` or `lazy_parse`
    methods to retrieve parsed documents with content and metadata.

Raises:
    ValueError: If the `mode` is not "single" or "page".

Warnings:
    `concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'
    instead.
r   r   NTzS`concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'` instead.r   rL   )r   r   rP   r   r   r   r   r   r   r   r   _warn_concatenate_pagesr   r   )	r   r   r   r   r   r   r   r   r   s	           r7   r   PDFMinerParser.__init__  s    P 	)):;;-.0M,*#6  	.(!999=6= %66DI )rJ   c                  ^ SSK Jm  [        U [        5      (       a&  U R	                  S5      (       a  [        U SS SS5      $  S U  5       nS	R                  U4S
 jU 5       5      $ ! [         a    [        U 5      s $ f = f)z
Decodes a PDFDocEncoding string to Unicode.
Adds py3 compatibility to pdfminer's version.

Args:
    s: The string to decode.

Returns:
    str: The decoded Unicode string.
r   )PDFDocEncodings      Nzutf-16beignorec              3  d   #    U H'  n[        U[        5      (       a  [        U5      OUv   M)     g 7fro   )rQ   r_   ord).0cs     r7   	<genexpr>-PDFMinerParser.decode_text.<locals>.<genexpr>]  s%     CAjC00CFa7s   .0r+   c              3  ,   >#    U H
  nTU   v   M     g 7fro   rp   )r
  or  s     r7   r  r  ^  s     ;d>!,ds   )pdfminer.utilsr  rQ   bytesr`   r_   r0   
IndexError)sordsr  s     @r7   decode_textPDFMinerParser.decode_textL  st     	2aALL$=$=quj(33	CCD77;d;;; 	q6M	s   #A( (B ?B c                   SSK Jn  [        U S5      (       a  U R                  5       n [	        U [
        5      (       a#  [        [        [        R                  U 5      5      $ [	        X5      (       a  [        R                  U R                  5      $ [	        U [        [        45      (       a  [        R                  U 5      $ [	        U [        5      (       a2  U R                  5        H  u  p#[        R                  U5      X'   M     U $ U $ )z
Recursively resolve the metadata values.

Args:
    obj: The object to resolve and decode. It can be of any type.

Returns:
    The resolved and decoded object.
r   )	PSLiteralresolve)pdfminer.psparserr  hasattrr  rQ   listmapr   resolve_and_decoder  namer_   r  r   r]   )r   r  rg   rh   s       r7   r  !PDFMinerParser.resolve_and_decodeb  s     	03	""++-Cc4  N==sCDD''!--chh77c5\**!--c22T""		'::1= $J
rJ   c           	        SSK JnJnJn  U" U5      nU" XrUS9n0 n	UR                   H  n
U	R                  U
5        M     U	R                  5        H  u  p [        R                  U5      X'   M     [        [        UR                  U5      5      5      U	S'   U	$ ! [         a+  n[        R                  SU[        U5      5         SnAMy  SnAff = f)a  
Extract metadata from a PDF file.

Args:
    fp: The file pointer to the PDF file.
    password: The password for the PDF file, if encrypted. Defaults to an empty
        string.
    caching: Whether to cache the PDF structure. Defaults to True.

Returns:
    Metadata of the PDF file.
r   )PDFDocumentPDFPage	PDFParser)r   cachingzD[WARNING] Metadata key "%s" could not be parsed due to exception: %sNr<   )pdfminer.pdfpager"  r#  r$  infoupdater]   r   r  	Exceptionr   r   r_   r   r  create_pages)r   fpr   r%  r"  r#  r$  parserdocrT   r'  rg   rh   es                 r7   _get_metadataPDFMinerParser._get_metadata~  s    $ 	ED 2&WEHHDOOD! NN$DA
,??B % #&d7+?+?+D&E"F   $F	 s   B
C# C		Cc              #    ^ ^^^^^^#     SSK nSSKJn  SSKJnJmJmJnJnJ	mJ
m  SSKJnJn  SSKJn	  [!        UR"                  5      S:  a  [%        S5      e UR'                  5        n
[)        5        mU	R+                  U
T R,                  =(       d    S
S9nU" 5       n[/        SSS
S.T R1                  U
T R,                  =(       d    S
S9-  5      nUR2                  US'    " UUUUU UU4S jSU5      n[4        R6                  " 5       mU" X" X" 5       S95      n/ n[9        U5       H  u  nnTR;                  S5        TR=                  S5        UR?                  U5        TRA                  5       nURC                  5       nT RD                  S:X  a>  TR;                  S5        TR=                  S5        [G        U[I        USU0-  5      S9v   M  URK                  S5      (       a  USS nURM                  U5        M     T RD                  S:X  a0  T RN                  RQ                  U5      n[G        U[I        U5      S9v   SSS5        SSS5        g! [$         a    [%        S	5      ef = f! , (       d  f       N0= f! , (       d  f       g= f7f)a0  
Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.

Args:
    blob: The blob to parse.

Raises:
    ImportError: If the `pdfminer.six` or `pillow` package is not found.

Yield:
    An iterator over the parsed documents.
r   N)PDFLayoutAnalyzer)LAParamsLTContainerLTImageLTItemLTPageLTText	LTTextBox)PDFPageInterpreterPDFResourceManager)r#  i:>4zThis parser is tested with pdfminer.six version 20201018 or later. Remove pdfminer, and install pdfminer.six with `pip uninstall pdfminer && pip install pdfminer.six`.zMpdfminer package not found, please install it with `pip install pdfminer.six`r+   r   PDFMinerr   r9   c                  `   >^  \ rS rSr  S       SU 4S jjjrSUUUUUUU4S jjrSrU =r$ )*PDFMinerParser.lazy_parse.<locals>.Visitori  c                "   > [         TU ]  XUS9  g )N)pagenolaparams)r   r   )r   rsrcmgrr@  rA  r   s       r7   r   3PDFMinerParser.lazy_parse.<locals>.Visitor.__init__  s     G$Wh$OrJ   c           	     6   >^ SUUUUUUUU	4S jjmT" U5        g )Nc                  > [        U T5      (       a  U  H  nT" U5        M     O0[        U T	5      (       a  TR                  U R                  5       5        [        U T
5      (       a  TR                  S5        g [        U T5      (       a  TR                  (       a  SSKJn  U" T5      nUR                  U 5      n[        R                  " [        T5      U-  5      nSUR                  S'   [        TR                  R                  U5      5      R                  nTR                  [        XVTR                  5      5        g g g )Nr-   r   )ImageWriterr?   r9   )rQ   writeget_textr   pdfminer.imagerF  export_imager   	from_pathr   rT   r   r   r   rI   r   )itemchildrF  image_writerfilenamerF   r   r4  r5  r8  r9  renderr   tempdirtext_ios          r7   rP  IPDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout.<locals>.render  s   %dK88)- &u *.'f55#MM$--/:%dI66#MM$/'g66#11 F/:7/C+7+D+DT+J'+~~d7mh6N'O:=h 7-1$($6$6$A$A$$G.""., !+ !($7(,$:R:R%&!"  2" !rJ   )rL  r6  r   Nonerp   )
meltpagerP  r4  r5  r8  r9  r   rQ  rR  s
     @r7   receive_layout9PDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout  s    ! !8 6NrJ   rp   )r,   N)rB  r;  r@  rS   rA  zOptional[LAParams]r   rT  )rV  r7  r   rT  )r   r   r   r   r   rW  r   r   )r   r4  r5  r8  r9  r   rQ  rR  s   @r7   Visitorr>    sP     #$37	P/P  P 1	P
 P P# # #rJ   rY  )rA  rL   r   rm   r   ))pdfminerpdfminer.converterr2  pdfminer.layoutr3  r4  r5  r6  r7  r8  r9  pdfminer.pdfinterpr:  r;  r&  r#  rS   r   r/   r   r   	get_pagesr   ri   r/  r9   r   StringIOr   truncateseekprocess_pager   rd   r   r   rU   endswithr   r   r0   )r   rF   r[  r2  r3  r6  r7  r:  r;  r#  r   r   rB  r   rY  visitor_for_allall_contentirL   r~   document_contentr4  r5  r8  r9  rQ  rR  s   `                    @@@@@@r7   r   PDFMinerParser.lazy_parse  s[    	<   R08''(83!L  4 <1C1E%%lT]]=Pb%QE(*G*'JPRS$$\DMM<OR$PQL &*[[L"&# &#+ &#P kkmG08:>O K$U+4  #Q,,T2"++-#>>+99&$$Q'LLO"%-!3LFA;4N!O 
  ((..#+CR=&&x0% ,& yyH$#'#7#7#<#<[#I !1/= Y 2F  	2 	 2F1EsN   JAI J*I;5GI* I;	JI''J*
I8	4I;;
J	Jr   r   r   r   r   r   F)r   r   r   r   r   r   r   r_   r   r   r   r   r   zOptional[bool])r  zUnion[bytes, str]r   r_   )r   r   r   r   )r+   T)r+  r	   r   r_   r%  r   r   dict[str, Any]r   )r   r   r   r   r   r  r   r   staticmethodr  r  r/  r   r   r   r   s   @r7   r   r     s    0d $  %:B #'*277;KQ,0:B:B  	:B
 (:B :B 5:B I:B *:B :Bx  *  < 	,, , 	,
 
,\y yrJ   r   c            	         ^  \ rS rSrSr\R                  " 5       r  SSS\SSSSS.                   SU 4S jjjjr	SS jr
 S     SS	 jjr        SS
 jrSS jr      SS jrSS jrSrU =r$ )PyMuPDFParseri(  a  Parse a blob from a PDF using `PyMuPDF` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pymupdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyMuPDFParser

            parser = PyMuPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
                # extract_tables="markdown",
                # extract_tables_settings=None,
                # text_kwargs=None,
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NrL   r3   )r   r   r   r   r   extract_tablesextract_tables_settingsc               $  > [         T
U ]  5         US;  a  [        S5      eU(       a  US;  a  [        S5      eX@l        XPl        X0l        U=(       d    0 U l        U(       a  U(       d
  [        5       nX l        Xpl	        X`l
        Xl        Xl        g)a  Initialize a parser based on PyMuPDF.

Args:
    password: Optional password for opening encrypted PDFs.
    mode: The extraction mode, either "single" for the entire document or "page"
        for page-wise extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    extract_images: Whether to extract images from the PDF.
    images_parser: Optional image blob parser.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)
    extract_tables: Whether to extract tables in a specific format, such as
        "csv", "markdown", or "html".
    extract_tables_settings: Optional dictionary of settings for customizing
        table extraction.

Returns:
    This method does not directly return data. Use the `parse` or `lazy_parse`
    methods to retrieve parsed documents with content and metadata.

Raises:
    ValueError: If the mode is not "single" or "page".
    ValueError: If the extract_tables format is not "markdown", "html",
    or "csv".
r   r   )markdownrD   csvzmode must be markdownN)r   r   rP   r   r   r   text_kwargsr   r   r   r   rp  rq  )r   ru  r   r   r   r   r   r   rp  rq  r   s             r7   r   PyMuPDFParser.__init__a  s    V 	)):;;n4OO455	. &,"-.0M,#6 *,'>$rJ   c                $    U R                  U5      $ ro   )_lazy_parse)r   rF   s     r7   r   PyMuPDFParser.lazy_parse  s    
 	
rJ   c              #  "  #     SSK nU=(       d    U R                  nU R                  (       dN  SSKJnJnJnJn  0 SS_SS_SS_SS_S	S_S
U_SS_SS_SU_SS_SS_SS_SU_SU_SS_SS_SS_SSSSSS.EU l        [        R                     UR                  5        nUR                  c  UR                  " U5      n	OUR                  " USS9n	U	R                  (       a  U	R                  U R                   5        SSSS.U R#                  X5      -  n
/ nU	 Hj  nU R%                  XU5      R'                  5       nU R(                  S:X  a&  [+        U[-        U
SUR.                  0-  5      S9v   MY  UR1                  U5        Ml     U R(                  S :X  a.  [+        U R2                  R5                  U5      [-        U
5      S9v   SSS5        SSS5        g! [         a    [        S5      ef = f! , (       d  f       N0= f! , (       d  f       g= f7f)!a  Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.

Args:
    blob: The blob to parse.
    text_kwargs: Optional keyword arguments to pass to the `get_text` method.
        If provided at run time, it will override the default text_kwargs.

Raises:
    ImportError: If the `pypdf` package is not found.

Yield:
    An iterator over the parsed documents.
r   N)DEFAULT_JOIN_TOLERANCEDEFAULT_MIN_WORDS_HORIZONTALDEFAULT_MIN_WORDS_VERTICALDEFAULT_SNAP_TOLERANCEclipvertical_strategylineshorizontal_strategyvertical_lineshorizontal_linessnap_tolerancesnap_x_tolerancesnap_y_tolerancejoin_tolerancejoin_x_tolerancejoin_y_toleranceedge_min_length   min_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerance)text_tolerancetext_x_tolerancetext_y_tolerancestrategy	add_lineszGpymupdf package not found, please install it with `pip install pymupdf`pdf)streamfiletypePyMuPDFr+   r   rL   r   r   )pymupdfru  rq  pymupdf.tabler{  r|  r}  r~  r/   ro  _lockr   datar   is_encryptedauthenticater   _extract_metadata_get_page_contentrd   r   r   rU   numberr   r   r0   )r   rF   ru  r  r{  r|  r}  r~  rX   r-  r   full_contentrL   r~   s                 r7   rx  PyMuPDFParser._lazy_parse  s    ,)	%9)9)9K// 0D0 (0 *7	0
 %d0 '0 %&<0 '0 '0 %&<0 '0 '0 &q0 )*D0 +,H0  -a!0" /#0$ /%0& '(()() $!%/0,>   !!#y99$!,,y1C!,,i%HC##$$T]]3 )($&  **35	 6
  "D#55cMSSUHyyF*&)1%7 ,/D D&  %++H5   99("%)%9%9%>%>|%L!3L!A 5 $ !   	- 	 $# ! sN   HA8G <HG>D&G-G>	HG**H-
G;	7G>>
HHc                   UR                   " S0 0 U R                  EUED6nU R                  X5      nU R                  U5      n/ nU(       a  UR	                  U5        U(       a  UR	                  U5        [        Xt5      nU$ )zGet the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty.

Args:
    doc: The PyMuPDF document object.
    page: The PyMuPDF page object.
    blob: The blob being parsed.

Returns:
    str: The text content of the page.
rp   )rH  ru  _extract_images_from_page_extract_tables_from_pager   r   )	r   r-  rL   ru  ry   r   tables_from_pagerx   r~   s	            r7   r  PyMuPDFParser._get_page_content  s|    " M)LD,<,<)L)LM99#D99$?MM*+MM*+)&ArJ   c                n   [        0 SSSUR                  UR                  [        U5      S.EUR                   Vs0 sH<  n[	        UR                  U   [
        [        45      (       d  M-  X1R                  U   _M>     snE5      nS H%  nX1R                  ;   d  M  UR                  U   XC'   M'     U$ s  snf )zExtract metadata from the document and page.

Args:
    doc: The PyMuPDF document object.
    blob: The blob being parsed.

Returns:
    dict: The extracted metadata.
r  r+   )r;   r:   r=   r9   rX   r<   )modDatecreationDate)ri   r9   r   rT   rQ   r_   rS   )r   r-  rF   rg   rT   s        r7   r  PyMuPDFParser._extract_metadata!  s     # )($&"kk!%#&s8 !\\)!#,,q/C:> 'A||A&)
" -ALL !ll1o - s   +B2
*B2
c                B   U R                   (       d  gSSKnUR                  5       n/ nU GHA  nU R                   (       d  M  US   nUR                  " X5      n[        R
                  " UR                  [        R                  S9R                  UR                  UR                  S5      n	[        R                  " 5       n
U
R                  5       R                  S:X  a  M  [        R                   " X5        ["        R$                  " U
R'                  5       SS9n[)        U R                   R+                  U5      5      R,                  nUR/                  [1        XU R2                  5      5        GMD     [4        R7                  [8        R;                  [=        SU5      5      S9$ )	zExtract images from a PDF page and get the text using images_to_text.

Args:
    doc: The PyMuPDF document object.
    page: The PyMuPDF page object.

Returns:
    str: The extracted text from the images on the page.
r+   r   Nr   rm   application/x-npyr   r   )r   r  
get_imagesPixmapr   r   samplesr   r   r   r   r   r   r   r   numpyr   r   r   r   r   r   r   r   rI   r   r   rH   r   r0   rw   )r   r-  rL   r  img_listr1   r4   xrefpiximager   rF   r   s                r7   r  'PyMuPDFParser._extract_images_from_pageA  sA    !!??$C!!!1vnnS/ckkBJJJJ		2 !jjl((*11Q6

;.~~((*6I "$"4"4"?"?"EFSS
'$:R:RS# ( !''#((f)=> ( 
 	
rJ   c                   U R                   c  gSSKn[        UR                  R                  " U40 U R
                  D65      nU(       a  U R                   S:X  a2  [        R                  U Vs/ sH  oDR                  5       PM     sn5      $ U R                   S:X  aB  [        R                  U Vs/ sH"  nUR                  5       R                  SSSS9PM$     sn5      $ U R                   S:X  aA  [        R                  U Vs/ sH!  nUR                  5       R                  SSS	9PM#     sn5      $ [        S
U R                    S35      egs  snf s  snf s  snf )zExtract tables from a PDF page.

Args:
    page: The PyMuPDF page object.

Returns:
    str: The extracted tables in the specified format.
Nr+   r   rs  rD   F)headerindex	bold_rowsrt  )r  r  zextract_tables z not implemented)rp  r  r  tablefind_tablesrq  _JOIN_TABLESr0   to_markdown	to_pandasto_htmlto_csvrP   )r   rL   r  tables_listr  s        r7   r  'PyMuPDFParser._extract_tables_from_pagek  sr    &MM%%dKd.J.JK
 ""j0#((;)W;%*;*;*=;)WXX$$.#(( &1 &1E )11#("'&+ 2 
 &1	 	 $$-#(( &1
 &1E	 )00#("' 1  &1  !%d&9&9%::JK  5 *Xs   -E	/(E'E)	r   rp  rq  r   r   r   r   r   ru  r   )ru  r   r   r   r   r   r   r   r   r_   r   r   r   r   rp  z/Union[Literal['csv', 'markdown', 'html'], None]rq  r   r   rT  r   ro   )rF   r   ru  r   r   r   )r-  pymupdf.DocumentrL   pymupdf.Pageru  rl  r   r_   )r-  r  rF   r   r   r   )r-  r  rL   r  r   r_   )rL   r  r   r_   )r   r   r   r   r   	threadingLockr  r   r   r   rx  r  r  r  r  r   r   r   s   @r7   ro  ro  (  s3   2l NNE 15$;?
 #'*077;KQJN<@;?-;? ;?
  ;? (;? ;? 5;? I;? H;? ":;? 
;? ;?z
 15__
 ._ 
_B  $	
 
:@(
#(
+7(
	(
T, ,rJ   ro  c                     ^  \ rS rSrSr\R                  " 5       r SSS\SSS.             SU 4S jjjjr	SS jr
SS	 jrS
rU =r$ )PyPDFium2Parseri  ao  Parse a blob from a PDF using `PyPDFium2` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyPDFium2' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdfium2

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFium2Parser

            parser = PyPDFium2Parser(
                # password=None,
                mode="page",
                pages_delimiter="
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NrL   r3   )r   r   r   r   r   c                  > [         TU ]  5         US;  a  [        S5      eXl        U(       a  U(       d
  [	        5       nXPl        X`l        X l        X0l        X@l	        g)u  Initialize a parser based on PyPDFium2.

Args:
    password: Optional password for opening encrypted PDFs.
    mode: The extraction mode, either "single" for the entire document or "page"
        for page-wise extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    extract_images: Whether to extract images from the PDF.
    images_parser: Optional image blob parser.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)
    extraction_mode: “plain” for legacy functionality, “layout” for experimental
        layout mode functionality
    extraction_kwargs: Optional additional parameters for the extraction
        process.

Returns:
    This method does not directly return data. Use the `parse` or `lazy_parse`
    methods to retrieve parsed documents with content and metadata.

Raises:
    ValueError: If the mode is not "single" or "page".
r   r   N)
r   r   rP   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   s          r7   r   PyPDFium2Parser.__init__  sU    L 	)):;;,-.0M*#6  	.rJ   c              #  z  #     SSK n[        R                     UR	                  5        nSn UR
                  " X0R                  SS9n/ nSSSS.[        UR                  5       5      -  nUR                  US	'   [        U5      US
'   [        U5       H  u  pxUR                  5       n	SR                  U	R                  5       R                  5       5      n
U	R!                  5         U R#                  U5      n[%        U/U
5      R'                  5       nUR!                  5         U R(                  S:X  a8  UR+                  S5      (       d  US-  n[-        U[/        0 UESU0E5      S9v   M  UR1                  U5        M     U R(                  S:X  a.  [-        U R2                  R                  U5      [/        U5      S9v   U(       a  UR!                  5         SSS5        SSS5        g! [         a    [        S5      ef = f! U(       a  UR!                  5         f f = f! , (       d  f       NM= f! , (       d  f       g= f7f)r   r   NzKpypdfium2 package not found, please install it with `pip install pypdfium2`T)r   	autoclose	PyPDFium2r+   r   r9   r<   r-   rL   r   r   )	pypdfium2r/   r  r  r   PdfDocumentr   ri   get_metadata_dictr9   r   r   get_textpager0   get_text_range
splitlinescloser  r   rd   r   rd  r   rU   r   r   )r   rF   r  rX   r   r  r   r   rL   	text_pagery   image_from_pager~   s                r7   r   PyPDFium2Parser.lazy_parse  s)    	 ""!!#y!
1+!*!6!6!MMT"J $&L %0#.(*$ (
(D(D(FG	$HL
 .2[[L*25j/L/-6z-B)$($5$5$7	)-%446AAC* ")*.*H*H*N#9,-~$%' ! 

99.#+#4#4T#:#: (D 0"*-5);%&*6%&(.%&*"#  )//95 .C8 yyH,&)-)=)=)B)B<)P%7%E 
 ""((*g $ #"  	+ 	v ""((* "e $# #"sa   H;G# H;H*HFG<;HH*	H;#G99H;<HH
H'	#H**
H84H;c                   U R                   (       d  gSSKJn  [        UR	                  UR
                  4S95      nU(       d  g/ nU GH  n[        R                  " 5       nUR                  5       R                  5       nUR                  S:  a  MI  [        R                  " XeR                  5       R                  5       5        [        R                  " UR                  5       SS9n[!        U R                   R#                  U5      5      R$                  n	UR'                  [)        XU R*                  5      5        UR-                  5         GM     [.        R1                  [2        R5                  U5      S9$ )	r   r+   r   N)rw   r  r  r   r   )r   pypdfium2.rawrawr  get_objectsFPDF_PAGEOBJ_IMAGEr   r   
get_bitmapto_numpysizer  r   r   r   r   r   r   r   r   rI   r   r  r   rH   r   r0   )
r   rL   pdfium_cr1   
str_imagesr  r   r   rF   text_from_images
             r7   r  )PyPDFium2Parser._extract_images_from_pageR  s    !!(d&&x/J/J.L&MN
E**,K'')224H}}q JJ{$4$4$6$?$?$AB>>+"6"6"8DWXD"4#5#5#@#@#FGTTO#D4;S;ST KKM  !''<3D3DZ3P'QQrJ   rj  rk  )r   r   r   r   r   r   r   r_   r   r   r   r   r   rT  r   )rL   zpypdfium2._helpers.page.PdfPager   r_   )r   r   r   r   r   r  r  r  r   r   r   r  r   r   r   s   @r7   r  r    s    0h NNE  %0/ #'*077;KQ0/0/  	0/
 (0/ 0/ 50/ I0/ 
0/ 0/dM+^R RrJ   r  c                  X    \ rS rSrSr   S	       S
S jjrSS jrSS jrSS jrSr	g)PDFPlumberParseris  zParse `PDF` with `PDFPlumber`.Nc                x     SSK nU=(       d    0 U l        X l        X0l        g! [         a    [        S5      ef = f)zInitialize the parser.

Args:
    text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
    dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
r   NzEpillow package not found, please install it with `pip install pillow`)r   r/   ru  deduper   )r   ru  r  r   r   s        r7   r   PDFPlumberParser.__init__v  sE    	
 ',",  	W 	s   # 9c              #  n  #    SSK nUR                  5        nUR                  " U5      nUR                   VVs/ sH  n[	        U R                  U5      S-   U R                  U5      -   [        UR                  UR                  UR                  S-
  [        UR                  5      S.40 UR                   Vs0 sH:  n[        UR                  U   5      [        [        4;   d  M+  XdR                  U   _M<     snD6S9PM     snn Sh  vN   SSS5        gs  snf s  snnf  N! , (       d  f       g= f7f)Lazily parse the blob.r   Nr-   r,   )r9   rX   rL   r<   r   )
pdfplumberr   r   r   r   _process_page_contentr  r   r9   r   r   rT   r^   r_   rS   )r   rF   r  rX   r-  rL   rg   s          r7   r   PDFPlumberParser.lazy_parse  s"    9//),C*  II'& &D% !%!;!;D!A"44T:"; "&*kk)-$($4$4q$8+.syy>	 &)\\%1#CLLO4c
B /A||A.%1	$ &'       sR   D5"D$A>D7)D$D7DD$	D"
D$	D5DD$$
D2.D5c                    U R                   (       a*  UR                  5       R                  " S0 U R                  D6$ UR                  " S0 U R                  D6$ )z)Process the page content based on dedupe.rp   )r  dedupe_charsr   ru  )r   rL   s     r7   r  &PDFPlumberParser._process_page_content  sF    ;;$$&33Gd6F6FGG  44#3#344rJ   c                   SSK Jn  U R                  (       d  g/ nUR                   GHG  nUS   S   R                  [
        ;   a  US   S   S:X  af  UR                  [        R                  " UR                  SUS   S	   US   S
   4US   R                  5       5      R                  S5      5      5        M  UR                  [        R                  " US   R                  5       [        R                  S9R                  US   S
   US   S	   S5      5        M  US   S   R                  [        ;   a%  UR                  US   R                  5       5        GM1  [         R"                  " S5        GMJ     [%        U5      $ )z8Extract images from page and get the text with RapidOCR.r   r   r+   r  FilterBitsPerComponentr,   1WidthHeightLr   rm   r   )r   r   r   r1   r  r   r   r   r   	frombytesr   convertr   r   r   r   warningswarnr8   )r   rL   r   r1   r4   s        r7   r  *PDFPlumberParser._extract_images_from_page  sH   "";;C8}X&++/GGx=!349MM!OO #!$Xw!7Xx9P Q #H 6 6 8 &gcl MMc(m&<&<&>bhhOWWM(3S]75KR
 Xx(--1FFc(m446734+ . 188rJ   )r  r   ru  )NFF)ru  zOptional[Mapping[str, Any]]r  r   r   r   r   rT  r   )rL   zpdfplumber.page.Pager   r_   )
r   r   r   r   r   r   r   r  r  r   rp   rJ   r7   r  r  s  sJ    ( 48$	-0- - 	-
 
-,:59rJ   r  c                  J    \ rS rSrSr  SSS.       S	S jjjrS
S jrSrg)AmazonTextractPDFParseri  a  Send `PDF` files to `Amazon Textract` and parse them.

For parsing multi-page PDFs, they have to reside on S3.

The AmazonTextractPDFLoader calls the
[Amazon Textract Service](https://aws.amazon.com/textract/)
to convert PDFs into a Document structure.
Single and multi-page documents are supported with up to 3000 pages
and 512 MB of size.

For the call to be successful an AWS account is required,
similar to the
[AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
requirements.

Besides the AWS configuration, it is very similar to the other PDF
loaders, while also supporting JPEG, PNG and TIFF and non-native
PDF formats.

```python
from langchain_community.document_loaders import AmazonTextractPDFLoader
loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
documents = loader.load()
```

One feature is the linearization of the output.
When using the features LAYOUT, FORMS or TABLES together with Textract

```python
from langchain_community.document_loaders import AmazonTextractPDFLoader
# you can mix and match each of the features
loader=AmazonTextractPDFLoader(
    "example_data/alejandro_rosalez_sample-small.jpeg",
    textract_features=["TABLES", "LAYOUT"])
documents = loader.load()
```

it will generate output that formats the text in reading order and
try to output the information in a tabular structure or
output the key/value pairs with a colon (key: value).
This helps most LLMs to achieve better accuracy when
processing these texts.

``Document`` objects are returned with metadata that includes the ``source`` and
a 1-based index of the page number in ``page``. Note that ``page`` represents
the index of the result returned from Textract, not necessarily the as-written
page number in the document.

N)linearization_configc                   SSK nSSKJs  Jn  X@l        XPl        Ub&  U Vs/ sH  odR                  U5      PM     snU l        O/ U l        Ub  X0l        O"U R
                  R                  SSSSS9U l         U(       d   SSKnUR                  S	5      U l        gX l        gs  snf ! [         a    [        S5      ef = f! [         a    [        S
5      ef = f)a  Initializes the parser.

Args:
    textract_features: Features to be used for extraction, each feature
                       should be passed as an int that conforms to the enum
                       `Textract_Features`, see `amazon-textract-caller` pkg
    client: boto3 textract client
    linearization_config: Config to be used for linearization of the output
                          should be an instance of TextLinearizationConfig from
                          the `textractor` pkg
r   NTz# z## *)hide_figure_layouttitle_prefixsection_header_prefixlist_element_prefixzCould not import amazon-textract-caller or amazon-textract-textractor python package. Please install it with `pip install amazon-textract-caller` & `pip install amazon-textract-textractor`.textractzRCould not import boto3 python package. Please install it with `pip install boto3`.)textractcallertextractor.entities.documententitiesdocumenttc
textractorTextract_Featurestextract_featuresr  r   r/   boto3clientboto3_textract_client)r   r  r  r  r  r  fr  s           r7   r    AmazonTextractPDFParser.__init__  s    &	'==G(O ,5F*5F((+5F*& *,&#/,@),0OO,S,S'+!%*/(+	 -T -) -2\\*-E* *0&E*  	< 	  !B s-    B& B!B& !B& ?B? !B& &B<?Cc              #    #    UR                   (       a  [        [        UR                   5      5      OSnU(       ad  UR                  S:X  aT  UR                  (       aC  U R
                  R                  [        UR                   5      U R                  U R                  S9nO\U R
                  R                  UR                  5       U R                  U R
                  R                  R                  U R                  S9nU R                  R                  R                  U5      n[        UR                   5       H8  u  pV[        UR#                  U R$                  S9UR&                  US-   S.S9v   M:     g7f)	zIterates over the Blob pages and returns an Iterator with a Document
for each page, like the other parsers If multi-page document, blob.path
has to be set to the S3 URI and for single page docs
the blob.data is taken
Ns3)input_documentfeaturesr  )r  r  	call_moder  )configr,   r9   rL   r   )pathr   r_   schemenetlocr  call_textractr  r  as_bytesTextract_Call_Mode
FORCE_SYNCr  r   r   r   r   rH  r  r9   )r   rF   url_parse_resulttextract_response_jsonr  idxrL   s          r7   r   "AmazonTextractPDFParser.lazy_parseC  s"     8<yy8C		N3d  ''4/ ''%)WW%:%:"499~//&*&@&@ &; &" &*WW%:%:#}}//''44??&*&@&@	 &; &" ??++001GH"8>>2IC!]]$2K2K]L$(KKqA  3s   E0E2)r  r  r  r  r  )NN)r  zOptional[Sequence[int]]r  zOptional[Any]r  z!Optional[TextLinearizationConfig]r   rT  r   )r   r   r   r   r   r   r   r   rp   rJ   r7   r  r    sN    0h 6: $=0
 CG=02=0 =0
 @=0 
=0~!rJ   r  c                  6    \ rS rSrSrSS jrS	S jrS
S jrSrg)DocumentIntelligenceParserig  zfLoads a PDF with Azure Document Intelligence
(formerly Form Recognizer) and chunks at character level.c                H    [         R                  " S5        Xl        X l        g )Na<  langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParserand langchain_community.document_loaders.pdf.DocumentIntelligenceLoader are deprecated. Please upgrade to langchain_community.document_loaders.DocumentIntelligenceLoader for any file parsing purpose using Azure Document Intelligence service.)r  r  r  model)r   r  r-  s      r7   r   #DocumentIntelligenceParser.__init__k  s    	
 
rJ   c              #     #    UR                    H\  nSR                  UR                   Vs/ sH  oDR                  PM     sn5      n[	        UUR
                  UR                  S.S9nUv   M^     g s  snf 7f)N r  r   )r   r0   r  rG   r   r9   r   )r   rF   r5   plinerG   ds          r7   _generate_docs)DocumentIntelligenceParser._generate_docsw  s`     AhhAABG$"kkMMA G As   )A5A0
6A5c              #    #    UR                  5        nU R                  R                  U R                  U5      nUR	                  5       nU R                  X5      nU Sh  vN   SSS5        g N! , (       d  f       g= f7f)r  N)r   r  begin_analyze_documentr-  r5   r4  )r   rF   file_objpollerr5   docss         r7   r   %DocumentIntelligenceParser.lazy_parse  sf      8[[77

HMF]]_F&&t4DOO     s/   BAA0 A.!A0%	B.A00
A>:B)r  r-  N)r  r   r-  r_   )rF   r   r5   r   r   r   r   )	r   r   r   r   r   r   r4  r   r   rp   rJ   r7   r+  r+  g  s    A
	rJ   r+  )r1   z,Sequence[Union[Iterable[np.ndarray], bytes]]r   r_   )rF   r   rG   r_   rH   r_   r   r_   )rT   rl  r   rl  )rx   r   ry   r_   r   r_   )Cr   
__future__r   rD   r   loggingr  r  r   pathlibr   tempfiler   typingr   r   r	   r
   r   r   r   r   r   r   r   urllib.parser   r  r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   3langchain_community.document_loaders.parsers.imagesr   r   r  r  r   r  )textractor.data.text_linearization_configr   r   r   r8   	getLoggerr   r   r   r   r  r   rM   rI   rU   ri   ru   r   r   r   ro  r  r  r  r+  rp   rJ   r7   <module>rH     sC   . "  	      '    "   - D B
 Q9  "8> 
		8	$* ! U ,&#N 
 2je
. e
PJ^ JZ
oN odVRn VRr[9~ [9|Sn Sl& &rJ   