
    dh                         S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKJ	r	  S SK
JrJr  S SKJrJrJrJrJrJrJrJrJrJr  S SKJr  S SKrS SKJr  S SKJr  S S	KJ r   S S
K!J"r"  S SK#J$r$  S SK%J&r&  S SK'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/  S SK0J1r1  \(       a  S SK2J3r3  \Rh                  " \55      r6 " S S\15      r7 " S S\ \5      r8 " S S\85      r9 " S S\85      r: " S S\85      r; " S S\ 5      r< " S S\85      r= " S S\85      r> " S  S!\85      r? " S" S#\85      r@ " S$ S%\85      rA " S& S'\85      rB " S( S)\$5      rC " S* S+\85      rD " S, S-\85      rE\:rFg).    N)ABC)StringIO)PathPurePath)
TYPE_CHECKINGAnyBinaryIOIteratorLiteralMappingOptionalSequenceUnioncast)urlparse)Document)get_from_dict_or_env)
BaseLoader)Blob)DedocBaseLoader)BaseImageBlobParser)_DEFAULT_PAGES_DELIMITERAmazonTextractPDFParserDocumentIntelligenceParserPDFMinerParserPDFPlumberParserPyMuPDFParserPyPDFium2ParserPyPDFParser)UnstructuredFileLoader)TextLinearizationConfigc                   Z   ^  \ rS rSrSr S
S\\\4   S\S\4U 4S jjjr	S\
4S jrS	rU =r$ )UnstructuredPDFLoader2   a  Load `PDF` files using `Unstructured`.

You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.

Examples
--------
from langchain_community.document_loaders import UnstructuredPDFLoader

loader = UnstructuredPDFLoader(
    "example.pdf", mode="elements", strategy="fast",
)
docs = loader.load()

References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
	file_pathmodeunstructured_kwargsc                 @   > [        U5      n[        TU ]  " SXS.UD6  g)z

Args:
    file_path: The path to the PDF file to load.
    mode: The mode to use when loading the file. Can be one of "single",
        "multi", or "all". Default is "single".
    **unstructured_kwargs: Any kwargs to pass to the unstructured.
)r%   r&   N )strsuper__init__)selfr%   r&   r'   	__class__s       `/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/pdf.pyr,   UnstructuredPDFLoader.__init__J   s$     	N	O9O;NO    returnc                 J    SSK Jn  U" SSU R                  0U R                  D6$ )Nr   )partition_pdffilenamer)   )unstructured.partition.pdfr4   r%   r'   )r-   r4   s     r/   _get_elements#UnstructuredPDFLoader._get_elements[   s"    <QdnnQ8P8PQQr1   r)   )single)__name__
__module____qualname____firstlineno____doc__r   r*   r   r   r,   listr7   __static_attributes____classcell__r.   s   @r/   r#   r#   2   sU    4 Pd#P P  #	P P"Rt R Rr1   r#   c                       \ rS rSrSrSS.S\\\4   S\\	   4S jjr
SS	 jr\S
\S\4S j5       r\S
\S\4S j5       r\S
\S\4S j5       r\S\4S j5       rSrg)BasePDFLoadera   zBase Loader class for `PDF` files.

If the file is a web path, it will download it to a temporary file, use it, then
    clean up the temporary file after completion.
Nheadersr%   rG   c                &   [        U5      U l        SU l        X l        SU R                  ;   a.  [        R
                  R                  U R                  5      U l        [        R
                  R                  U R                  5      (       Gd  U R                  U R                  5      (       Ga  [        R                  " 5       U l        [        R
                  R                  U R                  5      u  p4U R                  U R                  5      (       a1  [        U R                  5      R
                  R                  S5      S   n[        R
                  R!                  U R                  R"                  SU 35      nU R                  U l        U R%                  U R                  5      (       d  [&        R(                  " U R                  U R                  S9nUR*                  S:w  a  [-        SUR*                  -  5      e[/        US	S
9 nUR1                  UR2                  5        SSS5        [        U5      U l        gg[        R
                  R                  U R                  5      (       d  [-        SU R                  -  5      eg! , (       d  f       Ng= f)zInitialize with a file path.

Args:
    file_path: Either a local, S3 or web path to a PDF file.
    headers: Headers to use for GET request to download a file from a web path.
N~/tmprF      z3Check the url of your file; returned status code %swb)r&   z'File path %s is not a valid file or url)r*   r%   web_pathrG   ospath
expanduserisfile_is_valid_urltempfileTemporaryDirectorytemp_dirsplitext_is_s3_presigned_urlr   splitjoinname
_is_s3_urlrequestsgetstatus_code
ValueErroropenwritecontent)r-   r%   rG   _suffixtemp_pdfrfs           r/   r,   BasePDFLoader.__init__h   s    Y$.. WW//?DN ww~~dnn--$2D2DT^^2T2T$779DM((8IA((88!$..166<<SA"Eww||DMM$6$6#fXGH NNDM??4>>22LLF==C'$M--( 
 (.!GGAII& /!$X 3 //FWXX 0 /.s   J
Jr2   c                 \    [        U S5      (       a  U R                  R                  5         g g )NrW   )hasattrrW   cleanupr-   s    r/   __del__BasePDFLoader.__del__   s$    4$$MM!!# %r1   urlc                 z    [        U 5      n[        UR                  5      =(       a    [        UR                  5      $ )zCheck if the url is valid.)r   boolnetlocscheme)rq   parseds     r/   rT   BasePDFLoader._is_valid_url   s*     #FMM":tFMM'::r1   c                      [        U 5      nUR                  S:X  a  UR                  (       a  gg! [         a     gf = f)zcheck if the url is S3s3TF)r   ru   rt   ra   rq   results     r/   r]   BasePDFLoader._is_s3_url   s:    	c]F}}$ 		s   ,0 
==c                      [        U 5      n[        [        R                  " SUR                  5      5      $ ! [
         a     gf = f)z'Check if the url is a presigned S3 url.z\.s3\.amazonaws\.com$F)r   rs   researchrt   ra   rz   s     r/   rY   "BasePDFLoader._is_s3_presigned_url   s=    	c]F		":FMMJKK 		s   47 
AAc                 L    U R                   b  U R                   $ U R                  $ N)rO   r%   rn   s    r/   sourceBasePDFLoader.source   s     $ 9t}}Mt~~Mr1   )r%   rG   rW   rO   )r2   N)r:   r;   r<   r=   r>   r   r*   r   r   dictr,   ro   staticmethodrs   rT   r]   rY   propertyr   r@   r)   r1   r/   rD   rD   a   s     MQ$YsH}-$Y;CD>$YL$ ;3 ;4 ; ;
     # $   N N Nr1   rD   c                   ,    \ rS rSrSrS\\   4S jrSrg)OnlinePDFLoader   zLoad online `PDF`.r2   c                 ^    [        [        U R                  5      5      nUR                  5       $ )zLoad documents.)r#   r*   r%   load)r-   loaders     r/   r   OnlinePDFLoader.load   s!    &s4>>':;{{}r1   r)   N)	r:   r;   r<   r=   r>   r?   r   r   r@   r)   r1   r/   r   r      s    d8n r1   r   c                      ^  \ rS rSrSr   SSSS\SSS.S\\\4   S	\	\\\
4      S
\	\   S\S\S   S\	\   S\S   S\S\S   S\	\   SS4U 4S jjjjrS\\   4S jrSrU =r$ )PyPDFLoader   a  Load and parse a PDF file using 'pypdf' library.

    This class provides methods to load and parse PDF documents, supporting various
    configurations such as handling password-protected files, extracting images, and
    defining extraction mode. It integrates the `pypdf` library for PDF processing and
    offers both synchronous and asynchronous document loading.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdf

        Instantiate the loader:

        .. code-block:: python

            from langchain_community.document_loaders import PyPDFLoader

            loader = PyPDFLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
                # headers = None
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_parser = RapidOCRBlobParser(),
            )

        Lazy load documents:

        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        Load documents asynchronously:

        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    Npagetextplain)r&   images_parserimages_inner_formatpages_delimiterextraction_modeextraction_kwargsr%   passwordrG   extract_imagesr&   r9   r   r   r   r   zmarkdown-imgzhtml-imgr   r   r   layoutr   r2   c                J   > [         TU ]  XS9  [        UUUUUUU	U
S9U l        g)u<  Initialize with a file path.

Args:
    file_path: The path to the PDF file to be loaded.
    headers: Optional headers to use for GET request to download a file from a
      web path.
    password: Optional password for opening encrypted PDFs.
    mode: The extraction mode, either "single" for the entire document or "page"
        for page-wise extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    extract_images: Whether to extract images from the PDF.
    images_parser: Optional image blob parser.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)
    extraction_mode: “plain” for legacy functionality, “layout” extract text
        in a fixed width format that closely adheres to the rendered layout in
        the source pdf
    extraction_kwargs: Optional additional parameters for the extraction
        process.

Returns:
    This method does not directly return data. Use the `load`, `lazy_load` or
    `aload` methods to retrieve parsed documents with content and metadata.
rF   )r   r&   r   r   r   r   r   r   N)r+   r,   r   parser)r-   r%   r   rG   r   r&   r   r   r   r   r   r.   s              r/   r,   PyPDFLoader.__init__   s:    V 	4!)' 3++/	
r1   c              #   <  #    U R                   (       aB  [        R                  " [        U R                  S5      R                  5       U R                   S9nO [        R                  " U R                  5      nU R                  R                  U5       Sh  vN   g N7fz
Lazy load given path as pages.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
rbrQ   N	rO   r   	from_datarb   r%   read	from_pathr   
lazy_parser-   blobs     r/   	lazy_loadPyPDFLoader.lazy_load%  `      ==>>$t~~t"<"A"A"C$--XD>>$..1D;;))$///   BBBBr   )NNF)r:   r;   r<   r=   r>   r   r   r*   r   r   bytesr   rs   r   r   r,   r
   r   r   r@   rA   rB   s   @r/   r   r      s    1l 15"&$5
 +17;KQ76=,05
h'5
 5e,-5
 $	5

 5
 &'5
   345
 %%GH5
 5
 !!235
 $D>5
 
5
 5
n0	(	0 0r1   r   c                      ^  \ rS rSrSrS\SSSSSS.S\\\4   S	\	S
   S\S\
\   S\S\
\   S\	S   S\
\   4U 4S jjjrS\\   4S jrSrU =r$ )PyPDFium2Loaderi4  a  Load and parse a PDF file using the `pypdfium2` library.

    This class provides methods to load and parse PDF documents, supporting various
    configurations such as handling password-protected files, extracting images, and
    defining extraction mode.
    It integrates the `pypdfium2` library for PDF processing and offers both
    synchronous and asynchronous document loading.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdfium2

        Instantiate the loader:

        .. code-block:: python

            from langchain_community.document_loaders import PyPDFium2Loader

            loader = PyPDFium2Loader(
                file_path = "./example_data/layout-parser-paper.pdf",
                # headers = None
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazy load documents:

        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        Load documents asynchronously:

        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    r   NFr   )r&   r   r   r   r   r   rG   r%   r&   r   r   r   r   r   r   r   rG   c          	      F   > [         T	U ]  XS9  [        UUUUUUS9U l        g)a&  Initialize with a file path.

Args:
    file_path: The path to the PDF file to be loaded.
    headers: Optional headers to use for GET request to download a file from a
      web path.
    password: Optional password for opening encrypted PDFs.
    mode: The extraction mode, either "single" for the entire document or "page"
        for page-wise extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    extract_images: Whether to extract images from the PDF.
    images_parser: Optional image blob parser.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)

Returns:
    This class does not directly return data. Use the `load`, `lazy_load` or
    `aload` methods to retrieve parsed documents with content and metadata.
rF   )r&   r   r   r   r   r   N)r+   r,   r   r   )
r-   r%   r&   r   r   r   r   r   rG   r.   s
            r/   r,   PyPDFium2Loader.__init__i  s4    H 	4%)' 3+
r1   r2   c              #   <  #    U R                   (       aB  [        R                  " [        U R                  S5      R                  5       U R                   S9nO [        R                  " U R                  5      nU R                  R                  U5       Sh  vN   g N7fr   )	rO   r   r   rb   r%   r   r   r   parser   s     r/   r   PyPDFium2Loader.lazy_load  s`      ==>>$t~~t"<"A"A"C$--XD>>$..1D;;$$T***r   r   )r:   r;   r<   r=   r>   r   r   r*   r   r   r   rs   r   r   r,   r
   r   r   r@   rA   rB   s   @r/   r   r   4  s    2p +17"&$7;KQ"&,
h',
 &'	,

 ,
 3-,
 ,
   34,
 %%GH,
 $,
 ,
\+	(	+ +r1   r   c                       \ rS rSrSr     SSSSSSSS.S\\\4   S\S	\S
\S\S\S\	\   S\
S   S\	\   S\	\   S\
S   S\	\   4S jjjr\S\S\4S j5       rS\\   4S jrSrg)PyPDFDirectoryLoaderi  ak  Load and parse a directory of PDF files using 'pypdf' library.

This class provides methods to load and parse multiple PDF documents in a directory,
supporting options for recursive search, handling password-protected files,
extracting images, and defining extraction modes. It integrates the `pypdf` library
for PDF processing and offers synchronous document loading.

Examples:
    Setup:

    .. code-block:: bash

        pip install -U langchain-community pypdf

    Instantiate the loader:

    .. code-block:: python

        from langchain_community.document_loaders import PyPDFDirectoryLoader

        loader = PyPDFDirectoryLoader(
            path = "./example_data/",
            glob = "**/[!.]*.pdf",
            silent_errors = False,
            load_hidden = False,
            recursive = False,
            extract_images = False,
            password = None,
            mode = "page",
            images_to_text = None,
            headers = None,
            extraction_mode = "plain",
            # extraction_kwargs = None,
        )

    Load documents:

    .. code-block:: python

        docs = loader.load()
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    Load documents asynchronously:

    .. code-block:: python

        docs = await loader.aload()
        print(docs[0].page_content[:100])
        print(docs[0].metadata)
Nr   r   )r   r&   r   rG   r   r   rQ   globsilent_errorsload_hidden	recursiver   r   r&   r   r   rG   r   r   r   c                    Xpl         Xl        Xl        X l        X@l        XPl        X0l        X`l        Xl        Xl	        Xl
        Xl        g)uI  Initialize with a directory path.

Args:
    path: The path to the directory containing PDF files to be loaded.
    glob: The glob pattern to match files in the directory.
    silent_errors: Whether to log errors instead of raising them.
    load_hidden: Whether to include hidden files in the search.
    recursive: Whether to search subdirectories recursively.
    extract_images: Whether to extract images from PDFs.
    password: Optional password for opening encrypted PDFs.
    mode: The extraction mode, either "single" for extracting the entire
        document or "page" for page-wise extraction.
    images_parser: Optional image blob parser..
    headers: Optional headers to use for GET request to download a file from a
      web path.
    extraction_mode: “plain” for legacy functionality, “layout” for
      experimental layout mode functionality
    extraction_kwargs: Optional additional parameters for the extraction
      process.

Returns:
    This method does not directly return data. Use the `load` method to
    retrieve parsed documents with content and metadata.
N)r   r&   rQ   r   r   r   r   r   r   rG   r   r   )r-   rQ   r   r   r   r   r   r   r&   r   rG   r   r   s                r/   r,   PyPDFDirectoryLoader.__init__  sG    P !			&"*,*.!2r1   r2   c                 D    [        S U R                   5       5      (       + $ )Nc              3   @   #    U H  oR                  S 5      v   M     g7f).N)
startswith).0parts     r/   	<genexpr>3PyPDFDirectoryLoader._is_visible.<locals>.<genexpr>  s     C
s++
s   )anypartsr   s    r/   _is_visible PyPDFDirectoryLoader._is_visible  s    C

CCCCr1   c                 8   [        U R                  5      n/ nU R                  (       a  UR                  U R                  5      OUR	                  U R                  5      nU H  nUR                  5       (       d  M  U R                  UR                  U5      5      (       d  U R                  (       d  MR   [        [        U5      U R                  U R                  U R                  U R                  U R                  U R                   U R"                  S9nUR%                  5       nU H  n[        U5      UR&                  S'   M     UR)                  U5        M     U$ ! [*         a4  nU R,                  (       a  [.        R1                  U5         S nAGM1  UeS nAff = f)N)r   r&   r   r   rG   r   r   r   )r   rQ   r   rglobr   is_filer   relative_tor   r   r*   r   r&   r   r   rG   r   r   r   metadataextend	Exceptionr   loggerwarning)	r-   pdocsitemsir   sub_docsdoces	            r/   r   PyPDFDirectoryLoader.load  s/   O&*nn		"!&&:KAyy{{##AMM!$4559I9I9I$!,F%)]]!%+/+>+>*.*<*<$(LL,0,@,@.2.D.D	" $*;;=#+C58VCLL2 $,H-# .  % $--"NN1--"#G	$s   4B!E
F%&FFF)r   r   r   r   rG   r   r   r&   r   rQ   r   r   )z**/[!.]*.pdfFFFF)r:   r;   r<   r=   r>   r   r*   r   rs   r   r   r   r   r,   r   r   r?   r   r   r@   r)   r1   r/   r   r     s   2n ##!$33 #'*07;"&6=,033CM"33 33 	33
 33 33 33 3-33 &'33   3433 $33 !!2333 $D>33j D( Dt D Dd8n r1   r   c                      ^  \ rS rSrSrSS\SSSSSS.S\\\4   S	\	\   S
\
S   S\S\S\	\   S\
S   S\	\   S\	\   SS4U 4S jjjrS\\   4S jrSrU =r$ )PDFMinerLoaderi2  a  Load and parse a PDF file using 'pdfminer.six' library.

    This class provides methods to load and parse PDF documents, supporting various
    configurations such as handling password-protected files, extracting images, and
    defining extraction mode. It integrates the `pdfminer.six` library for PDF
    processing and offers both synchronous and asynchronous document loading.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pdfminer.six

        Instantiate the loader:

        .. code-block:: python

            from langchain_community.document_loaders import PDFMinerLoader

            loader = PDFMinerLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
                # headers = None
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazy load documents:

        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        Load documents asynchronously:

        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    Nr9   Fr   )r   r&   r   r   r   r   rG   concatenate_pagesr%   r   r&   r   r   r   r   r   r   rG   r   r2   c          
      H   > [         T
U ]  XS9  [        UUUU	UUUS9U l        g)a  Initialize with a file path.

Args:
    file_path: The path to the PDF file to be loaded.
    headers: Optional headers to use for GET request to download a file from a
      web path.
    password: Optional password for opening encrypted PDFs.
    mode: The extraction mode, either "single" for the entire document or "page"
        for page-wise extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    extract_images: Whether to extract images from the PDF.
    images_parser: Optional image blob parser.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)
    concatenate_pages: Deprecated. If True, concatenate all PDF pages into one
        a single document. Otherwise, return one document per page.

Returns:
    This method does not directly return data. Use the `load`, `lazy_load` or
    `aload` methods to retrieve parsed documents with content and metadata.
rF   )r   r   r   r   r&   r   r   N)r+   r,   r   r   )r-   r%   r   r&   r   r   r   r   rG   r   r.   s             r/   r,   PDFMinerLoader.__init__f  s7    N 	4$)'/+ 3
r1   c              #   <  #    U R                   (       aB  [        R                  " [        U R                  S5      R                  5       U R                   S9nO [        R                  " U R                  5      nU R                  R                  U5       Sh  vN   g N7fr   r   r   s     r/   r   PDFMinerLoader.lazy_load  r   r   r   )r:   r;   r<   r=   r>   r   r   r*   r   r   r   rs   r   r   r,   r
   r   r   r@   rA   rB   s   @r/   r   r   2  s    1n #'*27$7;KQ"&,00
h'0
 3-	0

 &'0
 0
 0
   340
 %%GH0
 $0
 $D>0
 
0
 0
d0	(	0 0r1   r   c                   d   ^  \ rS rSrSrSS.S\\\4   S\\	   4U 4S jjjr
S\\   4S	 jrS
rU =r$ )PDFMinerPDFasHTMLLoaderi  z2Load `PDF` files as HTML content using `PDFMiner`.NrF   r%   rG   c                `   >  SSK Jn  [        TU ]  XS9  g! [         a    [        S5      ef = f)Initialize with a file path.r   extract_text_to_fpzO`pdfminer` package not found, please install it with `pip install pdfminer.six`rF   N)pdfminer.high_levelr   ImportErrorr+   r,   )r-   r%   rG   r   r.   s       r/   r,    PDFMinerPDFasHTMLLoader.__init__  s@    	> 	4  	- 	s    -r2   c           	   #   l  #    SSK Jn  SSKJn  SSKJn  [        5       nU" U R                  S5       nU" [        [        U5      USU" 5       SS9  S	S	S	5        S
U R                  c  [        U R                  5      OU R                  0n[        UR                  5       US9v   g	! , (       d  f       NY= f7f)
Load file.r   r   )LAParams)open_filenamer    html)codeclaparamsoutput_typeNr   page_contentr   )r   r   pdfminer.layoutr   pdfminer.utilsr   r   r%   r   r	   rO   r*   r   getvalue)r-   r   r   r   output_stringfpr   s          r/   r   !PDFMinerPDFasHTMLLoader.lazy_load  s     :,0 
4>>40BXr"!" 1 T]]-Bc$..)
 M$:$:$<xPP 10s   0B4B#AB4#
B1-B4r)   )r:   r;   r<   r=   r>   r   r*   r   r   r   r,   r
   r   r   r@   rA   rB   s   @r/   r   r     sM    < MQ5sH}-5;CD>5 5Q8H- Q Qr1   r   c                     ^  \ rS rSrSrSS\SSSSSSS.	S\\\4   S	\	\   S
\
S   S\S\S\	\   S\
S   S\\
S   S4   S\	\   S\	\\\4      S\SS4U 4S jjjrS\S\\   4S jrS\S\\   4S jrS\\   4S jrSrU =r$ )PyMuPDFLoaderi  a3  Load and parse a PDF file using 'PyMuPDF' library.

    This class provides methods to load and parse PDF documents, supporting various
    configurations such as handling password-protected files, extracting tables,
    extracting images, and defining extraction mode. It integrates the `PyMuPDF`
    library for PDF processing and offers both synchronous and asynchronous document
    loading.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pymupdf

        Instantiate the loader:

        .. code-block:: python

            from langchain_community.document_loaders import PyMuPDFLoader

            loader = PyMuPDFLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
                # headers = None
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_parser = TesseractBlobParser(),
                # extract_tables = "markdown",
                # extract_tables_settings = None,
            )

        Lazy load documents:

        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        Load documents asynchronously:

        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    Nr   Fr   )	r   r&   r   r   r   r   extract_tablesrG   extract_tables_settingsr%   r   r&   r   r   r   r   r   r   r  )csvmarkdownr   rG   r  kwargsr2   c       	         n   > US;  a  [        S5      e[        TU ]	  XS9  [        UUUUUUUUU
S9	U l        g)a  Initialize with a file path.

Args:
    file_path: The path to the PDF file to be loaded.
    headers: Optional headers to use for GET request to download a file from a
      web path.
    password: Optional password for opening encrypted PDFs.
    mode: The extraction mode, either "single" for the entire document or "page"
        for page-wise extraction.
    pages_delimiter: A string delimiter to separate pages in single-mode
        extraction.
    extract_images: Whether to extract images from the PDF.
    images_parser: Optional image blob parser.
    images_inner_format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link, w/ link
        pointing to (`![body)(#)`]
        - "html-img" = wrap the content as the `alt` text of an tag and link to
        (`<img alt="{body}" src="#"/>`)
    extract_tables: Whether to extract tables in a specific format, such as
        "csv", "markdown", or "html".
    extract_tables_settings: Optional dictionary of settings for customizing
        table extraction.
    **kwargs: Additional keyword arguments for customizing text extraction
        behavior.

Returns:
    This method does not directly return data. Use the `load`, `lazy_load`, or
    `aload` methods to retrieve parsed documents with content and metadata.

Raises:
    ValueError: If the `mode` argument is not one of "single" or "page".
r   zmode must be single or pagerF   )	r   r&   r   text_kwargsr   r   r   r  r  N)ra   r+   r,   r   r   )r-   r%   r   r&   r   r   r   r   r  rG   r  r  r.   s               r/   r,   PyMuPDFLoader.__init__  sR    ` )):;;4#+)' 3)$;

r1   c              +   |  #    U(       a  [         R                  SU S35        U R                  nU R                  (       aB  [        R
                  " [        U R                  S5      R                  5       U R                  S9nO [        R                  " U R                  5      nUR                  X1S9 Sh  vN   g N7f)zLazy load given path as pages or single document (see `mode`).
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
zReceived runtime arguments zd. Passing runtime args to `load` is deprecated. Please pass arguments during initialization instead.r   r   )r	  N)r   r   r   rO   r   r   rb   r%   r   r   _lazy_parse)r-   r  r   r   s       r/   
_lazy_loadPyMuPDFLoader._lazy_loadC  s     
 NN-fX 6W X ==>>$t~~t"<"A"A"C$--XD>>$..1D%%d%???s   B2B<4B:5B<c                 8    [        U R                  " S0 UD65      $ )Nr)   )r?   r  )r-   r  s     r/   r   PyMuPDFLoader.loadT  s    DOO-f-..r1   c              #   @   #    U R                  5        S h  vN   g  N7fr   )r  rn   s    r/   r   PyMuPDFLoader.lazy_loadW  s     ??$$$s   r   )r:   r;   r<   r=   r>   r   r   r*   r   r   r   rs   r   r   r   r,   r
   r   r  r?   r   r   r@   rA   rB   s   @r/   r  r    s/   4t #'*07$7;KQJN"&<@=
h'=
 3-	=

 &'=
 =
 =
   34=
 %%GH=
 g&?@$FG=
 $=
 "*$sCx.!9=
 =
 
=
 =
~@3 @8H+= @"/S /T(^ /%8H- % %r1   r  c                     ^  \ rS rSrSr    SS\\\4   S\S\S\	S\
\\\4      S	\S
S4U 4S jjjr\S
\\\4   4S j5       r\S
\4S j5       r\S
\4S j5       rS
\4S jrS\S
S4S jrS\S
\4S jrS\S
\4S jrS
\\   4S jrSrU =r$ )MathpixPDFLoaderi]  z)Load `PDF` files using `Mathpix` service.Nr%   processed_file_formatmax_wait_time_secondsshould_clean_pdfextra_request_datar  r2   c                    > [        USS5      U l        [        USS5      U l        UR                  SS5        UR                  SS5        [        TU ]  " U40 UD6  X l        Ub  UO0 U l        X0l        X@l	        g)a  Initialize with a file path.

Args:
    file_path: a file for loading.
    processed_file_format: a format of the processed file. Default is "md".
    max_wait_time_seconds: a maximum time to wait for the response from
     the server. Default is 500.
    should_clean_pdf: a flag to clean the PDF file. Default is False.
    extra_request_data: Additional request data.
    **kwargs: additional keyword arguments.
mathpix_api_keyMATHPIX_API_KEYmathpix_api_idMATHPIX_API_IDN)
r   r  r  popr+   r,   r  r  r  r  )r-   r%   r  r  r  r  r  r.   s          r/   r,   MathpixPDFLoader.__init__`  s    (  4%'8 
 3$&6

 	

$d+

#T*-f-%:""4"@b 	 &;" 0r1   c                 4    U R                   U R                  S.$ )N)app_idapp_key)r  r  rn   s    r/   _mathpix_headers!MathpixPDFLoader._mathpix_headers  s    --$:N:NOOr1   c                     g)Nzhttps://api.mathpix.com/v3/pdfr)   rn   s    r/   rq   MathpixPDFLoader.url  s    /r1   c                 j    SU R                   S00U R                  EnS[        R                  " U5      0$ )Nconversion_formatsToptions_json)r  r  jsondumps)r-   optionss     r/   dataMathpixPDFLoader.data  s?     !4#=#=t"D
%%
 

7 344r1   c                 b   [        [        U R                  5      S5       nSU0n[        R                  " U R
                  U R                  X R                  S9nS S S 5        WR                  5       nSU;   a  [        SUS    35      eSU;   a  US   nU$ [        S5      e! , (       d  f       NM= f)Nr   file)rG   filesr-  errorzMathpix request failed: pdf_idzUnable to send PDF to Mathpix.)
rb   r*   r%   r^   postrq   r#  r-  r*  ra   )r-   ri   r1  responseresponse_datar3  s         r/   send_pdfMathpixPDFLoader.send_pdf  s    #dnn%t,QKE}}$"7"7u99H -
 !m#7g8N7OPQQ}$"8,FM=>> -,s   9B  
B.r3  c                    U R                   S-   U-   n[        SU R                  S5       H  n[        R                  " X R
                  S9nUR                  5       nUR	                  SS5      nUR	                  SS5      nUb  SU 3nUb  US	US
    S3-  n[        U5      eUR	                  SS5      n	U	S:X  a    gU	S:X  a  [        S5      e[        R                  SU	5        [        R                  " S5        M     [        e)zMWait for processing to complete.

Args:
    pdf_id: a PDF id.

Returns: None
rJ   r      rF   r2  N
error_infoz%Unable to retrieve PDF from Mathpix: z (id)status	completedz#Unable to retrieve PDF from Mathpixz.Status: %s, waiting for processing to complete)rq   ranger  r^   r_   r#  r*  ra   r   infotimesleepTimeoutError)
r-   r3  rq   re   r5  r6  r2  r;  	error_msgr>  s
             r/   wait_for_processing$MathpixPDFLoader.wait_for_processing  s    hhnv%q$44a8A||C1F1FGH$MMOM "%%gt4E&**<>J CE7K	)2j&6%7q!99I ++"&&x6F$7" !FGGLfU

13 94 r1   c                     U R                  U5        U R                   SU SU R                   3n[        R                  " X R
                  S9nUR                  R                  S5      $ )NrJ   r   rF   zutf-8)rF  rq   r  r^   r_   r#  rd   decode)r-   r3  rq   r5  s       r/   get_processed_pdf"MathpixPDFLoader.get_processed_pdf  s\      (
!F81T%?%?$@A<<-B-BC&&w//r1   contentsc                 f   SR                  UR                  S5       Vs/ sH  o"R                  S5      (       a  M  UPM     sn5      nUR                  SS5      R                  SS5      nUR                  SS5      R                  S	S
5      R                  SS5      R                  SS5      nU$ s  snf )zIClean the PDF file.

Args:
    contents: a PDF file contents.

Returns:


z![]z	\section{z# }r   z\$$z\%%z\((z\)r=  )r[   rZ   r   replace)r-   rL  lines      r/   	clean_pdfMathpixPDFLoader.clean_pdf  s     99&nnT2Q2d//%:PT2Q
 ##L$7??RH UC(WUC WUC WUC 	 	  Rs
   B.B.c                     U R                  5       nU R                  U5      nU R                  (       a  U R                  U5      nU R                  U R                  US.n[        X#S9/$ )N)r   r%   r3  r   )r7  rJ  r  rU  r   r   )r-   r3  rL  r   s       r/   r   MathpixPDFLoader.load  sV    ))&1  ~~h/H"kkvVhBCCr1   )r  r  r  r  r  r  )mdi  FN)r:   r;   r<   r=   r>   r   r*   r   intrs   r   r   r   r,   r   r#  rq   r-  r7  rF  rJ  rU  r?   r   r   r@   rA   rB   s   @r/   r  r  ]  s3   3
 &*%(!&7;%1h'%1  #%1  #	%1
 %1 %T#s(^4%1 %1 
%1 %1N P$sCx. P P 0S 0 0 5d 5 5?# ?## #$ #J0 0 0# # .Dd8n D Dr1   r  c                      ^  \ rS rSrSr    SS\\\4   S\\	\\
4      S\S\\   S\S	S4U 4S
 jjjrS	\\   4S jrSrU =r$ )PDFPlumberLoaderi  z$Load `PDF` files using `pdfplumber`.Nr%   r	  deduperG   r   r2   c                    >  SSK n[        TU ]  XS9  U=(       d    0 U l        X0l        XPl        g! [         a    [        S5      ef = f)r   r   NzMpdfplumber package not found, please install it with `pip install pdfplumber`rF   )
pdfplumberr   r+   r,   r	  r]  r   )r-   r%   r	  r]  rG   r   r_  r.   s          r/   r,   PDFPlumberLoader.__init__  sX    	 	4&,",  	+ 	s	   1 Ac                 \   [        U R                  U R                  U R                  S9nU R                  (       aB  [
        R                  " [        U R                  S5      R                  5       U R                  S9nO [
        R                  " U R                  5      nUR                  U5      $ )r   )r	  r]  r   r   r   )r   r	  r]  r   rO   r   r   rb   r%   r   r   r   )r-   r   r   s      r/   r   PDFPlumberLoader.load
  sy     "((;;..

 ==>>$t~~t"<"A"A"C$--XD>>$..1D||D!!r1   )r]  r   r	  )NFNF)r:   r;   r<   r=   r>   r   r*   r   r   r   r   rs   r   r,   r?   r   r   r@   rA   rB   s   @r/   r\  r\    s    .
 48"&$-h'- gc3h/0- 	-
 $- - 
- -,"d8n " "r1   r\  c                      ^  \ rS rSrSr      SSS.S\\\4   S\\	\      S\\
   S\\   S	\\   S
\\   S\\   S\S   SS4U 4S jjjjrS\\   4S jrS\\   4S jr\S\S\4S j5       rSrU =r$ )AmazonTextractPDFLoaderi  a  Load `PDF` files from a local file system, HTTP or S3.

To authenticate, the AWS client uses the following methods to
automatically load credentials:
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

If a specific credential profile should be used, you must pass
the name of the profile from the ~/.aws/credentials file that is to be used.

Make sure the credentials / roles used have the required policies to
access the Amazon Textract service.

Example:
    .. code-block:: python
        from langchain_community.document_loaders import AmazonTextractPDFLoader
        loader = AmazonTextractPDFLoader(
            file_path="s3://pdfs/myfile.pdf"
        )
        document = loader.load()
N)linearization_configr%   textract_featuresclientcredentials_profile_nameregion_nameendpoint_urlrG   re  r!   r2   c                  > [         TU ]  XS9   SSKn	U(       a  U V
s/ sH  oR                  U
   PM     nn
O/ nU(       d  U(       d  U(       aR   SSKnUb  UR                  US9nOUR                  5       n0 nU(       a  X^S'   U(       a  XnS'   UR                  " S0 UD6n[        UUUS
9U l        g! [         a    [        S5      ef = fs  sn
f ! [         a    [        S5      e[         a  n[        S	U 35      UeSnAff = f)a  Initialize the loader.

Args:
    file_path: A file, url or s3 path for input file
    textract_features: Features to be used for extraction, each feature
                       should be passed as a str that conforms to the enum
                       `Textract_Features`, see `amazon-textract-caller` pkg
    client: boto3 textract client (Optional)
    credentials_profile_name: AWS profile name, if not default (Optional)
    region_name: AWS region, eg us-east-1 (Optional)
    endpoint_url: endpoint url for the textract service (Optional)
    linearization_config: Config to be used for linearization of the output
                          should be an instance of TextLinearizationConfig from
                          the `textractor` pkg
rF   r   NztCould not import amazon-textract-caller python package. Please install it with `pip install amazon-textract-caller`.)profile_nameri  rj  zRCould not import boto3 python package. Please install it with `pip install boto3`.zCould not load credentials to authenticate with AWS client. Please check that credentials in the specified profile name are valid. )rf  rg  re  )textract)r+   r,   textractcallerr   Textract_Featuresboto3Sessionrg  r   ra   r   r   )r-   r%   rf  rg  rh  ri  rj  rG   re  tcxfeaturesrp  sessionclient_paramsr   r.   s                   r/   r,    AmazonTextractPDFLoader.__init__/  s8   6 	4	' 9JK9JA,,Q/9JHKHH#{l+7#mm9QmRG $mmoG "3>-04@.1 DmD .&!5
O  	O 	 L,  !B    //0c3 	s)   B3 CAC 3C	D/C>>Dc                 4    [        U R                  5       5      $ zLoad given path as pages.r?   r   rn   s    r/   r   AmazonTextractPDFLoader.load{      DNN$%%r1   c              #     #    U R                   (       a4  U R                  U R                   5      (       a  [        U R                   S9nOR[        R                  " U R                  5      n[
        R                  U5      S:  a  [        SUR                   S35      eU R                  R                  U5       Sh  vN   g N7f)zLazy load documentsr      z	the file z is a multi-page document,                     but not stored on S3.                     Textract requires multi-page documents to be on S3.N)rO   r]   r   r   r%   rd  _get_number_of_pagesra   rQ   r   r   r   s     r/   r   !AmazonTextractPDFLoader.lazy_load  s      ==T__T]];;T]]+D>>$..1D&;;DAAE 		{ +H I  ;;$$T***s   B6C 8B>9C r   c                     SS K nSSKJnJn  U R
                  S:X  a@  U R                  5        nUR                  U5      n[        UR                  5      sS S S 5        $ U R
                  S:X  aL  SnUR                  U R                  5       5      n[        UR                  U5      5       H  u    nUS-  nM     U$ U R
                  S;   a  g[        SU R
                   35      e! [         a    [	        S5      ef = f! , (       d  f       g = f)	Nr   )ImageImageSequencezcCould not import pypdf or Pilloe python package. Please install it with `pip install pypdf Pillow`.zapplication/pdfz
image/tiffr~  )z	image/pngz
image/jpegzunsupported mime type: )pypdfPILr  r  r   mimetypeas_bytes_io	PdfReaderlenpagesrb   as_bytes	enumerater
   ra   )	r   r  r  r  input_pdf_file
pdf_reader	num_pagesimgre   s	            r/   r  ,AmazonTextractPDFLoader._get_number_of_pages  s    	0 ==--!!#~"__^<
:++, $# ]]l*I**T]]_-C!-"8"8"=>1Q	 ?]]996t}}oFGG%  	E 	 $#s   C# &C<#C9<
D
r   )NNNNNN)r:   r;   r<   r=   r>   r   r*   r   r   r   r   r   r,   r?   r   r   r
   r   r   r   rZ  r  r@   rA   rB   s   @r/   rd  rd    s   0 6: $26%)&*"&J
 EIJ
h'J
 $HSM2J
 	J

 #+3-J
 c]J
 smJ
 $J
 ''@AJ
 
J
 J
X&d8n &+	(	+* H4 HC H Hr1   rd  c                   &    \ rS rSrSrS\4S jrSrg)DedocPDFLoaderi  a
  DedocPDFLoader document loader integration to load PDF files using `dedoc`.
The file loader can automatically detect the correctness of a textual layer in the
    PDF document.
Note that `__init__` method supports parameters that differ from ones of
    DedocBaseLoader.

Setup:
    Install ``dedoc`` package.

    .. code-block:: bash

        pip install -U dedoc

Instantiate:
    .. code-block:: python

        from langchain_community.document_loaders import DedocPDFLoader

        loader = DedocPDFLoader(
            file_path="example.pdf",
            # split=...,
            # with_tables=...,
            # pdf_with_text_layer=...,
            # pages=...,
            # ...
        )

Load:
    .. code-block:: python

        docs = loader.load()
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        Some text
        {
            'file_name': 'example.pdf',
            'file_type': 'application/pdf',
            # ...
        }

Lazy load:
    .. code-block:: python

        docs = []
        docs_lazy = loader.lazy_load()

        for doc in docs_lazy:
            docs.append(doc)
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        Some text
        {
            'file_name': 'example.pdf',
            'file_type': 'application/pdf',
            # ...
        }

Parameters used for document parsing via `dedoc`
    (https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html):

    with_attachments: enable attached files extraction
    recursion_deep_attachments: recursion level for attached files extraction,
        works only when with_attachments==True
    pdf_with_text_layer: type of handler for parsing, available options
        ["true", "false", "tabby", "auto", "auto_tabby" (default)]
    language: language of the document for PDF without a textual layer,
        available options ["eng", "rus", "rus+eng" (default)], the list of
        languages can be extended, please see
        https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
    pages: page slice to define the reading range for parsing
    is_one_column_document: detect number of columns for PDF without a textual
        layer, available options ["true", "false", "auto" (default)]
    document_orientation: fix document orientation (90, 180, 270 degrees) for PDF
        without a textual layer, available options ["auto" (default), "no_change"]
    need_header_footer_analysis: remove headers and footers from the output result
    need_binarization: clean pages background (binarize) for PDF without a textual
        layer
    need_pdf_table_analysis: parse tables for PDF without a textual layer
r2   c                 l    SSK Jn  U" [        U R                  5      U R                  U R
                  S9$ )Nr   )make_manager_pdf_config)r%   parsing_paramsrZ   )dedoc.utils.langchainr  r*   r%   parsing_parametersrZ   )r-   r  s     r/   _make_configDedocPDFLoader._make_config  s.    A&$..)22**
 	
r1   r)   N)r:   r;   r<   r=   r>   r   r  r@   r)   r1   r/   r  r    s    Tl
d 
r1   r  c                      ^  \ rS rSrSr  SS\\\4   S\S\S\	\
   SS4
U 4S	 jjjrS\\   4S
 jrS\\   4S jrSrU =r$ )DocumentIntelligenceLoaderi  z+Load a PDF with Azure Document IntelligenceNr%   rg  modelrG   r2   c                 <   > [         TU ]  XS9  [        X#S9U l        g)a  Initialize the object for file processing with Azure Document Intelligence
(formerly Form Recognizer).

This constructor initializes a DocumentIntelligenceParser object to be used
for parsing files using the Azure Document Intelligence API. The load method
generates a Document node including metadata (source blob and page number)
for each page.

Parameters:
-----------
file_path : str
    The path to the file that needs to be parsed.
client: Any
    A DocumentAnalysisClient to perform the analysis of the blob
model : str
    The model name or ID to be used for form recognition in Azure.

Examples:
---------
>>> obj = DocumentIntelligenceLoader(
...     file_path="path/to/file",
...     client=client,
...     model="prebuilt-document"
... )
rF   )rg  r  N)r+   r,   r   r   )r-   r%   rg  r  rG   r.   s        r/   r,   #DocumentIntelligenceLoader.__init__  s"    B 	40Lr1   c                 4    [        U R                  5       5      $ ry  rz  rn   s    r/   r   DocumentIntelligenceLoader.load7  r|  r1   c              #      #    [         R                  " U R                  5      nU R                  R	                  U5       Sh  vN   g N7f)zLazy load given path as pages.N)r   r   r%   r   r   r   s     r/   r   $DocumentIntelligenceLoader.lazy_load;  s1      ~~dnn-;;$$T***s   ?A	AA	r   )zprebuilt-documentN)r:   r;   r<   r=   r>   r   r*   r   r   r   r   r,   r?   r   r   r
   r   r@   rA   rB   s   @r/   r  r    s    5 )"&"Mh'"M "M 	"M
 $"M 
"M "MH&d8n &+	(	+ +r1   r  c            	       d   ^  \ rS rSrSr SS\\\4   S\S\SS4U 4S jjjr	S\
\   4S	 jrS
rU =r$ )ZeroxPDFLoaderiC  a  Document loader utilizing Zerox library:
https://github.com/getomni-ai/zerox

Zerox converts PDF document to series of images (page-wise) and
uses vision-capable LLM model to generate Markdown representation.

Zerox utilizes anyc operations. Therefore when using this loader
inside Jupyter Notebook (or any environment running async)
you will need to:
```python
    import nest_asyncio
    nest_asyncio.apply()
```
r%   r  zerox_kwargsr2   Nc                 :   > [         TU ]  US9   X0l        X l        g )N)r%   )r+   r,   r  r  )r-   r%   r  r  r.   s       r/   r,   ZeroxPDFLoader.__init__S  s'     	9-	$ )
r1   c              #     #    SSK nSSKJn  UR                  U" S[	        U R
                  5      U R                  S.U R                  D65      n[        UR                  5      S:  a[  UR                  S   R                  nUR                   H1  n[        UR                  U R                  UR                  US.S9v   M3     gg7f)	zLazily load pages.r   N)zerox)r%   r  rK   )r   r   r  r   r)   )asynciopyzeroxr  runr*   r%   r  r  r  r  r   r   rd   r   )r-   r  r  zerox_outputr  r   s         r/   r   ZeroxPDFLoader.lazy_loado  s     ! {{WC/tzzWTEVEVW

 |!!"Q&$**2.33I$**!%"&++ $		%.  + 's   CC)r  r  )zgpt-4o-mini)r:   r;   r<   r=   r>   r   r*   r   r   r,   r
   r   r   r@   rA   rB   s   @r/   r  r  C  s\    $ #h'  	
 
 88H-  r1   r  )Gr*  loggingrP   r~   rU   rB  abcr   ior   pathlibr   r   typingr   r   r	   r
   r   r   r   r   r   r   urllib.parser   r^   langchain_core.documentsr   langchain_core.utilsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   *langchain_community.document_loaders.dedocr   3langchain_community.document_loaders.parsers.imagesr   0langchain_community.document_loaders.parsers.pdfr   r   r   r   r   r   r   r   1langchain_community.document_loaders.unstructuredr    )textractor.data.text_linearization_configr!   	getLogger__file__r   r#   rD   r   r   r   r   r   r   r  r  r\  rd  r  r  r  PagedPDFSplitterr)   r1   r/   <module>r     s_     	 	     "   "  - 5 @ B F S	 	 	 UQ			8	$,R2 ,R^MNJ MN`m w0- w0to+m o+dI: IXr0] r0j#Qm #QLK%M K%`QD} QDh%"} %"PSHm SHl^
_ ^
B0+ 0+fB] BL  r1   