
    dhQ                         S SK r S SKrS SKrS SKJrJr  S SKJrJrJ	r	J
r
Jr  S SKJr  S SKJr   " S S\\5      r " S S	\5      r " S
 S\5      rg)    N)ABCabstractmethod)DictIteratorOptionalTupleUnion)Document)
BaseLoaderc            !       Z   \ rS rSrSrSSSSSSS	S
S
SSSSSS.S\S\S\S\\\4   S\S\S\S\S\S\S\\\4   S\\\4   S\\\4   S\	\   S\	\   SS4 S jjr
S\\   4S jr\S\4S j5       rS \S\4S! jrS"\S#\S\\   4S$ jr S*S"\S\S%\	\   S\\   4S& jjrS'\S\\\4   4S( jrS)rg)+DedocBaseLoader   a  
Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

Loader enables extracting text, tables and attached files from the given file:
    * `Text` can be split by pages, `dedoc` tree nodes, textual lines
        (according to the `split` parameter).
    * `Attached files` (when with_attachments=True)
        are split according to the `split` parameter.
        For attachments, langchain Document object has an additional metadata field
        `type`="attachment".
    * `Tables` (when with_tables=True) are not split - each table corresponds to one
        langchain Document object.
        For tables, Document object has additional metadata fields `type`="table"
        and `text_as_html` with table HTML representation.
documentTF
   
auto_tabbyrus+eng:autoN)splitwith_tableswith_attachmentsrecursion_deep_attachmentspdf_with_text_layerlanguagepagesis_one_column_documentdocument_orientationneed_header_footer_analysisneed_binarizationneed_pdf_table_analysis	delimiterencoding	file_pathr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   returnc                t   [        5       R                  5        VVs0 sH  u  nnUS;  d  M  UU_M     snnU l        1 SkU l        X R                  ;  a  [	        SU SU R                   S35      eX l        X0l        Xl        U R
                  S:X  a  SOSnUU R                  S	'   X@R                  S
'   gs  snnf )a[	  
Initialize with file path and parsing parameters.

Args:
    file_path: path to the file for processing
    split: type of document splitting into parts (each part is returned
        separately), default value "document"
        "document": document text is returned as a single langchain Document
            object (don't split)
        "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
            ODP)
        "node": split document text into tree nodes (title nodes, list item
            nodes, raw text nodes)
        "line": split document text into lines
    with_tables: add tables to the result - each table is returned as a single
        langchain Document object

    Parameters used for document parsing via `dedoc`
        (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

        with_attachments: enable attached files extraction
        recursion_deep_attachments: recursion level for attached files
            extraction, works only when with_attachments==True
        pdf_with_text_layer: type of handler for parsing PDF documents,
            available options
            ["true", "false", "tabby", "auto", "auto_tabby" (default)]
        language: language of the document for PDF without a textual layer and
            images, available options ["eng", "rus", "rus+eng" (default)],
            the list of languages can be extended, please see
            https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
        pages: page slice to define the reading range for parsing PDF documents
        is_one_column_document: detect number of columns for PDF without
            a textual layer and images, available options
            ["true", "false", "auto" (default)]
        document_orientation: fix document orientation (90, 180, 270 degrees)
            for PDF without a textual layer and images, available options
            ["auto" (default), "no_change"]
        need_header_footer_analysis: remove headers and footers from the output
            result for parsing PDF and images
        need_binarization: clean pages background (binarize) for PDF without a
            textual layer and images
        need_pdf_table_analysis: parse tables for PDF without a textual layer
            and images
        delimiter: column separator for CSV, TSV files
        encoding: encoding of TXT, CSV, TSV
>   selfr   r#   r   >   linenodepager   Got $ for `split`, but should be one of ``r(   treelinearstructure_typeneed_content_analysisN)localsitemsparsing_parametersvalid_split_values
ValueErrorr   r   r#   )r&   r#   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   keyvaluer/   s                      b/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/dedoc.py__init__DedocBaseLoader.__init__#   s    F %hnn.#
.
UGG CJ.#

 #G///ug ++,A/  
&"#'::#7X4B 01;K 78##
s
   B4B4c              #     #    SSK n SSKJn  U" U R	                  5       S9nSUR
                  S   l        UR                  5        nUR                  U R                  0 U R                  ESU0ES	9nSSS5        U R                  WR                  5       R                  5       U R                  S
9 Sh  vN   g! [         a    [        S5      ef = f! , (       d  f       Nf= f N/7f)Lazily load documents.r   N)DedocManagerzE`dedoc` package not found, please install it with `pip install dedoc`)manager_configTloggerattachments_dir)r#   
parametersdocument_treer   )tempfilededocr=   ImportError_make_configconfigdisabledTemporaryDirectoryparser#   r3   _split_documentto_api_schemadictr   )r&   rD   r=   dedoc_managertmpdirrC   s         r8   	lazy_loadDedocBaseLoader.lazy_loadw   s     	*
 %D4E4E4GH26X&/((*f)//..Qd55Q7H&Q 0 M +
 '''557<<>djj ( 
 	
 	
  	W 	 +*
	
s@   C$B8 8C$+C1AC$2C"3C$8CC$
CC$c                     g)z]
Make configuration for DedocManager according to the file extension and
parsing parameters.
N r&   s    r8   rG   DedocBaseLoader._make_config   s     	    	paragraphc                     SR                  US    Vs/ sH  nU R                  U5      PM     sn5      nU(       a  US    SU 3nU$ US   nU$ s  snf )z1Get text (recursively) of the document tree node.
subparagraphstext)join	_json2txt)r&   rX   subparagraphsubparagraphs_textr\   s        r8   r^   DedocBaseLoader._json2txt   s    !YY %.o$>$>L |,$>
 "  !$6#78 	
  6" 	
 s   ArC   document_metadatac              #      #    [        US   5      S:  a$  US    H  nU R                  X2S9 Sh  vN   M     g[        US   0 UEUS   ES9v   g N 7f)z4Parse recursively document tree obtained by `dedoc`.r[   r   rC   rb   Nr\   metadatapage_contentre   )len_parse_subparagraphsr
   )r&   rC   rb   r_   s       r8   ri   $DedocBaseLoader._parse_subparagraphs   sw      }_-.2 -o >44". 5    !?
 *62K-Kz1JK 	s   -AA!Aadditional_metadatac              #     #    US   nU(       a  0 UEUEnUS:X  a"  U R                  US   S   S9n[        XTS9v   GOUS:X  a  US   S   S   nUS	   S   S
   nSnU HO  n	U	S   S
   U:X  a  XR                  U	5      -  nM$  [        U0 UES
U0ES9v   U	S   S
   nU R                  U	5      nMQ     [        U0 UES
U0ES9v   O~US:X  a7  US   S   S    H'  n	U	S   n
[        U R                  U	5      0 UEU
ES9v   M)     OAUS:X  a  U R                  US   S   US9 Sh  vN   O[        SU SU R                   S35      eU R
                  (       a8  US   S    H,  nU R                  U5      u  p[        U0 US   ESUS.ES9v   M.     US    H(  nU R                  UU R                  SS0S9 Sh  vN   M*     g N N7f)z=Split document into parts according to the `split` parameter.re   r   content	structure)rX   rf   r)   r[   r   page_id r'   r(   rd   Nr*   r+   r,   tablestable)typetext_as_htmlattachmentsrs   
attachment)rC   r   rk   )	r^   r
   ri   r5   r4   r   
_get_tablerL   r   )r&   rC   r   rk   rb   r\   nodesro   	page_textr(   line_metadatarr   
table_text
table_htmlrv   s                  r8   rL   DedocBaseLoader._split_document   st     **5 L#4 L8K LJ>>M),D[,Q>RDIIf_!),[9/JEAhz*95GI
#I.'9!55I"%.!J$5!Jy'!J  #:.y9G $t 4I  &B-By'B 
 f_%i0=oN $Z 0!%!5C 1C]C  O f_00+I6{C"3 1    ug ++,A/ 
 &y1(;)-)?&
!+
+ '(2  < (6J++(jj%+\$: ,    7/0s%   DG F< BG 2F>3
G >G rr   c           
         SnUS    H4  nU H&  nUSR                  S US    5       5      -  nUS-  nM(     US-  nM6     SnUS    Hp  nUS	-  nU H]  nSR                  S
 US    5       5      n[        R                  " U5      nUS-  nUS   (       a  US-  nUSUS    SUS    SU S3-  nM_     US-  nMr     US-  nX%4$ )z.Get text and HTML representation of the table.rp   cells c              3   (   #    U H	  oS    v   M     g7fr\   NrT   .0r'   s     r8   	<genexpr>-DedocBaseLoader._get_table.<locals>.<genexpr>  s     &NF|   lines	rZ   zK<table border="1" style="border-collapse: collapse; width: 100%;">
<tbody>
z<tr>
c              3   (   #    U H	  oS    v   M     g7fr   rT   r   s     r8   r   r     s     %M}t6l}r   z<td	invisiblez style="display: none" z
 colspan="colspanz" rowspan="rowspanz">z</td>
z</tr>
z</tbody>
</table>)r]   htmlescape)r&   rr   r{   rowcellr|   	cell_texts          r8   rw   DedocBaseLoader._get_table   s   
>Cchh&NW&NNN
d"
  $J	 " 	 >C("J II%MtG}%MM	 KK	2	e#
$";;J i 1 2Y(9+W>
  )#J " 	**
%%rW   )r#   r3   r   r4   r   N)__name__
__module____qualname____firstlineno____doc__strboolr	   intr   r9   r   r
   rQ   r   rN   rG   r^   ri   rL   r   rw   __static_attributes__rT   rW   r8   r   r      s   (   -2*,#/!&,$*8=.348#'"&#RLRL 	RL
 RL  T	*RL %(RL !RL RL RL !$RL "RL &+39%5RL !d+RL "'sDy!1RL  C=!RL" 3-#RL$ 
%RLh
8H- 
, d  4 C !6:	(	& /3	II I &d^	I
 
(	IV& &sCx &rW   r   c                   &    \ rS rSrSrS\4S jrSrg)DedocFileLoaderi  a  
DedocFileLoader document loader integration to load files using `dedoc`.

The file loader automatically detects the file type (with the correct extension).
The list of supported file types is gives at
https://dedoc.readthedocs.io/en/latest/index.html#id1.
Please see the documentation of DedocBaseLoader to get more details.

Setup:
    Install ``dedoc`` package.

    .. code-block:: bash

        pip install -U dedoc

Instantiate:
    .. code-block:: python

        from langchain_community.document_loaders import DedocFileLoader

        loader = DedocFileLoader(
            file_path="example.pdf",
            # split=...,
            # with_tables=...,
            # pdf_with_text_layer=...,
            # pages=...,
            # ...
        )

Load:
    .. code-block:: python

        docs = loader.load()
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        Some text
        {
            'file_name': 'example.pdf',
            'file_type': 'application/pdf',
            # ...
        }

Lazy load:
    .. code-block:: python

        docs = []
        docs_lazy = loader.lazy_load()

        for doc in docs_lazy:
            docs.append(doc)
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        Some text
        {
            'file_name': 'example.pdf',
            'file_type': 'application/pdf',
            # ...
        }
r$   c                 Z    SSK Jn  U" U R                  U R                  U R                  S9$ )Nr   )make_manager_config)r#   parsing_paramsr   )dedoc.utils.langchainr   r#   r3   r   )r&   r   s     r8   rG   DedocFileLoader._make_config`  s)    ="nn22**
 	
rW   rT   N)r   r   r   r   r   rN   rG   r   rT   rW   r8   r   r     s    @D
d 
rW   r   c            #         ^  \ rS rSrSrSSSSSSS	S
SSSSSSSS.S\S\S\S\S\\\4   S\S\S\S\S\S\S\\\4   S\\\4   S\\\4   S\	\   S\	\   SS4"U 4S jjjr
S\\   4S  jrS\4S! jrS\S\S"\S\\\\\\4   4   4S# jrS$rU =r$ )%DedocAPIFileLoaderij  ay  
Load files using `dedoc` API.
The file loader automatically detects the file type (even with the wrong extension).
By default, the loader makes a call to the locally hosted `dedoc` API.
More information about `dedoc` API can be found in `dedoc` documentation:
    https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

Please see the documentation of DedocBaseLoader to get more details.

Setup:
    You don't need to install `dedoc` library for using this loader.
    Instead, the `dedoc` API needs to be run.
    You may use Docker container for this purpose.
    Please see `dedoc` documentation for more details:
        https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

    .. code-block:: bash

        docker pull dedocproject/dedoc
        docker run -p 1231:1231

Instantiate:
    .. code-block:: python

        from langchain_community.document_loaders import DedocAPIFileLoader

        loader = DedocAPIFileLoader(
            file_path="example.pdf",
            # url=...,
            # split=...,
            # with_tables=...,
            # pdf_with_text_layer=...,
            # pages=...,
            # ...
        )

Load:
    .. code-block:: python

        docs = loader.load()
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        Some text
        {
            'file_name': 'example.pdf',
            'file_type': 'application/pdf',
            # ...
        }

Lazy load:
    .. code-block:: python

        docs = []
        docs_lazy = loader.lazy_load()

        for doc in docs_lazy:
            docs.append(doc)
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python

        Some text
        {
            'file_name': 'example.pdf',
            'file_type': 'application/pdf',
            # ...
        }
zhttp://0.0.0.0:1231r   TFr   r   r   r   r   N)urlr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r$   c                f   > [         TU ]  UUUUUUUU	U
UUUUUUS9  X l        SU R                  S'   g)ad	  Initialize with file path, API url and parsing parameters.

Args:
    file_path: path to the file for processing
    url: URL to call `dedoc` API
    split: type of document splitting into parts (each part is returned
        separately), default value "document"
        "document": document is returned as a single langchain Document object
            (don't split)
        "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
        "node": split document into tree nodes (title nodes, list item nodes,
            raw text nodes)
        "line": split document into lines
    with_tables: add tables to the result - each table is returned as a single
        langchain Document object

    Parameters used for document parsing via `dedoc`
        (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

        with_attachments: enable attached files extraction
        recursion_deep_attachments: recursion level for attached files
            extraction, works only when with_attachments==True
        pdf_with_text_layer: type of handler for parsing PDF documents,
            available options
            ["true", "false", "tabby", "auto", "auto_tabby" (default)]
        language: language of the document for PDF without a textual layer and
            images, available options ["eng", "rus", "rus+eng" (default)],
            the list of languages can be extended, please see
            https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
        pages: page slice to define the reading range for parsing PDF documents
        is_one_column_document: detect number of columns for PDF without
            a textual layer and images, available options
            ["true", "false", "auto" (default)]
        document_orientation: fix document orientation (90, 180, 270 degrees)
            for PDF without a textual layer and images, available options
            ["auto" (default), "no_change"]
        need_header_footer_analysis: remove headers and footers from the output
            result for parsing PDF and images
        need_binarization: clean pages background (binarize) for PDF without a
            textual layer and images
        need_pdf_table_analysis: parse tables for PDF without a textual layer
            and images
        delimiter: column separator for CSV, TSV files
        encoding: encoding of TXT, CSV, TSV
)r#   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   jsonreturn_formatN)superr9   r   r3   )r&   r#   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   	__class__s                    r8   r9   DedocAPIFileLoader.__init__  s\    B 	#-'A 3#9!5(C/$; 	 	
" 390rW   c              #      #    U R                  U R                  U R                  U R                  S9nU R	                  XR
                  S9 Sh  vN   g N7f)r<   )r   r#   rA   rB   N)
_send_filer   r#   r3   rL   r   )r&   doc_trees     r8   rQ   DedocAPIFileLoader.lazy_load	  sJ     ??DNNt?V?V # 
 ''hjj'QQQs   AAAAc                     0 $ r   rT   rU   s    r8   rG   DedocAPIFileLoader._make_config  s    	rW   rA   c                    SSK n[        R                  R                  U5      n[	        US5       nSXV40nUR                  U S3XsS9nSSS5        WR                  S:w  a&  [        SUR                  R                  5        35      e[        R                  " UR                  R                  5       5      n	U	$ ! , (       d  f       Nt= f)	z7Send POST-request to `dedoc` API and return the resultsr   Nrbfilez/upload)filesdata   zError during file handling: )requestsospathbasenameopenpoststatus_coder5   rm   decoder   loads)
r&   r   r#   rA   r   	file_namer   r   rresults
             r8   r   DedocAPIFileLoader._send_file  s     	GG$$Y/	)T"di./EWoULA # ==C;AII<L<L<N;OPQQAII,,./ #"s   B77
C)r   )r   r   r   r   r   r   r   r	   r   r   r9   r   r
   rQ   rN   rG   r   listr   r   __classcell__)r   s   @r8   r   r   j  s   GZ ) -2*,#/!&,$*8=.348#'"&%S:S: 	S:
 S: S:  T	*S: %(S: !S: S: S: !$S: "S: &+39%5S: !d+S:  "'sDy!1!S:" C=#S:$ 3-%S:& 
'S: S:jR8H- Rd #&48	c5tS))	* rW   r   )r   r   r   abcr   r   typingr   r   r   r   r	   langchain_core.documentsr
   )langchain_community.document_loaders.baser   r   r   r   rT   rW   r8   <module>r      sQ      	 #  . @H&j# H&VJ
o J
Zx xrW   