
    dh                         S SK rS SKrS SKJr  S SKJrJrJr  S SK	J
r
  S SKJr  \R                  " \5      r " S S\5      rg)    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   r    \ rS rSrSr   SS\\\4   S\\S4   S\\S4   S\SS4
S	 jjr	S\
\   4S
 jrSrg)BSHTMLLoader   a  
__ModuleName__ document loader integration

Setup:
    Install ``langchain-community`` and ``bs4``.

    .. code-block:: bash

        pip install -U langchain-community bs4

Instantiate:
    .. code-block:: python

        from langchain_community.document_loaders import BSHTMLLoader

        loader = BSHTMLLoader(
            file_path="./example_data/fake-content.html",
        )

Lazy load:
    .. code-block:: python

        docs = []
        docs_lazy = loader.lazy_load()

        # async variant:
        # docs_lazy = await loader.alazy_load()

        for doc in docs_lazy:
            docs.append(doc)
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python


        Test Title


        My First Heading
        My first paragraph.



        {'source': './example_data/fake-content.html', 'title': 'Test Title'}

Async load:
    .. code-block:: python

        docs = await loader.aload()
        print(docs[0].page_content[:100])
        print(docs[0].metadata)

    .. code-block:: python



        Test Title


        My First Heading
        My first paragraph.



        {'source': './example_data/fake-content.html', 'title': 'Test Title'}

N	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                      SSK nXl        X l        Uc3  [        R
                  R                  S5      (       d  [        S5      eSS0nX0l        X@l        g! [         a    [        S5      ef = f)aq  initialize with path, and optionally, file encoding to use, and any kwargs
to pass to the BeautifulSoup object.

Args:
    file_path: The path to the file to load.
    open_encoding: The encoding to use when opening the file.
    bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
    get_text_separator: The separator to use when calling get_text on the soup.
r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`lxmlzBy default BSHTMLLoader uses the 'lxml' package. Please either install it with `pip install -U lxml` or pass in init arg `bs_kwargs={'features': '...'}` to overwrite the default BeautifulSoup kwargs.features)	bs4ImportErrorr   r   	importlibutil	find_specr   r   )selfr   r   r   r   r   s         d/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/html_bs.py__init__BSHTMLLoader.__init__S   s~     	 #*>>++F33!,  $V,I""4%  	/ 	s   A A+c              #     #    SSK Jn  [        U R                  SU R                  S9 nU" U40 U R
                  D6nSSS5        WR                  U R                  5      nUR                  (       a   [        UR                  R                  5      nOSn[        U R                  5      US.n[        XFS9v   g! , (       d  f       N= f7f)	z)Load HTML document into document objects.r   )BeautifulSoupr)encodingN )sourcetitle)page_contentmetadata)r   r   openr   r   r   get_textr   r#   strstringr   )r   r   fsouptextr#   r%   s          r   	lazy_loadBSHTMLLoader.lazy_loady   s     %$..#0B0BCq 5dnn5D D }}T445::

))*EE $..)1
 D<< DCs   &CB6A:C6
C C)r   r   r   r   )NNr!   )__name__
__module____qualname____firstlineno____doc__r   r(   r   dictr   r   r   r-   __static_attributes__     r   r
   r
      sv    CP +/'+"$$5d#$5 S$Y'$5 t$	$5
  $5 
$5L=8H- =r7   r
   )importlib.utilr   loggingpathlibr   typingr   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerr/   loggerr
   r6   r7   r   <module>r@      s8       ( ( - @			8	$~=: ~=r7   