
    dhb
                         S SK r S SKrS SKJr  S SKJrJrJr  S SKJ	r	  S SK
Jr  \R                  " \5      r " S S\5      rg)    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   r    \ rS rSrSr   SS\\\4   S\\S4   S\\S4   S\SS4
S	 jjr	S\
\   4S
 jrSrg)MHTMLLoader   z)Parse `MHTML` files with `BeautifulSoup`.N	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                 ~     SSK nXl        X l        Uc  SS0nX0l        X@l        g! [         a    [        S5      ef = f)as  initialize with path, and optionally, file encoding to use, and any kwargs
to pass to the BeautifulSoup object.

Args:
    file_path: Path to file to load.
    open_encoding: The encoding to use when opening the file.
    bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
    get_text_separator: The separator to use when getting the text
        from the soup.
r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`featureslxml)bs4ImportErrorr   r   r   r   )selfr   r   r   r   r   s         b/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/mhtml.py__init__MHTMLLoader.__init__   sU    "	 #*#V,I""4  	/ 	s   & <c              #     #    SSK Jn  [        U R                  SU R                  S9 n[
        R                  " UR                  5       5      nUR                  5       n[        U[        5      (       d  U/nU H  nUR                  5       S:X  d  M  UR                  SS9R                  5       nU" U40 U R                  D6nUR                  U R                  5      nUR                   (       a   [#        UR                   R$                  5      n	OSn	[#        U R                  5      U	S	.n
['        XS
9v     SSS5        g   SSS5        g! , (       d  f       g= f7f)z*Load MHTML document into document objects.r   )BeautifulSoupr)encodingz	text/htmlT)decode )sourcetitle)page_contentmetadataN)r   r   openr   r   emailmessage_from_stringreadget_payload
isinstancelistget_content_typer   r   get_textr   r!   strstringr   )r   r   fmessagepartsparthtmlsouptextr!   r#   s              r   	lazy_loadMHTMLLoader.lazy_load0   s     	&$..#0B0BCq//9G'')EeT** 	((*k9++4+8??AD(@@D==)@)@ADzz #DJJ$5$5 6 " #&dnn"5!&=H #HH1 DC  DCCs0   &EA$EB"E2	E;E=	E
EE)r   r   r   r   )NNr   )__name__
__module____qualname____firstlineno____doc__r   r-   r   dictr   r   r   r6   __static_attributes__     r   r
   r
      sr    3
 +/'+"$5d#5 S$Y'5 t$	5
  5 
5@8H- r@   r
   )r%   loggingpathlibr   typingr   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerr8   loggerr
   r?   r@   r   <module>rH      s8       ( ( - @			8	$@* @r@   