
    dh1                         S SK r S SKJr  S SKJrJrJrJrJr  S SK	J
r
  S SKJr  \ R                  " \5      r\(       a  S SKr " S S\5      rg)    N)Path)TYPE_CHECKINGIteratorOptionalSequenceUnion)Document)
BaseLoaderc                       \ rS rSrSr    SS\\\4   S\\   S\\	\
      S\\   S\\   4
S	 jjrSS jrSSS
\4S jrS
\\   4S jrSrg)MWDumpLoader   a  Load `MediaWiki` dump from an `XML` file.

Example:
    .. code-block:: python

        from langchain_text_splitters import RecursiveCharacterTextSplitter
        from langchain_community.document_loaders import MWDumpLoader

        loader = MWDumpLoader(
            file_path="myWiki.xml",
            encoding="utf8"
        )
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=0
        )
        texts = text_splitter.split_documents(docs)


:param file_path: XML local file path
:type file_path: str
:param encoding: Charset encoding, defaults to "utf8"
:type encoding: str, optional
:param namespaces: The namespace of pages you want to parse.
    See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
    for a list of all common namespaces
:type namespaces: List[int],optional
:param skip_redirects: TR=rue to skip pages that redirect to other pages,
    False to keep them. False by default
:type skip_redirects: bool, optional
:param stop_on_error: False to skip over pages that cause parsing errors,
    True to stop. True by default
:type stop_on_error: bool, optional
N	file_pathencoding
namespacesskip_redirectsstop_on_errorc                     [        U[        5      (       a  UO
[        U5      U l        X l        X0l        X@l        XPl        g )N)
isinstancestrr   r   r   r   r   )selfr   r   r   r   r   s         j/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/mediawikidump.py__init__MWDumpLoader.__init__3   s3     '1C&@&@c)n $,*    returnc                      SS K nUR                  R                  [	        U R
                  U R                  S95      $ ! [         a  n[        S5      UeS nAff = f)Nr   zBUnable to import 'mwxml'. Please install with `pip install mwxml`.)r   )mwxmlImportErrorDump	from_fileopenr   r   )r   r   es      r   _load_dump_fileMWDumpLoader._load_dump_fileB   sU    	 zz##D$--$PQQ  	T	s   = 
AAApagez
mwxml.Pagec                      SSK nU HF  nUR                  UR                  5      nUR	                  SSSS9nSUR
                  0n[        XgS9s  $    g! [         a  n[        S5      UeSnAff = f)	zParse a single page.r   NzXUnable to import 'mwparserfromhell'. Please install with `pip install mwparserfromhell`.TF)	normalizecollapsekeep_template_paramssource)page_contentmetadata)mwparserfromhellr   parsetext
strip_codetitler	   )r   r%   r-   r"   revisioncoder/   r,   s           r   _load_single_page_from_dump(MWDumpLoader._load_single_page_from_dumpL   s    	# H#))(--8D??E # D !$**-HAA   	3 	s   A 
A.A))A.c              #     #    U R                  5       nUR                   Hh  nU R                  (       a  UR                  (       a  M'  U R                  (       a  UR
                  U R                  ;  a  MT   U R                  U5      v   Mj     g! [         aB  n[        R                  SR                  U5      5        U R                  (       a  Ue SnAM  SnAff = f7f)zLazy load from a file path.zParsing error: {}N)r#   pagesr   redirectr   	namespacer4   	Exceptionloggererrorformatr   )r   dumpr%   r"   s       r   	lazy_loadMWDumpLoader.lazy_load]   s     
 ##%JJD""t}}4>>#H66t<<   077:;%%Gs0   A1C4BC
C7CCCC)r   r   r   r   r   )utf8NFT)r   z
mwxml.Dump)__name__
__module____qualname____firstlineno____doc__r   r   r   r   r   intboolr   r#   r	   r4   r   r?   __static_attributes__ r   r   r   r      s    !L #).2).(,+d#+ 3-+ Xc]+	+
 !+  ~+RB B B"	(	r   r   )loggingpathlibr   typingr   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   	getLoggerrB   r;   r   r   rJ   r   r   <module>rQ      s;      E E - @			8	$a: ar   