o
    Zh                     @   sb   d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZ e eZG dd deZdS )    N)Path)IteratorOptionalSequenceUnion)Document)
BaseLoaderc                   @   s|   e Zd ZdZ				ddeeef dee deee	  d	ee
 d
ee
 f
ddZdd ZdefddZdee fddZdS )MWDumpLoadera  Load `MediaWiki` dump from an `XML` file.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)


    :param file_path: XML local file path
    :type file_path: str
    :param encoding: Charset encoding, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: The namespace of pages you want to parse.
        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces
    :type namespaces: List[int],optional
    :param skip_redirects: TR=rue to skip pages that redirect to other pages,
        False to keep them. False by default
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip over pages that cause parsing errors,
        True to stop. True by default
    :type stop_on_error: bool, optional
    utf8NFT	file_pathencoding
namespacesskip_redirectsstop_on_errorc                 C   s4   t |tr|nt|| _|| _|| _|| _|| _d S )N)
isinstancestrr   r   r   r   r   )selfr   r   r   r   r    r   i/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/mediawikidump.py__init__0   s
   
zMWDumpLoader.__init__c              
   C   sH   zdd l }W n ty } ztd|d }~ww |jt| j| jdS )Nr   zBUnable to import 'mwxml'. Please install with `pip install mwxml`.)r   )mwxmlImportErrorZDump	from_fileopenr   r   )r   r   er   r   r   _load_dump_file?   s   zMWDumpLoader._load_dump_filereturnc              
   C   sr   zddl }W n ty } ztd|d}~ww |D ]}||j}|jdddd}d|ji}t||d  S dS )	zParse a single page.r   NzXUnable to import 'mwparserfromhell'. Please install with `pip install mwparserfromhell`.TF)	normalizeZcollapseZkeep_template_paramssource)Zpage_contentmetadata)mwparserfromhellr   parsetextZ
strip_codetitler   )r   pager    r   revisioncoder"   r   r   r   r   _load_single_page_from_dumpI   s$   
z(MWDumpLoader._load_single_page_from_dumpc                 c   s    |   }|jD ]:}| jr|jrq| jr|j| jvrqz| |V  W q tyB } zt	d
| | jr8|W Y d}~qd}~ww dS )zLazy load from a file path.zParsing error: {}N)r   Zpagesr   redirectr   	namespacer'   	Exceptionloggererrorformatr   )r   dumpr$   r   r   r   r   	lazy_loadZ   s"   
zMWDumpLoader.lazy_load)r
   NFT)__name__
__module____qualname____doc__r   r   r   r   r   intboolr   r   r   r'   r   r/   r   r   r   r   r	      s,    &



r	   )loggingpathlibr   typingr   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   	getLoggerr0   r+   r	   r   r   r   r   <module>   s    
