o
    Zh                     @   sV   d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	 e
eZG dd de	ZdS )z1Loader that uses unstructured to load HTML files.    N)AnyIteratorList)Document)
BaseLoaderc                   @   sh   e Zd ZdZ				ddee dedededed	ed
dfddZd
ee	 fddZ
d
ee	 fddZdS )NewsURLLoadera/  Load news articles from URLs using `Unstructured`.

    Args:
        urls: URLs to load. Each is loaded into its own document.
        text_mode: If True, extract text from URL and use that for page content.
            Otherwise, extract raw HTML.
        nlp: If True, perform NLP on the extracted contents, like providing a summary
            and extracting keywords.
        continue_on_failure: If True, continue loading documents even if
            loading fails for a particular URL.
        show_progress_bar: If True, use tqdm to show a loading progress bar. Requires
            tqdm to be installed, ``pip install tqdm``.
        **newspaper_kwargs: Any additional named arguments to pass to
            newspaper.Article().

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import NewsURLLoader

            loader = NewsURLLoader(
                urls=["<url-1>", "<url-2>"],
            )
            docs = loader.load()

    Newspaper reference:
        https://newspaper.readthedocs.io/en/latest/
    TFurls	text_modenlpcontinue_on_failureshow_progress_barnewspaper_kwargsreturnNc                 K   sT   z
ddl }|j| _W n ty   tdw || _|| _|| _|| _|| _|| _	dS )zInitialize with file path.r   NzMnewspaper package not found, please install it with `pip install newspaper3k`)
	newspaper__version__Z_NewsURLLoader__versionImportErrorr   r	   r
   r   r   r   )selfr   r	   r
   r   r   r   r    r   `/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/news.py__init__+   s   

zNewsURLLoader.__init__c              
   C   sR   |   }| jr%zddlm} W n ty  } ztd|d }~ww ||}t|S )Nr   )tqdmzPackage tqdm must be installed if show_progress_bar=True. Please install with 'pip install tqdm' or set show_progress_bar=False.)	lazy_loadr   r   r   list)r   iterr   er   r   r   loadF   s   zNewsURLLoader.loadc                 c   sR   zddl m} W n ty } ztd|d }~ww | jD ]}z||fi | j}|  |  | jr9|  W n" ty\ } z| j	rVt
d| d|  W Y d }~q|d }~ww t|ddt|dt|d	dt|d
g t|ddt|ddt|ddd}| jr|j}n|j}| jrt|dg |d< t|dd|d< t||dV  qd S )Nr   )ArticlezFCannot import newspaper, please install with `pip install newspaper3k`zError fetching or processing z, exception: title urlZcanonical_linkauthorsZ	meta_langZmeta_descriptionpublish_date)r   linkr    languagedescriptionr!   keywordssummary)Zpage_contentmetadata)r   r   r   r   r   downloadparser
   	Exceptionr   loggererrorgetattrr	   texthtmlr   )r   r   r   r   Zarticler'   contentr   r   r   r   T   sR   





	zNewsURLLoader.lazy_load)TFTF)__name__
__module____qualname____doc__r   strboolr   r   r   r   r   r   r   r   r   r   r      s.     
r   )r4   loggingtypingr   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   	getLoggerr1   r+   r   r   r   r   r   <module>   s    
