o
    ZhB                     @   sT   d dl mZmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZ G dd deZdS )    )AnyIteratorListOptional)urljoinurlparse)Document)WebBaseLoaderc                       s   e Zd ZdZ					ddededee d	ed
edef fddZdee	 fddZ
	ddedee dee	 fddZdedee fddZ  ZS )GitbookLoaderztLoad `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the navbar.
    FNmainTweb_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressc                    sZ   |p|| _ | j dr| j dd | _ |r| j  d}t j|f||d || _|| _dS )a  Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True
        /Nz/sitemap.xml)Z	web_pathsr   r   )r   endswithsuper__init__r   r   )selfr   r   r   r   r   r   	__class__ c/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/gitbook.pyr      s   

zGitbookLoader.__init__returnc                 #   s     j r2  } |} fdd|D } |}t||D ]\}} ||}|r/|V  q dS   } | j}|rD|V  dS dS )z(Fetch text from one single GitBook page.c                    s   g | ]}t  j|qS r   )r   r   ).0pathr   r   r   
<listcomp>=       z+GitbookLoader.lazy_load.<locals>.<listcomp>N)r   Zscrape
_get_pathsZ
scrape_allzip_get_documentweb_path)r   Z	soup_infoZrelative_pathsurlsZ
soup_infosurldocr   r   r   	lazy_load8   s"   


zGitbookLoader.lazy_loadsoup
custom_urlc                 C   sX   | | j}|s
dS |jdd }| d}|r|jnd}|p"| j|d}t||dS )z,Fetch content from page and return Document.N
)	separatorZh1 )sourcetitle)Zpage_contentmetadata)findr   Zget_textstriptextr%   r   )r   r*   r+   Zpage_content_rawcontentZtitle_if_existsr0   r1   r   r   r   r$   J   s   
zGitbookLoader._get_documentc                 C   s   dd | dD S )z'Fetch all relative paths in the navbar.c                 S   s   g | ]}t |jjqS r   )r   r4   r   )r   locr   r   r   r    Y   r!   z,GitbookLoader._get_paths.<locals>.<listcomp>r6   )Zfind_all)r   r*   r   r   r   r"   W   s   zGitbookLoader._get_paths)FNr   FT)N)__name__
__module____qualname____doc__strboolr   r   r   r   r)   r   r$   r   r"   __classcell__r   r   r   r   r
   	   s<    	(
r
   N)typingr   r   r   r   urllib.parser   r   Zlangchain_core.documentsr   Z-langchain_community.document_loaders.web_baser	   r
   r   r   r   r   <module>   s
    