o
    Zh)<                     @   s   d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ eeZe dd	d
ddddZdededefddZG dd deZdS )zWeb base loader class.    N)AnyAsyncIteratorDictIteratorListOptionalSequenceUnion)
deprecated)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageZRefererZDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                 C   sj   d|i}|  d }r| |d< | j dddid }r$|dd|d< |  d	 }r3|d
d|d< |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r    r#   d/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/web_base.py_build_metadata   s   r%   c                &   @   sN  e Zd ZdZ															dFddd	d
eeee f dee de	dee de	de	dee dee de
dedeeeef  de	deeeef  deeeef  dede	de	ddf$ddZedefddZ	!dGd"ed#e
d$e
d%edef
d&d'Zd"ed(ejdefd)d*Zd+ee defd,d-Zed.eddfd/d0Z	dHd1ed+ee d.eedf dee fd2d3ZdHd+ee d.eedf dee fd4d5Z	dHd+ee d.eedf dee fd6d7Z		dId"ed.eedf dee defd8d9ZdHd.eedf defd:d;Zdee fd<d=Z de!e fd>d?Z"e#d@dAdBdCdee fdDdEZ$dS )JWebBaseLoaderaQ  
    WebBaseLoader document loader integration

    Setup:
        Install ``langchain_community``.

        .. code-block:: bash

            pip install -U langchain_community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(
                web_path = "https://www.espn.com/"
                # header_template = None,
                # verify_ssl = True,
                # proxies = None,
                # continue_on_failure = False,
                # autoset_encoding = True,
                # encoding = None,
                # web_paths = (),
                # requests_per_second = 2,
                # default_parser = "html.parser",
                # requests_kwargs = None,
                # raise_for_status = False,
                # bs_get_text_kwargs = None,
                # bs_kwargs = None,
                # session = None,
                # show_progress = True,
                # trust_env = False,
            )

    Lazy load:
        .. code-block:: python

            docs = []
            for doc in loader.lazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}


    Async load:
        .. code-block:: python

            docs = []
            async for doc in loader.alazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

    .. versionchanged:: 0.3.14

        Deprecated ``aload`` (which was not async) and implemented a native async
        ``alazy_load``. Expand below for more details.

        .. dropdown:: How to update ``aload``

            Instead of using ``aload``, you can use ``load`` for synchronous loading or
            ``alazy_load`` for asynchronous lazy loading.

            Example using ``load`` (synchronous):

            .. code-block:: python

                docs: List[Document] = loader.load()

            Example using ``alazy_load`` (asynchronous):

            .. code-block:: python

                docs: List[Document] = []
                async for doc in loader.alazy_load():
                    docs.append(doc)

            This is in preparation for accommodating an asynchronous ``aload`` in the
            future:

            .. code-block:: python

                docs: List[Document] = await loader.aload()

     NTFr#      html.parser)show_progress	trust_envweb_pathheader_template
verify_sslproxiescontinue_on_failureautoset_encodingencoding	web_pathsrequests_per_seconddefault_parserrequests_kwargsraise_for_statusbs_get_text_kwargs	bs_kwargssessionr*   r+   r   c                C   sD  |r|rt d|rt|| _n$t|tr|g| _nt|tr%t|| _ntdt| dt| d|	| _|
| _	|p=i | _
|| _|| _|pHi | _|pMi | _|rU|| _n?t }|p^t }|dszddlm} | j|d< W n ty   td Y nw t||_||_|r|j| || _|| _ || _!|| _"|| _#d	S )
a  Initialize loader.

        Args:
            web_paths: Web paths to load from.
            requests_per_second: Max number of concurrent requests to make.
            default_parser: Default parser to use for BeautifulSoup.
            requests_kwargs: kwargs for requests
            raise_for_status: Raise an exception if http status code denotes an error.
            bs_get_text_kwargs: kwargs for beatifulsoup4 get_text
            bs_kwargs: kwargs for beatifulsoup4 web page parsing
            show_progress: Show progress bar when loading pages.
            trust_env: set to True if using proxy to make web requests, for example
                using http(s)_proxy environment variables. Defaults to False.
        zmReceived web_path and web_paths. Only one can be specified. web_path is deprecated, web_paths should be used.z+web_path must be str or Sequence[str] got (z*) or web_paths must be Sequence[str] got ()r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)$
ValueErrorlistr3   
isinstancestrr   	TypeErrortyper4   r5   r6   r7   r*   r8   r9   r:   requestsSessiondefault_header_templatecopyr!   Zfake_useragentr<   randomImportErrorloggerinfodictheadersverifyr/   updater0   r1   r2   r+   )selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r*   r+   r<   r#   r#   r$   __init__   sX   $








zWebBaseLoader.__init__c                 C   s    t | jdkrtd| jd S )N   zMultiple webpaths found.r   )lenr3   r=   )rO   r#   r#   r$   r,      s   
zWebBaseLoader.web_path         ?r   retriescooldownbackoffc           
         s  t j| jd4 I d H }t|D ]}zWt| jj| jj d}| jj	s(d|d< |j
|fi | j|B 4 I d H &}| jr@|  | I d H W  d   I d H  W   W  d   I d H  S 1 I d H sdw   Y  W q t jy }	 z-||d krz td| d|d  d| d	|	 d
	 t|||  I d H  W Y d }	~	qd }	~	ww W d   I d H  td1 I d H sw   Y  td)N)r+   )rL   cookiesFsslrQ   Error fetching z with attempt /z: z. Retrying...zretry count exceeded)aiohttpZClientSessionr+   rangerK   r:   rL   rX   get_dictrM   r!   r6   r7   textZClientConnectionErrorrI   warningasynciosleepr=   )
rO   r   rU   rV   rW   r:   ikwargsresponseer#   r#   r$   _fetch   sT   

$zWebBaseLoader._fetch	semaphorec                    s   |4 I d H H z|  |I d H W W  d   I d H  S  tyN } z'| jr?td| d W Y d }~W d   I d H  dS td| d |d }~ww 1 I d H sUw   Y  d S )NrZ   z*, skipping due to continue_on_failure=Truer'   za and aborting, use continue_on_failure=True to continue loading urls after encountering an error.)rg   	Exceptionr0   rI   r`   	exception)rO   r   rh   rf   r#   r#   r$   _fetch_with_rate_limit  s&   



z$WebBaseLoader._fetch_with_rate_limiturlsc                    s   t | j}g }|D ]}t | ||}|| qz | jr4ddlm} |j	|ddddI dH W S t j	| I dH W S  t
yR   td t j	| I dH  Y S w )	z/Fetch all urls concurrently with rate limiting.r   )tqdm_asynciozFetching pagesTrQ   )ZdescasciiZminintervalNz2For better logging of progress, `pip install tqdm`)ra   	Semaphorer4   ensure_futurerk   appendr*   Ztqdm.asynciorm   gatherrH   warningswarn)rO   rl   rh   tasksr   taskrm   r#   r#   r$   	fetch_all  s"   
zWebBaseLoader.fetch_allparserc                 C   s*   g d}| |vrt dd| d dS )z#Check that parser is valid for bs4.)r)   Zlxmlxmlzlxml-xmlZhtml5libz`parser` must be one of z, .N)r=   join)rx   Zvalid_parsersr#   r#   r$   _check_parser.  s   zWebBaseLoader._check_parserresultsc           	      C   sp   ddl m} g }t|D ])\}}|| }|du r(|dr d}n| j}| | ||||fi | j q|S )z0Unpack fetch results into BeautifulSoup objects.r   BeautifulSoupN.xmlry   )bs4r   	enumerateendswithr5   r|   rq   r9   )	rO   r}   rl   rx   r   Zfinal_resultsrc   resultr   r#   r#   r$   _unpack_fetch_results7  s   

z#WebBaseLoader._unpack_fetch_resultsc                 C   s    t | |}| j|||dS )z2Fetch all urls, then return soups for all results.rx   )ra   runrw   r   rO   rl   rx   r}   r#   r#   r$   
scrape_allI  s   zWebBaseLoader.scrape_allc                    s"   |  |I dH }| j|||dS )z8Async fetch all urls, then return soups for all results.Nr   )rw   r   r   r#   r#   r$   ascrape_allN  s   zWebBaseLoader.ascrape_allc                 C   s   ddl m} |d u r|drd}n| j}| | | jj|fi | j}| jr,|  | j	d ur6| j	|_	n| j
r=|j|_	||j|fi |pFi S )Nr   r~   r   ry   )r   r   r   r5   r|   r:   r!   r6   r7   r2   r1   apparent_encodingr_   )rO   r   rx   r9   r   Zhtml_docr#   r#   r$   _scrapeU  s   



zWebBaseLoader._scrapec                 C   s"   |du r| j }| j| j|| jdS )z?Scrape data from webpage and return it in BeautifulSoup format.N)rx   r9   )r5   r   r,   r9   )rO   rx   r#   r#   r$   scrapeo  s   zWebBaseLoader.scrapec                 c   sL    | j D ]}| j|| jd}|jdi | j}t||}t||dV  qdS )z+Lazy load text from the url(s) in web_path.)r9   Zpage_contentr"   Nr#   )r3   r   r9   r    r8   r%   r   )rO   pathr   r_   r"   r#   r#   r$   	lazy_loadw  s   

zWebBaseLoader.lazy_loadc                 C  sX   |  | jI dH }t| j|D ]\}}|jdi | j}t||}t||dV  qdS )z1Async lazy load text from the url(s) in web_path.Nr   r#   )r   r3   zipr    r8   r%   r   )rO   r}   r   r   r_   r"   r#   r#   r$   
alazy_load  s   
zWebBaseLoader.alazy_loadz0.3.14z1.0zSee API reference for updated usage: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html)ZsinceZremovalmessagec                 C   sX   |  | j}g }t| j|D ]\}}|jdi | j}t||}|t||d q|S )z9Load text from the urls in web_path async into Documents.r   Nr#   )r   r3   r   r    r8   r%   rq   r   )rO   r}   docsr   r   r_   r"   r#   r#   r$   aload  s   
zWebBaseLoader.aload)r'   NTNFTNr#   r(   r)   NFNNN)rS   r(   rT   )N)NN)%__name__
__module____qualname____doc__r	   r@   r   r   rK   boolintr   r   rP   propertyr,   floatrg   ra   ro   rk   r   rw   staticmethodr|   r   r   r   r   r   r   r   r   r   r   r
   r   r#   r#   r#   r$   r&   *   s    e	

U

	

(




r&   )r   ra   loggingrs   typingr   r   r   r   r   r   r   r	   r\   rC   Zlangchain_core._apir
   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z$langchain_community.utils.user_agentr   	getLoggerr   rI   rE   r@   rK   r%   r&   r#   r#   r#   r$   <module>   s,    (
