o
    Zh!                     @   s   d Z ddlZddlmZmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZ erHddlmZ ddlmZ dd	lmZ dd
lmZmZmZ eeZG dd deZG dd deZG dd deZdS )zQLoader that uses Playwright to load a page, then uses unstructured to parse html.    N)ABCabstractmethod)TYPE_CHECKINGAsyncIteratorDictIteratorListOptional)Document)
BaseLoader)Browser)Page)Response)r   r   r   c                	   @   sL   e Zd ZdZedddddddefd	d
ZedddddddefddZdS )PlaywrightEvaluatorzAbstract base class for all evaluators.

    Each evaluator should take a page, a browser instance, and a response
    object, process the page as necessary, and return the resulting text.
    pager   browserr   responser   returnc                 C   s   dS )a  Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        N selfr   r   r   r   r   j/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/url_playwright.pyevaluate   s   zPlaywrightEvaluator.evaluate	AsyncPageAsyncBrowserAsyncResponsec                    s   dS )a  Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        Nr   r   r   r   r   evaluate_async*   s   z"PlaywrightEvaluator.evaluate_asyncN)__name__
__module____qualname____doc__r   strr   r   r   r   r   r   r      s    r   c                   @   s\   e Zd ZdZddeee  fddZdddd	d
ddefddZddddd
ddefddZ	dS )UnstructuredHtmlEvaluatorz@Evaluate the page HTML content using the `unstructured` library.Nremove_selectorsc                 C   s.   zddl }W n ty   tdw || _dS )z%Initialize UnstructuredHtmlEvaluator.r   NzQunstructured package not found, please install it with `pip install unstructured`)unstructuredImportErrorr#   )r   r#   r$   r   r   r   __init__>   s   
z"UnstructuredHtmlEvaluator.__init__r   r   r   r   r   r   r   c           	      C   sl   ddl m} | jp
g D ]}|| }|D ]}| r!|d qq| }||d}ddd |D S )z3Synchronously process the HTML content of the page.r   partition_htmlelement => element.remove()text

c                 S      g | ]}t |qS r   r!   .0elr   r   r   
<listcomp>V       z6UnstructuredHtmlEvaluator.evaluate.<locals>.<listcomp>	Zunstructured.partition.htmlr(   r#   locatorallZ
is_visibler   contentjoin	r   r   r   r   r(   selectorelementselementZpage_sourcer   r   r   r   J   s   

z"UnstructuredHtmlEvaluator.evaluater   r   r   c           	         s   ddl m} | jpg D ] }|| I dH }|D ]}| I dH r+|dI dH  qq| I dH }||d}ddd |D S )	z4Asynchronously process the HTML content of the page.r   r'   Nr)   r*   r,   c                 S   r-   r   r.   r/   r   r   r   r2   f   r3   z<UnstructuredHtmlEvaluator.evaluate_async.<locals>.<listcomp>r4   r9   r   r   r   r   X   s   
z(UnstructuredHtmlEvaluator.evaluate_asyncN)
r   r   r   r    r	   r   r!   r&   r   r   r   r   r   r   r"   ;   s    r"   c                   @   s   e Zd ZdZ					ddee dededeee  dee d	ee	eef  fd
dZ
dee fddZdee fddZdee fddZdS )PlaywrightURLLoadera  Load `HTML` pages with `Playwright` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
            through the specified proxy.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import PlaywrightURLLoader

            urls = ["https://api.ipify.org/?format=json",]
            proxy={
                "server": "https://xx.xx.xx:15818", # https://<host>:<port>
                "username": "username",
                "password": "password"
            }
            loader = PlaywrightURLLoader(urls, proxy=proxy)
            data = loader.load()
    TNurlscontinue_on_failureheadlessr#   	evaluatorproxyc                 C   s^   zddl }W n ty   tdw || _|| _|| _|| _|r&|r&td|p+t|| _dS )z%Load a list of URLs using Playwright.r   NzMplaywright package not found, please install it with `pip install playwright`z:`remove_selectors` and `evaluator` cannot be both not None)	
playwrightr%   r?   r@   rA   rC   
ValueErrorr"   rB   )r   r?   r@   rA   r#   rB   rC   rD   r   r   r   r&      s    
zPlaywrightURLLoader.__init__r   c           
      c   s   ddl m} | n}|jj| j| jd}| jD ]S}z.| }||}|du r/t	d| |
d | j|||}d|i}t||dV  W q tyk }	 z| jr_td	| d
|	  n|	W Y d}	~	qd}	~	ww |  W d   dS 1 s{w   Y  dS )zLoad the specified URLs using Playwright and create Document instances.

        Returns:
            A list of Document instances with loaded content.
        r   )sync_playwrightrA   rC   N"page.goto() returned None for url loadsourceZpage_contentmetadataError fetching or processing , exception: )playwright.sync_apirF   chromiumlaunchrA   rC   r?   new_pagegotorE   wait_for_load_staterB   r   r
   	Exceptionr@   loggererrorclose)
r   rF   pr   urlr   r   r+   rL   er   r   r   	lazy_load   s2   



"zPlaywrightURLLoader.lazy_loadc                    s   dd |   2 I dH S )Load the specified URLs with Playwright and create Documents asynchronously.
        Use this function when in a jupyter notebook environment.

        Returns:
            A list of Document instances with loaded content.
        c                    s   g | z3 d H W }|q6 S r=   r   )r0   docr   r   r   r2      s    z-PlaywrightURLLoader.aload.<locals>.<listcomp>N)
alazy_load)r   r   r   r   aload   s   zPlaywrightURLLoader.aloadc           
      C  s<  ddl m} | 4 I dH }|jj| j| jdI dH }| jD ]_}z:| I dH }||I dH }|du r<t	d| |
dI dH  | j|||I dH }d|i}t||dV  W q ty~ }	 z| jrrtd	| d
|	  n|	W Y d}	~	qd}	~	ww | I dH  W d  I dH  dS 1 I dH sw   Y  dS )r]   r   )async_playwrightNrG   rH   rI   rJ   rK   rM   rN   )playwright.async_apira   rP   rQ   rA   rC   r?   rR   rS   rE   rT   rB   r   r
   rU   r@   rV   rW   rX   )
r   ra   rY   r   rZ   r   r   r+   rL   r[   r   r   r   r_      s2   
.zPlaywrightURLLoader.alazy_load)TTNNN)r   r   r   r    r   r!   boolr	   r   r   r&   r   r
   r\   r`   r   r_   r   r   r   r   r>   i   s.    

	r>   )r    loggingabcr   r   typingr   r   r   r   r   r	   Zlangchain_core.documentsr
   Z)langchain_community.document_loaders.baser   rb   r   r   r   r   r   r   rO   	getLoggerr   rV   r   r"   r>   r   r   r   r   <module>   s     
&.