o
    Zh                     @   sr   d Z ddlZddlmZmZmZmZmZ erddlm	Z	m
Z
 ddlmZ ddlmZ eeZG dd deZdS )	zRLoader that uses Selenium to load a page, then uses unstructured to load the html.    N)TYPE_CHECKINGListLiteralOptionalUnionChromeFirefox)Document)
BaseLoaderc                   @   s   e Zd ZdZdddddg fdee deded d	ee d
ee dedee fddZ	de
d fddZdede
d defddZdee fddZdS )SeleniumURLLoadera  Load `HTML` pages with `Selenium` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        browser (str): The browser to use, either 'chrome' or 'firefox'.
        binary_location (Optional[str]): The location of the browser binary.
        executable_path (Optional[str]): The path to the browser executable.
        headless (bool): If True, the browser will run in headless mode.
        arguments [List[str]]: List of arguments to pass to the browser.
    TchromeNurlscontinue_on_failurebrowser)r   firefoxbinary_locationexecutable_pathheadless	argumentsc           
      C   sv   zddl }W n ty   tdw zddl}	W n ty#   tdw || _|| _|| _|| _|| _|| _|| _	dS )z4Load a list of URLs using Selenium and unstructured.r   NzIselenium package not found, please install it with `pip install selenium`zQunstructured package not found, please install it with `pip install unstructured`)
seleniumImportErrorunstructuredr   r   r   r   r   r   r   )
selfr   r   r   r   r   r   r   r   r    r   h/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/url_selenium.py__init__   s*   
zSeleniumURLLoader.__init__returnr   c           	      C   sB  | j  dkrQddlm} ddlm} ddlm} | }| jD ]}|	| q| j
r4|	d |	d | jdur=| j|_| jdu rG||d	S |||| jd
dS | j  dkrddlm} ddlm} ddlm} | }| jD ]}|	| qp| j
r|	d | jdur| j|_| jdu r||d	S |||| jd
dS td)a  Create and return a WebDriver instance based on the specified browser.

        Raises:
            ValueError: If an invalid browser is specified.

        Returns:
            Union[Chrome, Firefox]: A WebDriver instance for the specified browser.
        r   r   )r   )Options)Servicez
--headlessz--no-sandboxN)options)r   )r    Zservicer   )r	   z5Invalid browser specified. Use 'chrome' or 'firefox'.)r   lowerselenium.webdriverr   Z!selenium.webdriver.chrome.optionsr   Z!selenium.webdriver.chrome.servicer   r   add_argumentr   r   r   r	   Z"selenium.webdriver.firefox.optionsZ"selenium.webdriver.firefox.service
ValueError)	r   r   ZChromeOptionsr   Zchrome_optionsargr	   ZFirefoxOptionsZfirefox_optionsr   r   r   _get_driverB   sH   	












zSeleniumURLLoader._get_driverurldriverc           	      C   s   ddl m} ddlm} 	 |dddd}|j }r||d< z||jd	 }r0|d
p-d|d< W n	 |y:   Y nw z||jd }rQ|dpKd|d< W |S W |S  |y]   Y |S w )Nr   )NoSuchElementException)ByzNo title found.zNo description found.zNo language found.)sourcetitledescriptionlanguager,   z//meta[@name="description"]contentr-   htmllangr.   )	Zselenium.common.exceptionsr)   Zselenium.webdriver.common.byr*   r,   Zfind_elementZXPATHZget_attributeZTAG_NAME)	r   r'   r(   r)   r*   metadatar,   r-   Zhtml_tagr   r   r   _build_metadataw   s>   
z!SeleniumURLLoader._build_metadatac           
      C   s   ddl m} t }|  }| jD ]M}z(|| |j}||d}ddd |D }| ||}|	t
||d W q ty] }	 z| jrQtd| d	|	  n|	W Y d
}	~	qd
}	~	ww |  |S )zLoad the specified URLs using Selenium and create Document instances.

        Returns:
            List[Document]: A list of Document instances with loaded content.
        r   )partition_html)textz

c                 S   s   g | ]}t |qS r   )str).0elr   r   r   
<listcomp>   s    z*SeleniumURLLoader.load.<locals>.<listcomp>)page_contentr2   zError fetching or processing z, exception: N)Zunstructured.partition.htmlr4   listr&   r   getZpage_sourcejoinr3   appendr
   	Exceptionr   loggererrorquit)
r   r4   docsr(   r'   r:   elementsr5   r2   er   r   r   load   s(   


zSeleniumURLLoader.load)__name__
__module____qualname____doc__r   r6   boolr   r   r   r   r&   dictr3   r
   rF   r   r   r   r   r      s4    
#5r   )rJ   loggingtypingr   r   r   r   r   r"   r   r	   Zlangchain_core.documentsr
   Z)langchain_community.document_loaders.baser   	getLoggerrG   r@   r   r   r   r   r   <module>   s    
