o
    Zh4#                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ eeZe ddd	d
dd
dZdededefddZG dd deZdS )    N)FutureThreadPoolExecutor)	AnyAsyncIteratorDictIteratorListOptionalTupleUnioncast)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageZRefererZDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                 C   sj   d|i}|  d }r| |d< | j dddid }r$|dd|d< |  d	 }r3|d
d|d< |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findZget_textget)r   r   metadatar   r   r    r$   f/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/async_html.py_build_metadata&   s   r&   c                   @   sx  e Zd ZdZ										d4ddddeeee f d	ee d
ee	 dee de	dee dede
deeeef  de	de	de	de	fddZdedefddZededdfddZ	d5ded e
d!e
d"edef
d#d$Zded%ejdeeef fd&d'Zd(ee de	deeeef  fd)d*Zd(ee dee fd+d,Zded-edefd.d/Zdee fd0d1Zdee fd2d3ZdS )6AsyncHtmlLoaderzLoad `HTML` asynchronously.NThtml.parser   F)preserve_order	trust_envweb_pathheader_template
verify_sslproxiesautoset_encodingencodingdefault_parserrequests_per_secondrequests_kwargsraise_for_statusignore_load_errorsr*   r+   c                C   s   t |tr
|g| _nt |tr|| _|pt}|ds8zddlm} | j|d< W n t	y7   t
d Y nw t | _t|| j_|| j_|rP| jj| || _|| _|	pYi | _|
| _|| _|| _|| _|| _|| _dS )zInitialize with a webpage path.r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)
isinstancestr	web_pathsr   default_header_templater"   Zfake_useragentr7   randomImportErrorloggerinforequestsSessionsessiondictheadersverifyr/   updater3   r2   r4   r5   r0   r1   r6   r*   r+   )selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r*   r+   rD   r7   r$   r$   r%   __init__5   s8   






zAsyncHtmlLoader.__init__r   r   c              
   C   sj   | j r*z| jj|fi | jW S  ty) } ztt| W Y d }~d S d }~ww | jj|fi | jS N)r6   rB   r"   r4   	Exceptionwarningswarnr9   )rG   r   er$   r$   r%   _fetch_valid_connection_docso   s   z,AsyncHtmlLoader._fetch_valid_connection_docsparserc                 C   s*   g d}| |vrt dd| d dS )z#Check that parser is valid for bs4.)r(   Zlxmlxmlzlxml-xmlZhtml5libz`parser` must be one of z, .N)
ValueErrorjoin)rO   Zvalid_parsersr$   r$   r%   _check_parsery   s   zAsyncHtmlLoader._check_parser         ?retriescooldownbackoffc                    s  t j| jd4 I d H }t|D ]}zjtd| jj| jj d| j	}| jj
s-d|d< |j|fi |4 I d H 7}z	| I d H }	W n tyW   td|  d}	Y nw |	W  d   I d H  W   W  d   I d H  S 1 I d H sww   Y  W q t jtfy }
 zS||d kr| jrtd| d	| d
 W Y d }
~
 W d   I d H  dS ||d kr td| d|d  d| d|
 d	 t|||  I d H  W Y d }
~
qd }
~
ww W d   I d H  td1 I d H sw   Y  td)N)r+   )rD   cookiesFsslzFailed to decode content from     zError fetching z after z	 retries.z with attempt /z: z. Retrying...zretry count exceededr$   )aiohttpZClientSessionr+   rangerC   rB   rD   rZ   get_dictr4   rE   r"   textUnicodeDecodeErrorr>   errorZClientConnectionErrorTimeoutErrorr6   warningasynciosleeprR   )rG   r   rW   rX   rY   rB   ikwargsresponserb   rM   r$   r$   r%   _fetch   sh   


$  zAsyncHtmlLoader._fetch	semaphorec              	      sR   |4 I d H  ||  |I d H fW  d   I d H  S 1 I d H s"w   Y  d S rI   )rl   )rG   r   rm   r$   r$   r%   _fetch_with_rate_limit   s   0z&AsyncHtmlLoader._fetch_with_rate_limiturlsc                   s   t  j fdd|D }z1ddlm} |r.||ddddD ]}|I d H V  q"W d S |j|ddddD ]}|I d H V  q7W d S  tyr   td	 |rat j	| I d H D ]}|V  qXY d S t |D ]}|I d H V  qfY d S w )
Nc                    s   g | ]}t  |qS r$   )rg   create_taskrn   ).0r   rG   rm   r$   r%   
<listcomp>   s    z3AsyncHtmlLoader._lazy_fetch_all.<locals>.<listcomp>r   )tqdm_asynciozFetching pagesTr]   )ZdescasciiZminintervalz2For better logging of progress, `pip install tqdm`)
rg   	Semaphorer3   Ztqdm.asynciort   as_completedr=   rK   rL   gather)rG   ro   r*   tasksrt   taskresultr$   rr   r%   _lazy_fetch_all   s8   


zAsyncHtmlLoader._lazy_fetch_allc                    s   dd |  |d2 I dH S )z/Fetch all urls concurrently with rate limiting.c                    s    g | z
3 d H W \}}|q6 S rI   r$   )rq   _docr$   r$   r%   rs      s    z-AsyncHtmlLoader.fetch_all.<locals>.<listcomp>TN)r|   )rG   ro   r$   r$   r%   	fetch_all   s   zAsyncHtmlLoader.fetch_allrb   c                 C   sL   ddl m} |drd}n| j}| | |||}t||}t||dS )Nr   )BeautifulSoupz.xmlrP   )Zpage_contentr#   )Zbs4r   endswithr2   rT   r&   r   )rG   r   rb   r   rO   r   r#   r$   r$   r%   _to_document   s   



zAsyncHtmlLoader._to_documentc                 c   s    z*t   tdd}|t j| | j}| }W d   n1 s%w   Y  W n ty=   t | | j}Y nw t	t
tt |D ]\}}| | j| |V  qGdS )+Lazy load text from the url(s) in web_path.r]   )max_workersN)rg   get_running_loopr   submitrunr   r:   r{   RuntimeError	enumerater   r   r9   r   )rG   executorfutureresultsri   rb   r$   r$   r%   	lazy_load   s"   

zAsyncHtmlLoader.lazy_loadc                 C  s8   |  | j| j2 z3 dH W \}}| ||V  q	6 dS )r   N)r|   r:   r*   r   )rG   r   rb   r$   r$   r%   
alazy_load   s   zAsyncHtmlLoader.alazy_load)
NTNTNr(   r)   NFF)rU   r)   rV   )__name__
__module____qualname____doc__r   r9   r   r	   rC   boolintr   r   rH   rN   staticmethodrT   floatrl   rg   rv   r
   rn   r   r|   r   r   r   r   r   r   r$   r$   r$   r%   r'   2   s    	

:
	
%


r'   ) rg   loggingrK   concurrent.futuresr   r   typingr   r   r   r   r   r	   r
   r   r   r_   r@   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z$langchain_community.utils.user_agentr   	getLoggerr   r>   r;   r9   rC   r&   r'   r$   r$   r$   r%   <module>   s*    ,
