o
    Zh)                     @   s~   d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ eeZer5d dlmZ G dd deZdS )    N)TYPE_CHECKINGAnyIteratorListOptionalTuple)Document)
BaseLoader)SparkSessionc                	   @   sv   e Zd ZdZ				dded dee ded	efd
dZde	e
e
f fddZdee fddZdee fddZdS )PySparkDataFrameLoaderzLoad `PySpark` DataFrames.Ntext皙?spark_sessionr
   dfpage_content_columnfraction_of_memoryc                 C   s   z
ddl m}m} W n ty   tdw |r|n|j | _t||s.tdt	| || _
|| _|| _|  \| _| _| j
jt| _| j
j| _dS )ag  Initialize with a Spark DataFrame object.

        Args:
            spark_session: The SparkSession object.
            df: The Spark DataFrame object.
            page_content_column: The name of the column containing the page content.
             Defaults to "text".
            fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
        r   )	DataFramer
   zFpyspark is not installed. Please install it with `pip install pyspark`z3Expected data_frame to be a PySpark DataFrame, got N)pyspark.sqlr   r
   ImportErrorbuilderZgetOrCreateZspark
isinstance
ValueErrortyper   r   r   get_num_rowsnum_rowsmax_num_rowsZrddmaplistrdd_dfcolumnscolumn_names)selfr   r   r   r   r   r
    r"   m/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/pyspark_dataframe.py__init__   s&   
zPySparkDataFrameLoader.__init__returnc              
   C   s   zddl }W n ty } ztd|d}~ww | jd d }t|}| }|j}t	|| | j
 }t|| j |fS )z4Gets the number of "feasible" rows for the DataFramer   NzBpsutil not installed. Please install it with `pip install psutil`.   )psutilr   r   limitZcollectsys	getsizeofZvirtual_memory	availableintr   mincount)r!   r'   erowZestimated_row_sizeZmem_infoZavailable_memoryr   r"   r"   r#   r   9   s$   
z#PySparkDataFrameLoader.get_num_rowsc                 #   sV    j  D ]"  fddtt D }|j }|j t||dV  qdS )z#A lazy loader for document content.c                    s   i | ]
}j |  | qS r"   )r    ).0ir0   r!   r"   r#   
<dictcomp>M   s    z4PySparkDataFrameLoader.lazy_load.<locals>.<dictcomp>)Zpage_contentmetadataN)r   ZtoLocalIteratorrangelenr   popr   )r!   r5   r   r"   r3   r#   	lazy_loadJ   s   
z PySparkDataFrameLoader.lazy_loadc                 C   sJ   | j  | jkrtd| j   d| j d |  }tt	|| jS )zLoad from the dataframe.z The number of DataFrame rows is zQ, but we will only include the amount of rows that can reasonably fit in memory: .)
r   r.   r   loggerwarningr   r9   r   	itertoolsislice)r!   Zlazy_load_iteratorr"   r"   r#   loadR   s   zPySparkDataFrameLoader.load)NNr   r   )__name__
__module____qualname____doc__r   r   strfloatr$   r   r,   r   r   r   r9   r   r?   r"   r"   r"   r#   r      s$    
&r   )r=   loggingr)   typingr   r   r   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser	   	getLogger__file__r;   r   r
   r   r"   r"   r"   r#   <module>   s     
