
    dh)                         S SK r S SKrS SKrS SKJrJrJrJrJrJ	r	  S SK
Jr  S SKJr  \R                  " \5      r\(       a  S SKJr   " S S\5      rg)    N)TYPE_CHECKINGAnyIteratorListOptionalTuple)Document)
BaseLoader)SparkSessionc            	           \ rS rSrSr    SS\S   S\\   S\S\4S	 jjr	S
\
\\4   4S jrS
\\   4S jrS
\\   4S jrSrg)PySparkDataFrameLoader   zLoad `PySpark` DataFrames.Nspark_sessionr   dfpage_content_columnfraction_of_memoryc                     SSK JnJn  U(       a  UOUR                  R                  5       U l        [        X%5      (       d  [        S[        U5       35      eX l
        X0l        X@l        U R                  5       u  U l        U l        U R                  R                   R#                  [$        5      U l        U R                  R(                  U l        g! [         a    [        S5      ef = f)a/  Initialize with a Spark DataFrame object.

Args:
    spark_session: The SparkSession object.
    df: The Spark DataFrame object.
    page_content_column: The name of the column containing the page content.
     Defaults to "text".
    fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
r   )	DataFramer   zFpyspark is not installed. Please install it with `pip install pyspark`z3Expected data_frame to be a PySpark DataFrame, got N)pyspark.sqlr   r   ImportErrorbuildergetOrCreatespark
isinstance
ValueErrortyper   r   r   get_num_rowsnum_rowsmax_num_rowsrddmaplistrdd_dfcolumnscolumn_names)selfr   r   r   r   r   r   s          n/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/pyspark_dataframe.py__init__PySparkDataFrameLoader.__init__   s     	; +M0D0D0P0P0R 	
 "((Ed2hZP  #6 "4+/+<+<+>(t(ggkkood+ GGOO%  	X 	s   C C(returnc                     SSK nU R                  R                  S5      R	                  5       S   n[
        R                  " U5      nUR                  5       nUR                  n[        Xd-  U R                  -  5      n[        XpR                  R                  5       5      U4$ ! [         a  n[        S5      UeSnAff = f)z4Gets the number of "feasible" rows for the DataFramer   NzBpsutil not installed. Please install it with `pip install psutil`.   )psutilr   r   limitcollectsys	getsizeofvirtual_memory	availableintr   mincount)r&   r-   erowestimated_row_sizemem_infoavailable_memoryr   s           r'   r   #PySparkDataFrameLoader.get_num_rows9   s    	
 ggmmA&&(+ ]]3/((*#--2d6M6MM
 <1<??  	T	s   B# #
B>-B99B>c              #   $  #    U R                   R                  5        Hi  n[        [        U5      5       Vs0 sH  o R                  U   X   _M     nnX0R
                     nUR                  U R
                  5        [        XCS9v   Mk     gs  snf 7f)z#A lazy loader for document content.)page_contentmetadataN)r#   toLocalIteratorrangelenr%   r   popr	   )r&   r8   ir?   texts        r'   	lazy_load PySparkDataFrameLoader.lazy_loadJ   sv     ;;..0C>CCHoNo))!,cf4oHN445DLL112@@	 1Ns   4BBA Bc                 @   U R                   R                  5       U R                  :  a>  [        R	                  SU R                   R                  5        SU R
                   S35        U R                  5       n[        [        R                  " XR
                  5      5      $ )zLoad from the dataframe.z The number of DataFrame rows is zQ, but we will only include the amount of rows that can reasonably fit in memory: .)
r   r6   r   loggerwarningr   rF   r"   	itertoolsislice)r&   lazy_load_iterators     r'   loadPySparkDataFrameLoader.loadR   sy    77==?T...NN2477==?2C D>>Bmm_AO
 "^^-I$$%7GHH    )r%   r   r   r   r   r   r#   r   )NNrE   g?)__name__
__module____qualname____firstlineno____doc__r   r   strfloatr(   r   r4   r   r   r	   rF   r   rO   __static_attributes__ rQ   r'   r   r      s    $ 37 #)$'$,/$, SM$, !	$,
 "$,L@eCHo @"A8H- A	Id8n 	IrQ   r   )rL   loggingr0   typingr   r   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   	getLogger__file__rJ   r   r   r   rZ   rQ   r'   <module>ra      s@      
 F F - @			8	$(KIZ KIrQ   