
    dhn                         S SK r S SKJrJrJrJr  S SKrS SKJr  S SK	J
r
  S SKJr  \ R                  " \5      r " S S\5      r " S S	\
5      rg)
    N)DictIteratorListUnion)Document)BaseBlobParser)Blobc                       \ rS rSrSrSrg)ServerUnavailableException   z7Exception raised when the Grobid server is unavailable. N)__name__
__module____qualname____firstlineno____doc____static_attributes__r       k/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_community/document_loaders/parsers/grobid.pyr   r      s    Ar   r   c            	       l    \ rS rSrSr SS\S\SS4S jjrS\S	\S\S\\	   4S
 jr
S\S\\	   4S jrSrg)GrobidParser   z)Load  article `PDF` files using `Grobid`.segment_sentencesgrobid_serverreturnNc                     Xl         X l         [        R                  " U5        g ! [        R                  R
                   a    [        R                  S5        [        ef = f)NzyGROBID server does not appear up and running,                 please ensure Grobid is installed and the server is running)	r   r   requestsget
exceptionsRequestExceptionloggererrorr   )selfr   r   s      r   __init__GrobidParser.__init__   sV    
 "3*	-LL'""33 	-LLM -,	-s	   % :A	file_pathxml_datac              #     #     SSK Jn  U" US5      nUR                  S5      nUR                  S5      nU(       a  US   R                  nOSn/ n	U GH  n
U
R                  S5      nUc  M  [        U
R                  S
5      5       GH  u  p/ n/ n[        UR                  S5      5       GH  u  nnUR                  UR                  5        / nUR                  S5      bo  UR                  S5      R                  S5       H:  nUR                  S5      nUR                  US   US   US   US   US   S.5        M<     UR                  U5        USL d  M  [        U5      S:  d  M  US   S   US   S   nnUR                  [        U5      U/UR                  UR                  S5      UU4S.nU	R                  U5        GM     USLd  GMG  US   S   S   US   S   S   nnSR                  U5      [        U5      UUR                  UR                  S5      UU4S.nU	R                  U5        GM     GM     U	 Vs/ sH|  n[        US   [        [        US   5      [        US   5      [        US   5      [        US   5      [        US   5      [        US   5      [        U5      [        U5      S .5      S!9PM~     sn S	h  vN   g	! [         a    [        S5      ef = fs  snf  N#7f)"z!Process the XML file from Grobin.r   )BeautifulSoupzA`bs4` package not found, please install it with `pip install bs4`xmldivtitlezNo title foundheadNpscoords;,            )pagexyhwTr7   n)textparabboxessection_titlesection_numberpages r>   r?   r@   rC   rA   rB   )r>   r?   r@   rC   rA   rB   paper_titler&   )page_contentmetadata)bs4r)   ImportErrorfind_allr>   find	enumerateappendr   splitlenstrjoinr   dict)r#   r&   r'   r   r)   soupsectionstitlesr,   chunkssectionsecti	paragraphchunk_bboxesparagraph_textsentencesbboxesbboxboxfpagelpagesentence_dictparagraph_dictchunks                            r   process_xmlGrobidParser.process_xml&   s    
	)
 Xu-=='w'1INNE$EG<<'D$-g.>.>s.C$DLA#%L%'N'01C1CC1H'I8&--hmm<"$#<<1=(0X(>(D(DS(I&*jjo '03A-0V-0V-0V-0V%&!" )J )//8-5CL1<L+21:f+=wr{6?R5E(0(+A+2)1526((3-*/-M #MM-85 (J6 )4(OA.v6(,R08  %
 %'GGN$;$'F&2-1YY.2hhsm&+U^* n5W %E  @  !
    "6] #E&M 2 #E&M 2"%eHo"6!$U7^!4),U?-C)D*-e4D.E*F'*5z%(^	  !
 	
 	
y  	S 	x
 	
sP   K,K AK,,C&K,K,'A!K,A1K,>BK% K,K*K,K""	K,blobc           
         UR                   nUc  [        S5      e[        US5      nSX#SSS040n 0 nS H  nSXV'   M	     S	S
/US'   U=(       d    0 n[        R                  " SU R
                  S S UUSS9nUR                  nUc  [        / 5      $ U R                  X(U R                  5      $ ! [        R                  R                   a    [        R                  S5        S n Naf = f)Nzblob.source cannot be None.rbinputzapplication/pdfExpires0)generateIDsconsolidateHeadersegmentSentences1r-   r/   teiCoordinatesPOST<   )headersparamsfilesdatatimeoutz%GROBID server timed out. Return None.)source
ValueErroropenr   requestr   r>   r   ReadTimeoutr!   r"   iterrf   r   )	r#   rh   r&   pdfrw   rx   paramrr'   s	            r   
lazy_parseGrobidParser.lazy_parse|   s   KK	:;;9d#9+<y#>NOP	57DQ! R&,c]D!"KRE  ""A vvH
 8O##I9O9OPP "".. 	LL@AH	s   AB, ,5C$#C$)r   r   )z1http://localhost:8070/api/processFulltextDocument)r   r   r   r   r   boolrP   r$   r   r   rf   r	   r   r   r   r   r   r   r      sw    3
 Q-- - 
	- T
T
(+T
@DT
	(	T
lQt Q(: Qr   r   )loggingtypingr   r   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr	   	getLoggerr   r!   	Exceptionr   r   r   r   r   <module>r      sF     . .  - D B			8	$	 	FQ> FQr   