o
    =ji{                     @   sN   d dl Z d dlZd dlZd dlmZ d dlZd dlmZ dedefddZ	dS )    N)
UploadFile)Imagefilereturnc           
   
      s  |   I dH }| dI dH  | jdkrzWtj|dd}d}|D ]
}|| d 7 }q!t| dk rftd	| j	 d
 d}|D ]$}|j
tddd}|d}tt|}t|}||d 7 }qA|  |W S  ty }	 ztd| j	 d|	  W Y d}	~	dS d}	~	ww | jdkrztt|}ddd |jD W S  ty }	 ztd| j	 d|	  W Y d}	~	dS d}	~	ww | jdkr|dS z|dW S    Y dS )z
    Extracts text from an uploaded file. Supports PDF, DOCX, and plain text.
    Uses PyMuPDF (fitz) for reliable PDF extraction and Tesseract for OCR if needed.
    Nr   zapplication/pdfpdf)streamfiletype 
2   z"Normal text extraction failed for z, attempting OCR...   )matrixpngzError reading PDF z with OCR fallback: zGapplication/vnd.openxmlformats-officedocument.wordprocessingml.documentc                 S   s   g | ]}|j qS  )text).0	paragraphr   r   :/var/www/html/Resume-Parser/resume-parser-inhouse/utils.py
<listcomp>-   s    z*extract_text_from_file.<locals>.<listcomp>zError reading DOCX z: z
text/plainzutf-8)readseekcontent_typefitzopenget_textlenstripprintfilename
get_pixmapMatrixtobytesr   ioBytesIOpytesseractimage_to_stringclose	ExceptiondocxDocumentjoin
paragraphsdecode)
r   contentdocr   pagepiximg_dataimg	page_texter   r   r   extract_text_from_file   sP   





r5   )
r"   r(   r   fastapir   r$   PILr   strr5   r   r   r   r   <module>   s    