import io
import docx
import fitz  # PyMuPDF
from fastapi import UploadFile
import pytesseract
from PIL import Image

# Media type that identifies a .docx upload.
_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
# PDFs yielding fewer stripped characters than this are treated as scanned
# documents and re-processed with OCR.
_OCR_MIN_TEXT_CHARS = 50
# Zoom factor applied on both axes when rasterizing pages for OCR.
_OCR_ZOOM = 2


def _extract_pdf_text(content: bytes, filename) -> str:
    """Return the text of a PDF held in *content*, using OCR when the text layer is nearly empty.

    Args:
        content: Raw bytes of the PDF.
        filename: Name used only in the diagnostic message printed before OCR.

    Returns:
        The text of every page, each followed by a newline.

    Raises:
        Exception: Whatever PyMuPDF, Pillow or pytesseract raise; the document
            is closed before the exception propagates.
    """
    doc = fitz.open(stream=content, filetype="pdf")
    try:
        text = "".join(page.get_text() + "\n" for page in doc)
        if len(text.strip()) >= _OCR_MIN_TEXT_CHARS:
            return text

        # Suspiciously little text: the PDF is probably scanned images.
        print(f"Normal text extraction failed for {filename}, attempting OCR...")
        zoom = fitz.Matrix(_OCR_ZOOM, _OCR_ZOOM)  # Zoom for better OCR
        ocr_pages = []
        for page in doc:
            png_bytes = page.get_pixmap(matrix=zoom).tobytes("png")
            with Image.open(io.BytesIO(png_bytes)) as img:
                ocr_pages.append(pytesseract.image_to_string(img) + "\n")
        return "".join(ocr_pages)
    finally:
        # Always release the document, including when extraction or OCR fails.
        doc.close()


def _extract_docx_text(content: bytes) -> str:
    """Return the paragraph texts of a DOCX held in *content*, joined by newlines."""
    document = docx.Document(io.BytesIO(content))
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


async def extract_text_from_file(file: UploadFile) -> str:
    """
    Extract text from an uploaded file. Supports PDF, DOCX, and plain text.

    PDFs are read with PyMuPDF (fitz); when the extracted text is shorter than
    ``_OCR_MIN_TEXT_CHARS`` characters the pages are rendered and passed to
    Tesseract instead. The upload is read fully and then rewound to offset 0 so
    it can be read again by the caller.

    The content type is matched case-insensitively and without parameters, so
    ``text/plain; charset=utf-8`` is handled as ``text/plain``.

    Args:
        file: The uploaded file; its ``content_type`` selects the extractor.

    Returns:
        The extracted text. An empty string is returned when a PDF or DOCX
        cannot be parsed, or when a file of unknown type is not valid UTF-8.
        Undecodable bytes in a ``text/plain`` upload are replaced with U+FFFD.

    NOTE(review): PDF rendering and OCR are synchronous calls made directly
    inside this coroutine, so they run without yielding to the event loop.
    """
    content = await file.read()
    await file.seek(0)  # Reset file pointer for potential re-reads

    # Drop parameters such as "; charset=utf-8" and normalise case; a missing
    # content type falls through to the generic UTF-8 attempt below.
    mime = (file.content_type or "").split(";", 1)[0].strip().lower()

    if mime == "application/pdf":
        try:
            return _extract_pdf_text(content, file.filename)
        except Exception as e:  # best effort: any parser/OCR failure yields ""
            print(f"Error reading PDF {file.filename} with OCR fallback: {e}")
            return ""

    if mime == _DOCX_MIME:
        try:
            return _extract_docx_text(content)
        except Exception as e:  # best effort: any parser failure yields ""
            print(f"Error reading DOCX {file.filename}: {e}")
            return ""

    if mime == "text/plain":
        # Declared as text: keep whatever is decodable rather than raising.
        return content.decode("utf-8", errors="replace")

    # Fallback: try to decode as utf-8 if it's likely a text file
    try:
        return content.decode("utf-8")
    except UnicodeDecodeError:
        return ""