o
    Vh P                     @   s`  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlZd dlmZmZmZmZ d dlZd dlZd dlmZmZmZmZ ejejdZd	d
 Zdd ZdedefddZ d-de!de"de#fddZ$de!defddZ%de!defddZ&de!defddZ'd e!de!fd!d"Z(d e!defd#d$Z)d.d'd(Z*d)d* Z+d+d, Z,dS )/    N)settings)JsonResponse)require_http_methods)DevelopmentApplication)EXTRACT_STRUCTURED_DATA_PROMPTTITLE_SEARCH_EXTRACTION_PROMPT"APPLICATION_FORM_EXTRACTION_PROMPT APPLICATION_FORM_FALLBACK_PROMPT)RateLimitErrorAPIErrorAPIConnectionErrorAPITimeoutError)api_keyc                 C   s&   zt |  W S  ty   Y d S w )N)pdto_datetimedate	Exception)date_str r   8/home/shobhit/Desktop/shao (1)/shao/shaoApp/functions.py
parse_date   s
   r   c                 C   s*   t | s| dks| du rdS t|  S )z7Safely get value, handling None, NaN, and empty strings N)r   isnastrstrip)valr   r   r   safe_get   s   r   datareturnc              
   C   s   t | tsi S ddddddd}i }| D ]M\}}| ||}|drNz|du s.|dkr3d||< n
ttt|||< W q tt	fyM   d||< Y qw |du sV|dkr[d||< qt|
 ||< q|S )z7Validate and clean structured data from OpenAI responser   r   )development_typenumber_of_dwellingsnumber_of_storeysnumber_of_placesnumber_of_unitsnumber_of_lots
number_of_N)
isinstancedictitemsget
startswithintfloatr   
ValueError	TypeErrorr   )r   expected_fieldsvalidated_datafielddefault_valuevaluer   r   r   validate_structured_data"   s2   
	


r4            ?promptmax_retries
base_delayc                 C   s  t |D ]}ztjjjdd| dgddd}|jd jj }|dfW   S  t	ys } zB||d	 k r[|d
|  t
dd	 }td|dd|d	  d| d t| W Y d}~qdd| dt| fW  Y d}~  S d}~w tttfy } zB||d	 k r|d
|  t
dd	 }td|dd|d	  d| d t| W Y d}~qdd| dt| fW  Y d}~  S d}~w ty } zddt| fW  Y d}~  S d}~ww dS )z@Make OpenAI API request with exponential backoff and retry logiczgpt-4o-2024-05-13user)rolecontentg?i  )modelmessagestemperature
max_tokensr   N      zRate limit hit, retrying in z.2fz seconds... (attempt /)zRate limit exceeded after z attempts: zAPI error, retrying in zAPI error after zUnexpected error: )NzMax retries exceeded)rangeclientchatcompletionscreatechoicesmessager<   r   r
   randomuniformprinttimesleepr   r   r   r   r   )r7   r8   r9   attemptresponser<   edelayr   r   r   make_openai_request_with_retryI   s>   
"
("
("rU   descriptionc              
   C   s$  | r|   si S d}t| |kr| d| d } tj| d}t|\}}|r/td|  i S |s7td i S z|drItdd	|}|	d}t
|}t|}|W S  t
jyw } ztd
|  td|  i W  Y d}~S d}~w ty } ztd|  i W  Y d}~S d}~ww )zCExtract structured data from description with proper error handlingi  N...)rV   OpenAI request failed: Empty response from OpenAI```^```[a-z]*\n?r   zJSON decoding error: Raw content: z%Unexpected error in data processing: )r   lenr   formatrU   rN   r*   resubrstripjsonloadsr4   JSONDecodeErrorr   )rV   
max_lengthr7   r<   errorr   r0   rS   r   r   r   extract_structured_dataq   s<   


rg   pdf_pathc                 C   s^  zt | }t|j}d}d}|jD ]}| }|r-t| dkr-|d7 }|t|7 }q|dkr6|| nd}|dkr@|| nd}|dkrO|dkrOd}	d}
n|dkr\|d	kr\d}	d
}
n|dkri|dkrid}	d}
nd}	d}
|	|
||t|dt|d|dW  d   W S 1 sw   Y  W dS  ty } zddt|ddddddW  Y d}~S d}~ww )z>Detect if PDF is digital (text-based) or scanned (image-based)r   
   rA   gffffff?d   digitalhighg333333?2   mediummixedlowscannedr5   )type
confidencetotal_pagespages_with_texttext_densityavg_text_per_pagetotal_text_lengthNunknown)rr   rs   rf   rt   ru   rv   rw   rx   )	
pdfplumberopenr]   pagesextract_textr   roundr   r   )rh   pdfrt   ru   rx   page	page_textrv   rw   pdf_typers   rS   r   r   r   detect_pdf_type   sZ   

('r   c              
   C   s   z`t | }|d dkr|d dkrdddddW S d}t| }|jD ]}| }|r2||d 7 }q$W d	   n1 s=w   Y  | }|rY||d |d t||d
 dW S dddddW S  ty{ } zdddt|dW  Y d	}~S d	}~ww )zEExtract text from PDF with enhanced error handling and type detectionrr   rq   rs   rl   r   z>PDF appears to be scanned/image-based with no extractable text)textr   rs   reason
Nru   )r   r   rs   text_lengthpages_processedz5No text could be extracted despite PDF type detectionrf   rp   )r   r   rs   rf   )	r   rz   r{   r|   r}   r   r]   r   r   )rh   pdf_infor   r   r   r   extracted_textrS   r   r   r   extract_text_from_pdf   sL   
	r   r   c                    sx   |    g d}g d}t fdd|D }t fdd|D }|dkr(dS |dkr.dS ||kr4dS ||kr:dS d	S )
z.Detect document type based on content analysis)ztitle searchtitlezcertificate of titlezland descriptionzregistered proprietorencumbranceszactivity in the last 125 dayszadministrative noticesztitle referencevolumefolio)	zapplication formzplanning permitzproposed userV   zfor what usezdevelopment or other matterzpermit requiredzonline form applicationzplanning permit applicationc                 3       | ]	}| v rd V  qdS rA   Nr   .0keyword
text_lowerr   r   	<genexpr>      z'detect_document_type.<locals>.<genexpr>c                 3   r   r   r   r   r   r   r   r      r   r5   title_searchapplication_formry   )lowersum)r   title_search_keywordsapplication_form_keywordstitle_search_matchesapplication_form_matchesr   r   r   detect_document_type  s   r   c              
   C   s~  | r|   s
ddiS d}t| |kr| d| d } t| }|dkr)tj| d}n|dkr4tj| d}ntj| d}t|\}}|rIdd	| iS |sOdd
iS z+|drat	
dd|}|d}t|}t|tspddiW S d|vrx||d< |W S  tjy } ztd|  td|  ddt| iW  Y d}~S d}~w ty } zddt| iW  Y d}~S d}~ww )z@Extract structured data from PDF text with proper error handlingrf   zEmpty or invalid text inputi@  NrW   r   )r   r   rX   rY   rZ   r[   r   zInvalid response formatdocument_typedetected_typez'JSON decoding error in PDF extraction: r\   z!Failed to parse OpenAI response: z)Unexpected error in PDF data processing: )r   r]   r   r   r^   r   r	   rU   r*   r_   r`   ra   rb   rc   r&   r'   rd   rN   r   r   )r   re   r   r7   r<   rf   r   rS   r   r   r   !extract_structured_data_from_text/  sF   




r   ri          @c                 C   s   g }t dt| |D ]_}| |||  }g }|D ]/}z||}	||	 W q tyG }
 ztd|
  |dt|
i W Y d}
~
qd}
~
ww || || t| k ritd|| d  d| d t| q
|S )	z9Process items in batches with delays to avoid rate limitsr   zError processing item: rf   NzProcessed batch rA   z
, waiting z seconds...)	rE   r]   appendr   rN   r   extendrO   rP   )r(   process_func
batch_sizedelay_between_batchesresultsibatchbatch_resultsitemresultrS   r   r   r   process_batch_with_delayj  s&   

r   c                 C   s  zddl m}m}m} |jj| | ddd\}}|jj||tj	||
dd|
dd|
d	d
|
dd|
dd|rE|
dsEdnd|rR|
drR|
dndd
}	|r|
ds|
di }
|jjd3i d|	d|
ddd|
ddd|
ddd|
ddd|
ddd|
ddd|
ddd|
ddd|
ddd|
ddd|
ddd |
d dd!|
d!dd"|
d"dd#|
d#dd$|

d$dd%|

d%dd&|

d&dd'|

d'dd(|

d(dd)|

d)dd*|

d*dd+|}d,| |	j|	jd-d.W S  |jy'   d/| d0d1 Y S  tyD } zd/| d2t| d1W  Y d}~S d}~ww )4a|  
    Save PDF extraction data to the database
    
    Args:
        application_id (str): The application ID
        pdf_path (str): Path to the PDF file
        extraction_result (dict): Result from extract_text_from_pdf function
        extracted_data (dict): Structured data extracted from the PDF
    
    Returns:
        dict: Result with success status and any errors
    r   )r   PDFDocumentExtractedPDFDataUnknown)application_idcouncil_name)r   defaultsr   r   ry   rs   rp   r   r   rf   successfailedN)
application	file_path	file_namer   r   rs   r   r   extraction_statuserror_messagedevelopment_summarypdf_documentland_descriptionr   registered_proprietorr   activity_last_125_daysadministrative_noticesproposed_userV   applicant_namecontact_namecontact_addresscontact_emailcontact_phoneapplicant_addressapplicant_emailapplicant_phonelot_sizesite_coverage
total_areaground_floor_areafirst_floor_areapossposraw_extracted_dataTzPDF data saved successfully)r   r   pdf_document_idr   rK   Fz!Development application not found)r   r   rf   zDatabase error: r   )shaoApp.modelsr   r   r   objectsget_or_createrI   ospathbasenamer)   idr   DoesNotExistr   r   )r   rh   extraction_resultextracted_datar   r   r   r   createdr   r   extracted_pdf_datarS   r   r   r   save_pdf_data_to_database  s   
	





"&
r   c                 C   s   zKt |}|ds$t| ||d|ddi}| |d|dd|dW S t|d }t| |||}| |d|d|d	|d
d|dd||d	W S  tyo } z| |dt|ddt| ddW  Y d}~S d}~ww )a  
    Process a PDF file and save the extracted data to the database
    
    Args:
        application_id (str): The application ID
        pdf_path (str): Path to the PDF file
    
    Returns:
        dict: Result with processing status and database save status
    r   rf   r   zNo text extracted from PDFF)r   rh   extraction_successr   database_saveTr   rs   r   r   r   )	r   rh   r   r   rs   r   r   r   r   zProcessing error: )r   rf   )r   rh   r   rf   r   N)r   r)   r   r   r   r   )r   rh   r   save_resultr   rS   r   r   r   process_and_save_pdf_data  sV   

	

r   )r5   r6   )ri   r   )-r   rb   pandasr   rO   rL   django.confr   django.httpr   django.views.decorators.httpr   r   r   openaishaoApp.promptsr   r   r   r	   r_   rz   r
   r   r   r   OpenAIOPENAI_API_KEYrF   r   r   r'   r4   r   r+   r,   rU   rg   r   r   r   r   r   r   r   r   r   r   r   <module>   s6    '(073$
;g