o
    Cji;                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
Z
d dlmZmZmZ dd Zdd Zd2d
dZdd Zdd Zd3ddZdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Z d+d, Z!d-d. Z"d/d0 Z#e$d1kre#  dS dS )4    N)DictListSetTuple)TEXT_INHIBIT_SPACESTEXT_PRESERVE_LIGATURESTEXT_PRESERVE_WHITESPACEc                 C   s   d|   ddS )Nz %s K   -)center)x r   h/var/www/html/Resume-Parser/resume-parser-inhouse/venv/lib/python3.10/site-packages/fitz_new/__main__.py<lambda>       r   c                 C   s   |d }|d }|dkr|  |S dd }t| |}t| |}	 |j|jkr;|j|j  kr4dkr;n n|jdksMtd||f  t| d}||S t|}||j d }}||S )zReturn image for a given XREF.r      c                 S   s"   | j jdkr| S ttj| }|S )N   )
colorspacenfitzPixmapcsRGB)pixtpixr   r   r   getimage    s   zrecoverpix.<locals>.getimagez&Warning: unsupported /SMask %i for %i:N)	extract_imager   r   irectalphar   print	set_alphasamples)docitemr   sr   pix1pix2r   r   r   r   
recoverpix   s"   
2
r&   FTc                 C   s   t | }|js|du rtd d}|js|S |r8||}|s&td |du r6t|dkr3dd nd |S td	|j  |S )
z!Open and authenticate a document.Tz$this command supports PDF files onlyzauthentication unsuccessful   zauthenticated as %sowneruserz'%s' requires a password)	r   openis_pdfsysexit
needs_passauthenticater   name)filenamepasswordshowpdfr!   rcr   r   r   	open_file=   s   



r7   c                 C   sJ   t dd |  D d }|  D ]\}}d|||f }t| qdS )zPrint a Python dictionary.c                 S   s   g | ]}t |qS r   )len).0kr   r   r   
<listcomp>R       zprint_dict.<locals>.<listcomp>r   z%s: %sN)maxkeysitemsrjustr   )r"   lr:   vmsgr   r   r   
print_dictP   s
   
rD   c                 C   s   t d|  | |}t | | |r=| }z|dd }|| }|dr+d}W n   d}Y t d|  t d t d d	S )
zPrint an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size.
    z%i 0 objz/Lengthr   z0 Runknownzstream
...%s bytes	endstreamendobjN)r   xref_objectxref_is_streamsplitindexendswith)r!   xrefxref_strtempidxsizer   r   r   
print_xrefY   s    


rR   pagec              	   C   sb  t |d }| d|dd} | d}g }t|D ]\}}|d }| rIt|}	d|	  kr5|k r?n n|t| n	td||f  qz|d\}
}t|
}
t|}W n   td||f  Y d|
  krr|k rn nd|  kr~|k sn td||f  |
|kr||
 q|
|k r|t	t
|
|d 7 }q|t	t
|
|d d	7 }q|S )
aK  Transform a page / xref specification into a list of integers.

    Args
    ----
        rlist: (str) the specification
        limit: maximum number, i.e. number of pages, number of objects
        what: a string to be used in error messages
    Returns
    -------
        A list of integers representing the specification.
    r   N  ,zbad %s specification at item %ir
   z%bad %s range specification at item %ir'   )strreplacerJ   	enumerate	isdecimalintappendr-   r.   listrange)rlistlimitwhatrT   	rlist_arrout_listseqr"   r   ii1i2r   r   r   get_listp   s6   
.
ri   c              
   C   s  t | j| jd}tj| jd }d}|dkr|d }d}t|d}|j}td| j|j	|
 d |||d |d	 f  |j}|d
krU| }td||dkrPdndf  | }|d
krctd|  t  | jr{ttd | }t|| t  | jrttd t|j t  | jrttd t| j|
 dd}|D ]
}t|| t  q| jrttd t| j|j	d }	|	D ]}
|
d }||}td|
  t|| t  q| jrttd t|  t  |  d S )NTi   KBi  MBr   z7'%s', pages: %i, objects: %i, %g %s, %s, encryption: %sformat
encryptionr   z5document contains %i root form fields and is %ssigned   znot rV   z#document contains %i embedded fileszPDF catalogzPDF metadatazobject informationrM   )rb   zpage informationzPage %i:zPDF trailer)r7   inputr3   ospathgetsizeroundmetadatar   
page_countxref_lengthis_form_pdfget_sigflagsembfile_countcatalogmycenterpdf_catalogrR   rD   xrefsri   pages	page_xreftrailerpdf_trailerclose)argsr!   rQ   flagmetar   r#   rM   xreflpagelpnor   r   r   r4      sz   






r4   c                 C   s   t | j| jdd}| j}d|}| js0|j| j| j| j	| j
| j| j| j|| j| j| jd d S t| j|jd }t }|D ]}|d }|j|||d q?|j| j| j| j	| j
| j| j| j|| j| j| jd |  |  d S )NTr5   keepnonezrc4-40zrc4-128zaes-128zaes-256)
garbagedeflateprettycleanasciilinearrm   owner_pwuser_pwpermissionsr   	from_pageto_page)r7   ro   r3   rm   rK   r~   saveoutputr   compressr   sanitizer   r   r)   r*   
permissionri   ru   r   r+   
insert_pdfr   )r   r!   rm   encryptr~   outdocr   r   r   r   r   r      sR   r   c           
      C   s   | j }t }|D ]T}|d}t|dkr|d nd}t|d |dd}d|dd }|r@td|dd |jd }nt	d|jd }|D ]}	|j
||	d |	d d qJ|  q	|j| jd	dd
 |  dS )z&Join pages from several PDF documents.rW   r   Nr   Tr   r(   r   r   )r   r   )ro   r   r+   rJ   r8   r7   joinri   ru   r_   r   r   r   r   )
r   doc_listr!   src_itemsrc_listr3   srcr~   	page_listrf   r   r   r   doc_join  s   
 
r   c           	      C   sF  t | j| jdd}| s| jr| j| jkrtd t | j| j}| j	r*t
| j	nt
 }t
| }|r?||ks>td n|}|sHtd |t
| @ }|r[tdt|  |D ]%}||}||}|j|||d |d |d	 d
 td||j	f  q]|  | jr| j| jkr|j| jdd n|  |  dS )z!Copy embedded files between PDFs.Tr   cannot save PDF incrementallyz%not all names are contained in sourceznothing to copyz2following names already exist in receiving PDF: %sr2   	ufilenamedescr2   r   r   zcopied entry '%s' from '%s'rn   r   N)r7   ro   r3   can_save_incrementallyr   r-   r.   source	pwdsourcer1   setembfile_namesrX   embfile_infoembfile_getembfile_addr   r   r   saveIncr)	r   r!   r   names	src_names	intersectr"   infobuffr   r   r   embedded_copy"  sF   




r   c                 C   s   t | j| jdd}| s| jr| j| jkrtd z|| j W n t	y4   td| j  Y nw | jr>| j| jkrC|
  n|j| jdd |  dS )zDelete an embedded file entry.Tr   r   no such embedded file '%s'r   r   N)r7   ro   r3   r   r   r-   r.   embfile_delr1   
ValueError	save_incrr   r   )r   r!   r   r   r   embedded_delJ  s   

r   c                 C   s   t | j| jdd}z|| j}|| j}W n ty(   td| j  Y nw | j	r/| j	n|d }t
|d}|| |  td| j|f  |  dS )z&Retrieve contents of an embedded file.Tr   r   r2   wbzsaved entry '%s' as '%s'N)r7   ro   r3   r   r1   r   r   r-   r.   r   r+   writer   r   )r   r!   streamdr2   r   r   r   r   embedded_get]  s   

r   c                 C   s
  t | j| jdd}| s| jdu s| j| jkrtd z|| j td| j  W n   Y t	j
| j
r@t	j
| j
sHtd| j
  t| j
d }| j
}|}| js[|}n| j}|j| j||||d | jrr| j| jkrw|  n|j| jd	d
 |  dS )zInsert a new embedded file.Tr   Nr   zentry '%s' already existszno such file '%s'rbr   rn   r   )r7   ro   r3   r   r   r-   r.   r   r1   rp   rq   existsisfiler+   readr   r   r   r   r   r   r!   r   r2   r   r   r   r   r   embedded_addm  s0   

r   c                 C   s6  t | j| jdd}| s| jdu s| j| jkrtd z|| j W n   td| j  Y | j	durNt
j	| j	rNt
j	| j	rNt| j	d }nd}| jrW| j}nd}| jr`| j}n	| jrg| j}nd}| jrp| j}nd}|j| j||||d | jdu s| j| jkr|  n|j| jdd	 |  dS )
z0Update contents or metadata of an embedded file.Tr   Nr   r   r   r   rn   r   )r7   ro   r3   r   r   r-   r.   r   r1   rq   rp   r   r   r+   r   r2   r   r   embfile_updr   r   r   r   r   r   r   embedded_upd  sB   


r   c                 C   s  t | j| jdd}| }| jdurE| j|vr td| j  n%t  tdt|t|dkr0dndf  t  t	|
| j t  dS |sPtd	|j  dS t|dkr`d
|jt|f }nd|j }t| t  |D ]}| jsxt| qn|
|}t	|
| t  qn|  dS )zList embedded files.Tr   Nr   z!printing 1 of %i embedded file%s:r   r#   rV   z'%s' contains no embedded filesz-'%s' contains the following %i embedded filesz)'%s' contains the following embedded file)r7   ro   r3   r   r1   r-   r.   r   r8   rD   r   detailr   )r   r!   r   rC   r1   _r   r   r   embedded_list  s>   



r   c              
   C   s\  | j s| jstd t| j| jdd}| jr!t| j|j	d }nt
d|j	d }| js4tjtj}n| j}tj|rCtj|sJtd|  t }t }|D ]}| j r||d }|D ]B}|d }	|	|vr||	 ||	\}
}}}|dks~|sq`tj||
dd	 d	|	 d
| }t|d}|| |  d}q`| jr||d }|D ]Z}|d }	|	|vr
||	 t||}t|tu r|d }|d }tj|d|	|f }t|d}|| |  qtj|d|	 }|jj dk r|nt!"t!j#|}|$| qqR| j rt%dt&||f  | jr(t%dt&||f  |  dS )z)Extract images and / or fonts from a PDF.z"neither fonts nor images requestedTr   r   z"output directory %s does not existr   zn/arU   r
   .r   Nextimagez	img-%i.%sz
img-%i.pngr   zsaved %i fonts to '%s'zsaved %i images to '%s')'fontsimagesr-   r.   r7   ro   r3   r~   ri   ru   r_   r   rp   rq   abspathcurdirr   isdirr   get_page_fontsaddextract_fontr   rY   r+   r   r   get_page_imagesr&   typedictr   r   r   r   r   r   r   r8   )r   r!   r~   out_dir
font_xrefsimage_xrefsr   itemlistr"   rM   fontnamer   r   bufferoutnameoutfiler   imgdatar%   r   r   r   extract_objects  sr   










r   c           	      C   sX   |rdnt dg}| jd|d}|s|s|| d S ||jddd || d S )N   
   textflagsutf8surrogatepasserrors)bytesget_textr   encode)	rS   textoutGRIDfontsize
noformfeed
skip_emptyr   eopr   r   r   r   page_simple'  s   

r   c           
      C   sz   |rdnt dg}| jd|d}|g kr|s|| d S |jdd d |D ]}	||	d jd	d
d q'|| d S )Nr   r   blocksr   c                 S   s   | d | d fS )Nrn   r   r   )br   r   r   r   :  r   z page_blocksort.<locals>.<lambda>keyr   r   r   r   )r   r   r   sortr   )
rS   r   r   r   r   r   r   r   r   r   r   r   r   page_blocksort3  s   

r  c           !         s@  |rdnt dg}dtt dtdtfdd}dtt dtfd	d
}	dtt dtjf fdd}
dtdtfdddd }| jd|dd }|
|| \}}}}}|g kr]|s[|	| d S |	||}|j
dd d i }|D ]}|\}}}}|||}||g }|| |||< qnt| }|
  || }i }|D ]/}|| }t|}|dk rd||< qdd |D }|
  t|}||k r|}|d ||< q||d |d   |t|  d }|d }|	d |D ].}||k r|	d ||7 }||k s||||| || } |	| d  jd!d"d# || }q|	| d S )$Nr   r   valuesvaluereturnc                 S   s,   t | |}|r| |d  S td|| f )zFind the right row coordinate.

        Args:
            values: (list) y-coordinates of rows.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-ccordinate of appropriate line for value.
        r   zLine for %g not found in %s)bisectbisect_rightRuntimeError)r  r  rf   r   r   r   find_line_indexE  s   	z$page_layout.<locals>.find_line_indexrowsc                 S   sJ   t | } |   | d g}| dd  D ]}||d | kr"|| q|S )Nr   r   r'   )r^   r   r]   )r	  r   nrowshr   r   r   curate_rowsT  s   

z page_layout.<locals>.curate_rowsr   rS   c              
      s  t  }|jj}|jj}|}|}d}g }| D ]}	|	d D ]}
|
d dkr$q|
d \}}}}|dk s6||jjkr7q|| }||krA|}|
d D ]}|d  krNqE|d D ]z}|d \}}}}|| }|d	 \}}tt|}|| |d
 }||kr|dkr|}||k r|}|dkr|g kr|d \}}}}||kr|tdkr|| }n|dkrtd}n|dkrtd}n|}||||f|d< qR|||||f qRqEqq|||||fS )Nr   linesdir)r   r   bboxspansrQ   charsorigincrU   r'      rf     rA     )	r   rectwidthheightr\   rs   r   chrr]   )r   rS   r	  
page_widthpage_height	rowheightleftrightr  blocklinex0y0x1y1r  spanr  r   cwidthoxoychold_chold_oxold_oy
old_cwidthligr   joinligaturer   r   process_blocks]  s`   


)z#page_layout.<locals>.process_blocksr/  c                 S   st   | dkrt dS | dkrt dS | dkrt dS | dkr t dS | d	kr(t d
S | dkr0t dS | dkr8t dS | S )zReturn ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00)
        ffr  fii  fli  ffir  fflr  fti  sti  )r  )r/  r   r   r   r1    s   	z!page_layout.<locals>.joinligaturec                 S   s   d}d}d}d}|t jkrtd| |D ]`}|\}	}
}}|
|  }
|
| }||	kr2|
| |d kr2q|	dkr?||
 | dkr?q|	}|
|| k rP||	7 }|}|
}q|	dkrUqt|
| t| }|
|krm|dkrm|d| 7 }||	7 }|}|
}q| S )a  Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) avg width of one character in any font in use.
            minslot: (float) min width for the characters in this line.
            chars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        rV   r   z%program error: minslot too small = %gg?rU   g?r   )r   EPSILONr  r\   r8   rstrip)r  slotminslotlcharsr   old_charold_x1r,  r  charr(  r   r'  r$  deltar   r   r   make_textline  s:   
z"page_layout.<locals>.make_textlinerawdictr   c                 S   s   | d S )Nr   r   )r  r   r   r   r     s    zpage_layout.<locals>.<lambda>r   r(   r   c                 S   s   g | ]}|d  qS )rn   r   )r9   r  r   r   r   r;     r<   zpage_layout.<locals>.<listcomp>r   r'   g333333?
r   r   r   )r   r   r\   r   r   r   PagerX   r   r   r   getr]   r^   r>   r8   
statisticsmedianr   )!rS   r   r   r   r   r   r   r   r  r  r2  rC  r   r  r	  r  r  r  r  r  r   r)  yr>  r>   r<  minslotsr:   ccountwidths	this_slotrowposr   r   r0  r   page_layoutA  sb   	46






$


rP  c              
   C   s   t | j| jdd}t| j|jd }| j}|d kr&tj	|j
\}}|d }t|d}ttB }| jr6|tN }| jr=|tN }| jrD|tN }tttd}|D ]}	||	d  }
|| j |
|| j| j| j| j|d qL|  d S )NFr   r   z.txtr   simpler   layoutr   )r7   ro   r3   ri   r~   ru   r   rp   rq   splitextr1   r+   r   r   convert_whitenoligaturesextra_spacesr   r   r  rP  modegridr   r   r   r   )r   r!   r   r   r2   r   r   r   funcr   rS   r   r   r   gettext"  s<   

r[  c                  C   s  t jdtdd} | jddd}|jdtdd	}|jd
tdd |jddd |jdddd |jdddd |jdddd |jdtdd |jdtdd |jtd |jdtdd	}|jd
tdd |jdtd d |jddd |jd!d"d#d$d% |jd&td'd |jd(td)d |jd*t	d+t
d,d-d. |jd/dd0d1d2 |jd3dd0d4d2 |jd5dd0d6d2 |jd7t	d8d9d: |jd;dd0d<d2 |jd=dd0d>d2 |jdd?d |jtd |jd@tdAdBdC}|jd
dDdEdF |jdGdHdIdJ |jtd |jdKtdLd	}|jd
tdd |jdMddNd |jdOddPd |jdGdQd |jddd |jdtdRd |jtd |jdStdTd	}|jd
dd |jdUdVd |jdWddXd |jddd |jtd |jdYtdZd	}|jd
dd |jddd |jdGd[d |jdUdHd\dJ |jd]dHd^dJ |jd_d`d |jtd |jdatdbd	}|jd
dd |jddd |jdGd[d |jdUdHdcdJ |jtd |jddtdedfdC}|jd
dd |jdUdHdgdJ |jddd |jdGdhd |jd]did |jdjdkd |jdldmd |jd_dnd |jtd |jdotdpd	}	|	jd
tdd |	jdUdHdgdJ |	jddd |	jdGdqd |	jtd |jdrtdsd	}
|
jd
tdtd |
jddud |
jdGdvd |
jdwdHdxdJ |
jdydzd |
jdUdDd{dF |
jtd |jd|td}d	}|jd
td~d |jddd |jdtdddd. |jdtddd |jdddd0d |jdddd0d |jdddd0d |jdddd0d |jdddd0d |jdGdd |jdtddd |jdtddd |jtd |  }t|ds|   dS || dS )zDefine command configurations.r   zBasic PyMuPDF Functions)progdescriptionSubcommandsz/Enter 'command -h' for subcommand specific help)titlehelpr4   zdisplay PDF information)r]  ro   zPDF filename)r   r`  z	-passwordr3   )r`  z-catalog
store_truezshow PDF catalog)actionr`  z-trailerzshow PDF trailerz	-metadatazshow PDF metadataz-xrefsz&show selected objects, format: 1,5-7,Nz-pagesz'show selected pages, format: 1,5-7,50-N)rZ  r   z.optimize PDF, or create sub-PDF if pages givenr   zoutput PDF filenamez-encryptionzencryption methodr   r   )r`  choicesdefaultz-ownerzowner passwordz-userzuser passwordz-garbagezgarbage collection level   r   )r   r`  rc  rd  z	-compressFzcompress (deflate) output)rb  rd  r`  z-asciizASCII encode binary dataz-linearzformat for fast web displayz-permissionr'   zinteger with permission levels)r   rd  r`  z	-sanitizezsanitize / clean contentsz-prettyzprettify PDF structurez/output selected pages pages, format: 1,5-7,50-Nr   zjoin PDF documentsz3specify each input as 'filename[,password[,pages]]')r]  epilog*zinput filenames)nargsr`  z-outputTzoutput filename)requiredr`  extractz extract images and fonts to diskz-imageszextract imagesz-fontszextract fontsz-folder to receive output, defaults to currentz-consider these pages only, format: 1,5-7,50-Nz
embed-infozlist embedded filesz-namezif given, report only this onez-detailzdetail informationz	embed-addzadd embedded filez-output PDF filename, incremental save if nonezname of new entryz-pathzpath to data for new entryz-desczdescription of new entryz	embed-delzdelete embedded filezname of entry to deletez	embed-updzupdate embedded filez*except '-name' all parameters are optionalzname of entryz-Output PDF filename, incremental save if nonezpath to new data for entryz	-filenameznew filename to store in entryz
-ufilenamez&new unicode filename to store in entryz!new description to store in entryzembed-extractzextract embedded file to diskz'output filename, default is stored namez
embed-copyz copy embedded files between PDFszPDF to receive embedded fileszpassword of inputz2output PDF, incremental save to 'input' if omittedz-sourcezcopy embedded files from herez
-pwdsourcezpassword of 'source' PDFzrestrict copy to these entriesr[  z(extract text in various formatting modeszinput document filenamezpassword for input documentz-modez-mode: simple, block sort, or layout (default)rQ  rS  z select pages, format: 1,5-7,50-Nz1-N)r   r`  rd  z-noligaturesz*expand ligature characters (default False))rb  r`  rd  z-convert-whitez6convert whitespace characters to white (default False)z-extra-spacesz%fill gaps with spaces (default False)z-noformfeedz-write linefeeds, no formfeeds (default False)z-skip-emptyz+suppress pages with no text (default False)z3store text in this file (default inputfilename.txt)z-gridz+merge lines if closer than this (default 2)r(   z	-fontsizez4only include text with a larger fontsize (default 3)rn   rZ  N)argparseArgumentParserr{   add_subparsers
add_parseradd_argumentrX   set_defaultsr4   r\   r_   r   r   r   r   r   r   r   r   r   floatr[  
parse_argshasattr
print_helprZ  )parsersubpsps_showps_cleanps_join
ps_extractps_embed_addps_embed_delps_embed_updps_embed_extractps_embed_copy
ps_gettextr   r   r   r   mainE  s  r  __main__)FT)rS   )%rk  r  rp   r-   rH  typingr   r   r   r   r   	fitz.fitzr   r   r   r{   r&   r7   rD   rR   ri   r4   r   r   r   r   r   r   r   r   r   r   r  rP  r[  r  __name__r   r   r   r   <module>   sF   
$	
/?/(!1$D b#  ,
