
    h 1                         S r SSKrSSKrSSKrSSKJr  SSKJr	  SSK
Jr  SSKJr  \R                  r\R                  " \5      r " S S\	R$                  5      r " S S\	R$                  5      rg)	)PdfTextPagePdfTextSearcher    N)PdfiumError)PDFIUM_INFOc                      ^  \ rS rSrSrU 4S jr\S 5       rSS jrSS jr	SS jr
S rSS	 jrS
 rSS jrS rSS jrSrU =r$ )r      z
Text page helper class.

Attributes:
    raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
    page (PdfPage): Reference to the page this textpage belongs to.
c                 X   > Xl         X l        [        TU ]  [        R
                  5        g N)rawpagesuper__init__pdfium_cFPDFText_ClosePage)selfr   r   	__class__s      S/var/www/html/shao/venv/lib/python3.13/site-packages/pypdfium2/_helpers/textpage.pyr   PdfTextPage.__init__   s     	445    c                     U R                   $ r
   )r   r   s    r   parentPdfTextPage.parent!   s    yyr   c                     X:  a  g[         R                  " X5      nUS:X  a  U R                  US-   X#S-   U5      $ [         R                  " X5      nUS:X  a  U R                  XS-
  X4S-   5      $ XVX44$ )Nr      )r   "FPDFText_GetTextIndexFromCharIndex_get_active_text_range)r   c_startc_end	l_passive	r_passivet_startt_ends          r   r   "PdfTextPage._get_active_text_range&   s|    ?==dLb=..wqy%1iXX;;DHB;..waVWKXXy33r   c                    X4S:X  a,  U(       d%  [         R                  " S5        U R                  US9$ US:X  a  U R                  5       U-
  nU R	                  XU-   S-
  5      nUS:X  a  gUu  pgpX-  nX(U	-   -  nUS-   U-
  n
S[
        R                  s=:  a  S	:  a  O  OU
S
-  n
U
S-  n
[        R                  " U
S
-  5      n[        R                  " U[        R                  " [        R                  5      5      n[        R                  " XX,5      nX:  d   SU
 SU 35       eUR                  SUS-
  S
-   R                  SUS9$ )a-  
Warning:
    .. versionchanged:: 4.28
       For various reasons, calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).

Extract text from a given range.

Parameters:
    index (int): Index of the first char to include.
    count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
    errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
Returns:
    str: The text in the range in question, or an empty string if no text was found.

Note:
    * The returned text's length does not have to match *count*, even if it will for most PDFs.
      This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
      This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
      Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
    * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
r   r   z]get_text_range() call with default params will be implicitly redirected to get_text_bounded()errorsr   r   r    i  i     zBuffer too small: z vs N	utf-16-le)warningswarnget_text_boundedcount_charsr   r   buildctypescreate_string_buffercastPOINTERc_ushortr   FPDFText_GetTextr   decode)r   indexcountr)   
force_thisactive_ranger#   r$   r!   r"   in_countbuffer
buffer_ptr	out_counts                 r   get_text_rangePdfTextPage.get_text_range6   sR   2 >W$ZMMyz(((77B;$$&.E 225+a-H1 0<,	Y&&7W$
 +##*d*MHA,,X\:[[)HI
--d5M	$T(:8*D&TT$zz*IaK?+22;v2NNr   c                    U R                   R                  5       nUc  US   nUc  US   nUc  US   nUc  US   nXXCU4n[        R                  " / UQSPSP76 nUS::  a  g[        R
                  " US-  5      n	[        R                  " U	[        R                  " [        R                  5      5      n
[        R                  " / UQU
PUP76   U	R                  R                  SUS9$ )	a^  
Extract text from given boundaries in PDF coordinates.
If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.

Parameters:
    errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
Returns:
    str: The text on the page area in question, or an empty string if no text was found.
Nr   r   r+      r*   r,   r(   )r   get_bboxr   FPDFText_GetBoundedTextr2   r3   r4   r5   r6   r   r8   )r   leftbottomrighttopr)   bboxargsn_charsr>   r?   s              r   r/   PdfTextPage.get_text_boundedq   s     yy!!#<7D>!WF=GE;q'CC/22BDB$BBa<,,Wq[9[[)HI
((D$D
DGDzz  V <<r   c                 T    [         R                  " U 5      nUS:X  a  [        S5      eU$ )z>
Returns:
    int: The number of characters on the text page.
r   zFailed to get character count.)r   FPDFText_CountCharsr   )r   rM   s     r   r0   PdfTextPage.count_chars   s,    
 ..t4b=>??r   c                 V    [         R                  " XU5      nUS:X  a  [        S5      eU$ )z
Parameters:
    index (int): Start character index.
    count (int): Character count to consider (defaults to -1 for all remaining).
Returns:
    int: The number of text rectangles in the given character range.
r   zFailed to count rectangles.)r   FPDFText_CountRectsr   )r   r9   r:   n_rectss       r   count_rectsPdfTextPage.count_rects   s.     ..tEBb=;<<r   c                 D    [         R                  " XX#U5      nUS:  a  gU$ )av  
Get the index of a character by position.

Parameters:
    x (float): Horizontal position (in PDF canvas units).
    y (float): Vertical position.
    x_tol (float): Horizontal tolerance.
    y_tol (float): Vertical tolerance.
Returns:
    int | None: The index of the character at or nearby the point (x, y).
    May be None if there is no character or an error occurred.
r   N)r   FPDFText_GetCharIndexAtPos)r   xyx_toly_tolr9   s         r   	get_indexPdfTextPage.get_index   s'     33DQuM19r   c                    U(       a^  [         R                  " 5       n[         R                  " XU5      nUR                  UR                  UR
                  UR                  4u  pVpxOr[        5       [        5       [        5       [        5       4u  pVpx[         R                  " XXWXh5      nUR                  UR                  UR                  UR                  4u  pVpxU(       d  [        S5      eXVXx4$ )a  
Get the bounding box of a single character.

Parameters:
    index (int):
        Index of the character to work with, in the page's character array.
    loose (bool):
        Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
Returns:
    Float values for left, bottom, right and top in PDF canvas units.
zFailed to get charbox.)r   FS_RECTFFPDFText_GetLooseCharBoxrG   rH   rI   rJ   c_doubleFPDFText_GetCharBoxvaluer   )	r   r9   looserectoklbrts	            r   get_charboxPdfTextPage.get_charbox   s     $$&D224EBDKKTXXEJA!Q!XZXZGJA!--d1FB!''177AGG;JA!677Qzr   c                    [        5       [        5       [        5       [        5       4u  p#pE[        R                  " XX%XC5      nU(       d  [        S5      eUR                  UR                  UR                  UR                  4$ )a4  
Get the bounding box of a text rectangle at the given index.
Note that :meth:`.count_rects` must be called once with default parameters
before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).

Returns:
    Float values for left, bottom, right and top in PDF canvas units.
zzFailed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.))rb   r   FPDFText_GetRectr   rd   )r   r9   rh   ri   rj   rk   rg   s          r   get_rectPdfTextPage.get_rect   si     ZXZC
a&&tA!?  [  \  \!''17733r   c                    [        U5      S:X  a  [        S5      eSnU(       a  U[        R                  -  nU(       a  U[        R                  -  nU(       a  U[        R
                  -  nUS-   R                  S5      n[        R                  " U[        R                  " [        R                  5      5      n[        R                  " XXb5      n	[        X5      n
U R                  U
5        U
$ )a  
Locate text on the page.

Parameters:
    text (str):
        The string to search for.
    index (int):
        Character index at which to start searching.
    match_case (bool):
        If True, the search will be case-specific (upper and lower letters treated as different characters).
    match_whole_word (bool):
        If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
    consecutive (bool):
        If False (the default), :meth:`.search` will skip past the current match to look for the next match.
        If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
Returns:
    PdfTextSearcher: A helper object to search text.
r   z#Text length must be greater than 0. r,   )len
ValueErrorr   FPDF_MATCHCASEFPDF_MATCHWHOLEWORDFPDF_CONSECUTIVEencoder2   r4   r5   r6   FPDFText_FindStartr   _add_kid)r   textr9   
match_casematch_whole_wordconsecutiveflagsenc_textenc_text_ptrraw_searchersearchers              r   searchPdfTextPage.search   s    ( t9>BCCX,,,EX111EX...E6M))+6{{8V^^FOO-LM224uT"<6hr   )r   r   )r   r   )r   r   ignoreF)NNNNr   r'   )F)r   FFF)__name__
__module____qualname____firstlineno____doc__r   propertyr   r   rA   r/   r0   rU   r]   rl   rp   r   __static_attributes____classcell__r   s   @r   r   r      sV    6
  4 8Ov=@&84 $ $r   r   c                   N   ^  \ rS rSrSrU 4S jr\S 5       rS rS r	S r
SrU =r$ )	r   i  z
Text searcher helper class.

Attributes:
    raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
    textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
c                 X   > Xl         X l        [        TU ]  [        R
                  5        g r
   )r   textpager   r   r   FPDFText_FindClose)r   r   r   r   s      r   r   PdfTextSearcher.__init__  s      445r   c                     U R                   $ r
   )r   r   s    r   r   PdfTextSearcher.parent  s    }}r   c                     U" U 5      nU(       d  g [         R                  " U 5      n[         R                  " U 5      nX44$ r
   )r   FPDFText_GetSchResultIndexFPDFText_GetSchCount)r   	find_funcrg   r9   r:   s        r   _get_occurrencePdfTextSearcher._get_occurrence#  s9    t_33D9--d3|r   c                 @    U R                  [        R                  5      $ )z
Returns:
    (int, int): Start character index and count of the next occurrence,
    or None if the last occurrence was passed.
)r   r   FPDFText_FindNextr   s    r   get_nextPdfTextSearcher.get_next+       ##H$>$>??r   c                 @    U R                  [        R                  5      $ )z
Returns:
    (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
    or None if the last occurrence was passed.
)r   r   FPDFText_FindPrevr   s    r   get_prevPdfTextSearcher.get_prev3  r   r   )r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r   r   r     s8    6
  @@ @r   r   )__all__r2   loggingr-   pypdfium2.rawr   r   pypdfium2.internalinternalpdfium_ipypdfium2._helpers.miscr   pypdfium2.versionr   rb   	getLoggerr   loggerAutoCloseabler   r    r   r   <module>r      sc    -      % / )??			8	$z8)) zz)@x-- )@r   