
    @h1                    @   S SK Jr  S SKrS SKrS SKJrJr  S SKJrJ	r	J
r
  S SKJr  S SKJr  S SKJr  S SKJrJrJrJrJrJr  S S	KJrJr  \R6                  " \5      r\" S
SS9r " S S\\5      r " S S\5      r  " S S\!\5      r"\" SS9 " S S5      5       r#SS jr$g)    )annotationsN)ABCabstractmethod)
CollectionIterableSequence)Set)	dataclass)Enum)AnyCallableLiteralOptionalTypeVarUnion)BaseDocumentTransformerDocumentTSTextSplitter)boundc                     \ rS rSrSrSS\SSS4             SS jjr\SS j5       r S     SS
 jjr	SS jr
SS jrSS jr\SS j5       r\SS	\" 5       S4             SS jj5       r      SS jrSrg	)r      z)Interface for splitting text into chunks.i     FTc                    US::  a  SU 3n[        U5      eUS:  a  SU 3n[        U5      eX!:  a  SU SU S3n[        U5      eXl        X l        X0l        X@l        XPl        X`l        g)a  Create a new TextSplitter.

Args:
    chunk_size: Maximum size of chunks to return
    chunk_overlap: Overlap in characters between chunks
    length_function: Function that measures the length of given chunks
    keep_separator: Whether to keep the separator and where to place it
                    in each corresponding chunk (True='start')
    add_start_index: If `True`, includes chunk's start index in metadata
    strip_whitespace: If `True`, strips whitespace from the start and end of
                      every document
r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)self
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespacemsgs           U/var/www/html/shao/venv/lib/python3.13/site-packages/langchain_text_splitters/base.py__init__TextSplitter.__init__   s    * ?0=CS/!14]ODCS/!%.}o ><46  S/!%+ /- /!1    c                    g)z$Split text into multiple components.N )r"   texts     r*   
split_textTextSplitter.split_textE   s    r-   Nc           	        U=(       d    0 /[        U5      -  n/ n[        U5       H  u  pVSnSnU R                  U5       H  n	[        R                  " X5   5      n
U R
                  (       a<  Xx-   U R                  -
  nUR                  U	[        SU5      5      nXzS'   [        U	5      n[        XS9nUR                  U5        M     M     U$ )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater1   copydeepcopyr    r   findmaxr   append)r"   texts	metadatas
_metadatas	documentsir0   indexprevious_chunk_lenchunkr6   offsetnew_docs                r*   create_documentsTextSplitter.create_documentsI   s     32$U"3
	 'GAE!".==7(("7$:M:MMF IIeSF^<E.3]+),U&"I  ) / ( r-   c                    / / p2U H9  nUR                  UR                  5        UR                  UR                  5        M;     U R                  X#S9$ )zSplit documents.)r?   )r=   r5   r6   rH   )r"   rA   r>   r?   docs        r*   split_documentsTextSplitter.split_documents]   sM    ryCLL))*S\\*  $$U$@@r-   c                x    UR                  U5      nU R                  (       a  UR                  5       nUS:X  a  g U$ )N )joinr!   strip)r"   docs	separatorr0   s       r*   
_join_docsTextSplitter._join_docse   s3    ~~d#!!::<D2:r-   c                x   U R                  U5      n/ n/ nSnU GHv  nU R                  U5      nXh-   [        U5      S:  a  UOS-   U R                  :  Ga  X`R                  :  a%  [        R	                  SU SU R                   35        [        U5      S:  a  U R                  XR5      n	U	b  UR                  U	5        X`R                  :  d,  Xh-   [        U5      S:  a  UOS-   U R                  :  at  US:  an  X`R                  US   5      [        U5      S:  a  UOS-   -  nUSS  nX`R                  :  a  M@  Xh-   [        U5      S:  a  UOS-   U R                  :  a  US:  a  Mn  UR                  U5        Xh[        U5      S:  a  UOS-   -  nGMy     U R                  XR5      n	U	b  UR                  U	5        U$ )Nr   zCreated a chunk of size z%, which is longer than the specified    )r   r7   r   loggerwarningrT   r=   r   )
r"   splitsrS   separator_lenrR   current_doctotald_lenrK   s
             r*   _merge_splitsTextSplitter._merge_splitsm   s    --i8!#A((+D[1AA1E1M""# +++NN25' :>>B>N>N=OQ {#a'//+ACC(  "5"55[9IA9MSTU**+!AI!6!6{1~!F-0-=-AMq"  '2!"o  "5"55[9IA9MSTU**+!AI q!c+.>.B]JJE9 : ook5?KKr-   c                   ^  SSK Jn  [        TU5      (       d  Sn[        U5      eSU4S jjnU " SSU0UD6$ ! [         a    Sn[        U5      ef = f)	z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBasec                8   > [        TR                  U 5      5      $ N)r7   tokenizer0   	tokenizers    r*   _huggingface_tokenizer_lengthNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    9--d344r-   z`Could not import transformers python package. Please install it with `pip install transformers`.r%   r0   strreturnintr/   )$transformers.tokenization_utils_baserc   
isinstancer   ImportError)clsrh   kwargsrc   r)   ri   s    `    r*   from_huggingface_tokenizer'TextSplitter.from_huggingface_tokenizer   sp    	"Ti)@AAW  !o%5 K#@KFKK  	"E  S/!	"s	   ,9 Agpt2allc                   ^^^
  SSK nUb  UR                  U5      m
OUR                  U5      m
SUUU
4S jjn[	        U [
        5      (       a  UUTTS.n	0 UEU	EnU " SSU0UD6$ ! [         a    Sn[        U5      ef = f)	z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.c                8   > [        TR                  U TTS95      $ N)allowed_specialdisallowed_special)r7   encode)r0   r{   r|   encs    r*   _tiktoken_encoder=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s*    

$3'9   r-   )encoding_name
model_namer{   r|   r%   rk   r/   )tiktokenrq   encoding_for_modelget_encoding
issubclassTokenTextSplitter)rr   r   r   r{   r|   rs   r   r)   r   extra_kwargsr~   s      ``     @r*   from_tiktoken_encoder"TextSplitter.from_tiktoken_encoder   s    	# !--j9C''6C	 	 c,--!.(#2&8	L 0/,/F?#4????  	#A 
 c""	#s   A% %A=c                6    U R                  [        U5      5      $ )z2Transform sequence of documents by splitting them.)rL   list)r"   rA   rs   s      r*   transform_documents TextSplitter.transform_documents   s     ##DO44r-   )r    r   r   r   r   r!   )r#   rn   r$   rn   r%   zCallable[[str], int]r&   z$Union[bool, Literal['start', 'end']]r'   boolr(   r   rm   Noner0   rl   rm   	list[str]re   )r>   r   r?   zOptional[list[dict[Any, Any]]]rm   list[Document])rA   zIterable[Document]rm   r   )rR   r   rS   rl   rm   Optional[str])rZ   zIterable[str]rS   rl   rm   r   )rh   r   rs   r   rm   r   )rr   ztype[TS]r   rl   r   r   r{   'Union[Literal['all'], AbstractSet[str]]r|   &Union[Literal['all'], Collection[str]]rs   r   rm   r   )rA   Sequence[Document]rs   r   rm   r   )__name__
__module____qualname____firstlineno____doc__r7   r+   r   r1   rH   rL   rT   r`   classmethodrt   setr   r   __static_attributes__r/   r-   r*   r   r      sQ   3  03?D %!%&2&2 &2 .	&2
 =&2 &2 &2 
&2P 3 3 MQ+I	(A(T L L,  $$(CF5EJ*@*@*@ "*@ A	*@
 C*@ *@ 
*@ *@X5+57:5	5r-   c                  f   ^  \ rS rSrSrSS\" 5       S4           S	U 4S jjjrS
S jrSrU =r	$ )r      z/Splitting text to tokens using model tokenizer.rv   Nrw   c                   > [         T	U ]  " S0 UD6   SSKnUb  UR	                  U5      nOUR                  U5      nXl        X0l        X@l        g! [         a    Sn[        U5      ef = f)zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r/   )	superr+   r   rq   r   r   
_tokenizer_allowed_special_disallowed_special)
r"   r   r   r{   r|   rs   r   r)   r~   	__class__s
            r*   r+   TokenTextSplitter.__init__   s     	"6"	# !--j9C''6C /#5   	#A 
 c""	#s   A A(c                   ^  SU 4S jjn[        T R                  T R                  T R                  R                  US9n[        XS9$ )a/  Splits the input text into smaller chunks based on tokenization.

This method uses a custom tokenizer configuration to encode the input text
into tokens, processes the tokens in chunks of a specified size with overlap,
and decodes them back into text chunks. The splitting is performed using the
`split_text_on_tokens` function.

Args:
    text (str): The input text to be split into smaller chunks.

Returns:
    List[str]: A list of text chunks, where each chunk is derived from a portion
    of the input text based on the tokenization and chunking rules.
c                b   > TR                   R                  U TR                  TR                  S9$ rz   )r   r}   r   r   )_textr"   s    r*   _encode-TokenTextSplitter.split_text.<locals>._encode  s4    ??)) $ 5 5#'#;#; *  r-   )r$   tokens_per_chunkdecoder}   rg   )r   rl   rm   z	list[int])	Tokenizerr   r   r   r   split_text_on_tokens)r"   r0   r   rh   s   `   r*   r1   TokenTextSplitter.split_text  sC     	 --!--??))	
	 $CCr-   )r   r   r   )r   rl   r   r   r{   r   r|   r   rs   r   rm   r   r   )
r   r   r   r   r   r   r+   r1   r   __classcell__)r   s   @r*   r   r      sh    9 $$(CF5EJ66 "6 A	6
 C6 6 
6 68D Dr-   r   c                      \ rS rSrSrSrSrSrSrSr	Sr
S	rS
rSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSr g) Languagei"  z"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6r/   N)!r   r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r   r/   r-   r*   r   r   "  s    ,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJ!Lr-   r   T)frozenc                  H    \ rS rSr% SrS\S'    S\S'    S\S'    S\S	'   S
rg)r   iB  zTokenizer data class.rn   r$   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]r}   r/   N)r   r   r   r   r   __annotations__r   r/   r-   r*   r   r   B  s)    *,&&=&&=r-   r   c                   / nUR                  U 5      nSn[        XAR                  -   [        U5      5      nX4U nU[        U5      :  a  UR	                  UR                  U5      5        U[        U5      :X  a   U$ XAR                  UR                  -
  -  n[        XAR                  -   [        U5      5      nX4U nU[        U5      :  a  M  U$ )z6Split incoming text and return chunks using tokenizer.r   )r}   minr   r7   r=   r   r$   )r0   rh   rZ   	input_ids	start_idxcur_idx	chunk_idss          r*   r   r   P  s    F  &II)888#i.IGG,I
c)n
$i&&y12c)n$ M 	//)2I2III	i"<"<<c)nM0	 c)n
$ Mr-   )r0   rl   rh   r   rm   r   )%
__future__r   r9   loggingabcr   r   collections.abcr   r   r   r	   AbstractSetdataclassesr
   enumr   typingr   r   r   r   r   r   langchain_core.documentsr   r   	getLoggerr   rX   r   r   r   rl   r   r   r   r/   r-   r*   <module>r      s    "   # : : . !   G			8	$T(E5*C E5P=D =D@"sD "@ $
> 
> 
>r-   