from __future__ import annotations

import heapq
import logging
import queue
from typing import TYPE_CHECKING, Callable

import numpy as np
import torch
from torch import Tensor
from tqdm.autonotebook import tqdm

from .similarity import cos_sim
from .tensor import normalize_embeddings

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from sentence_transformers.SentenceTransformer import SentenceTransformer


def paraphrase_mining(
    model: SentenceTransformer,
    sentences: list[str],
    show_progress_bar: bool = False,
    batch_size: int = 32,
    query_chunk_size: int = 5000,
    corpus_chunk_size: int = 100000,
    max_pairs: int = 500000,
    top_k: int = 100,
    score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
    truncate_dim: int | None = None,
    prompt_name: str | None = None,
    prompt: str | None = None,
) -> list[list[float | int]]:
    """
Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
other sentences and returns a list with the pairs that have the highest cosine similarity score.

Args:
    model (SentenceTransformer): SentenceTransformer model for embedding computation
    sentences (List[str]): A list of strings (texts or sentences)
    show_progress_bar (bool, optional): Plotting of a progress bar. Defaults to False.
    batch_size (int, optional): Number of texts that are encoded simultaneously by the model. Defaults to 32.
    query_chunk_size (int, optional): Search for the most similar pairs for #query_chunk_size sentences at a time. Decrease to lower the memory footprint (increases run-time). Defaults to 5000.
    corpus_chunk_size (int, optional): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease to lower the memory footprint (increases run-time). Defaults to 100000.
    max_pairs (int, optional): Maximal number of text pairs returned. Defaults to 500000.
    top_k (int, optional): For each sentence, we retrieve up to top_k other sentences. Defaults to 100.
    score_function (Callable[[Tensor, Tensor], Tensor], optional): Function for computing scores. By default, cosine similarity. Defaults to cos_sim.
    truncate_dim (int, optional): The dimension to truncate sentence embeddings to. If None, uses the model's ones. Defaults to None.
    prompt_name (Optional[str], optional): The name of a predefined prompt to use when encoding the sentence.
        It must match a key in the model's `prompts` dictionary, which can be set during model initialization
        or loaded from the model configuration.

        Ignored if `prompt` is provided. Defaults to None.

    prompt (Optional[str], optional): A raw prompt string to prepend directly to the input sentence during encoding.

        For instance, `prompt="query: "` transforms the sentence "What is the capital of France?" into:
        "query: What is the capital of France?". Use this to override the prompt logic entirely and supply your own prefix.
        This takes precedence over `prompt_name`. Defaults to None.
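
Example:
    A minimal usage sketch; the model name below is only an illustrative choice::

        from sentence_transformers import SentenceTransformer
        from sentence_transformers.util import paraphrase_mining

        model = SentenceTransformer("all-MiniLM-L6-v2")
        sentences = [
            "The cat sits outside",
            "A man is playing guitar",
            "The new movie is awesome",
            "The cat plays in the garden",
        ]
        pairs = paraphrase_mining(model, sentences)
        score, i, j = pairs[0]
        print(f"{sentences[i]} <-> {sentences[j]}: {score:.4f}")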

Returns:
    List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    """
    embeddings = model.encode(
        sentences,
        show_progress_bar=show_progress_bar,
        batch_size=batch_size,
        convert_to_tensor=True,
        truncate_dim=truncate_dim,
        prompt_name=prompt_name,
        prompt=prompt,
    )

    return paraphrase_mining_embeddings(
        embeddings,
        query_chunk_size=query_chunk_size,
        corpus_chunk_size=corpus_chunk_size,
        max_pairs=max_pairs,
        top_k=top_k,
        score_function=score_function,
    )


def paraphrase_mining_embeddings(
    embeddings: Tensor,
    query_chunk_size: int = 5000,
    corpus_chunk_size: int = 100000,
    max_pairs: int = 500000,
    top_k: int = 100,
    score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
) -> list[list[float | int]]:
    """
Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
other sentences and returns a list with the pairs that have the highest cosine similarity score.

Args:
    embeddings (Tensor): A tensor with the embeddings
    query_chunk_size (int): Search for the most similar pairs for #query_chunk_size sentences at a time. Decrease to lower the memory footprint (increases run-time).
    corpus_chunk_size (int): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease to lower the memory footprint (increases run-time).
    max_pairs (int): Maximal number of text pairs returned.
    top_k (int): For each sentence, we retrieve up to top_k other sentences
    score_function (Callable[[Tensor, Tensor], Tensor]): Function for computing scores. By default, cosine similarity.
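
Example:
    A minimal sketch that computes the embeddings with a SentenceTransformer model first
    (the model name is only an illustrative choice)::

        from sentence_transformers import SentenceTransformer
        from sentence_transformers.util import paraphrase_mining_embeddings

        model = SentenceTransformer("all-MiniLM-L6-v2")
        sentences = [
            "The cat sits outside",
            "The cat plays in the garden",
            "A man is playing guitar",
        ]
        embeddings = model.encode(sentences, convert_to_tensor=True)
        pairs = paraphrase_mining_embeddings(embeddings)
        # Each entry is [score, id1, id2], sorted by decreasing score
        print(pairs[0])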

Returns:
    List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    """
    top_k += 1  # A sentence has the highest similarity to itself. Increase +1 as we are interested in distinct pairs

    # Mine for duplicates
    pairs = queue.PriorityQueue()
    min_score = -1
    num_added = 0

    for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size):
        for query_start_idx in range(0, len(embeddings), query_chunk_size):
            scores = score_function(
                embeddings[query_start_idx : query_start_idx + query_chunk_size],
                embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size],
            )

            scores_top_k_values, scores_top_k_idx = torch.topk(
                scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False
            )
            scores_top_k_values = scores_top_k_values.cpu().tolist()
            scores_top_k_idx = scores_top_k_idx.cpu().tolist()

            for query_itr in range(len(scores)):
                for top_k_idx, corpus_itr in enumerate(scores_top_k_idx[query_itr]):
                    i = query_start_idx + query_itr
                    j = corpus_start_idx + corpus_itr

                    if i != j and scores_top_k_values[query_itr][top_k_idx] > min_score:
                        pairs.put((scores_top_k_values[query_itr][top_k_idx], i, j))
                        num_added += 1

                        if num_added >= max_pairs:
                            entry = pairs.get()
                            min_score = entry[0]

    # Get the pairs
    added_pairs = set()  # Used for duplicate detection
    pairs_list = []
    while not pairs.empty():
        score, i, j = pairs.get()
        sorted_i, sorted_j = sorted([i, j])

        if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
            added_pairs.add((sorted_i, sorted_j))
            pairs_list.append([score, i, j])

    # Highest scores first
    pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
    return pairs_list


def information_retrieval(*args, **kwargs) -> list[list[dict[str, int | float]]]:
    """This function is deprecated. Use semantic_search instead"""
    return semantic_search(*args, **kwargs)


def semantic_search(
    query_embeddings: Tensor,
    corpus_embeddings: Tensor,
    query_chunk_size: int = 100,
    corpus_chunk_size: int = 500000,
    top_k: int = 10,
    score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
) -> list[list[dict[str, int | float]]]:
    """
By default, this function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
It can be used for Information Retrieval / Semantic Search for corpora up to about 1 million entries.

Args:
    query_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the query embeddings. Can be a sparse tensor.
    corpus_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the corpus embeddings. Can be a sparse tensor.
    query_chunk_size (int, optional): Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. Defaults to 100.
    corpus_chunk_size (int, optional): Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. Defaults to 500000.
    top_k (int, optional): Retrieve top k matching entries. Defaults to 10.
    score_function (Callable[[:class:`~torch.Tensor`, :class:`~torch.Tensor`], :class:`~torch.Tensor`], optional): Function for computing scores. By default, cosine similarity.
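
Example:
    A minimal sketch with random embeddings standing in for real query and corpus encodings::

        import torch

        from sentence_transformers.util import semantic_search

        query_embeddings = torch.rand(2, 128)
        corpus_embeddings = torch.rand(100, 128)
        hits = semantic_search(query_embeddings, corpus_embeddings, top_k=3)
        # hits[0] contains the top 3 corpus matches for the first query,
        # e.g. [{'corpus_id': 17, 'score': 0.79}, ...]
        print(hits[0])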

Returns:
    List[List[Dict[str, Union[int, float]]]]: A list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
    """
    if isinstance(query_embeddings, (np.ndarray, np.generic)):
        query_embeddings = torch.from_numpy(query_embeddings)
    elif isinstance(query_embeddings, list):
        query_embeddings = torch.stack(query_embeddings)

    if len(query_embeddings.shape) == 1:
        query_embeddings = query_embeddings.unsqueeze(0)

    if isinstance(corpus_embeddings, (np.ndarray, np.generic)):
        corpus_embeddings = torch.from_numpy(corpus_embeddings)
    elif isinstance(corpus_embeddings, list):
        corpus_embeddings = torch.stack(corpus_embeddings)

    # Check that corpus and queries are on the same device
    if corpus_embeddings.device != query_embeddings.device:
        query_embeddings = query_embeddings.to(corpus_embeddings.device)

    queries_result_list = [[] for _ in range(len(query_embeddings))]

    for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
        query_end_idx = min(query_start_idx + query_chunk_size, len(query_embeddings))
        if query_embeddings.is_sparse:
            indices = torch.arange(query_start_idx, query_end_idx, device=query_embeddings.device)
            query_chunk = query_embeddings.index_select(0, indices)
        else:
            query_chunk = query_embeddings[query_start_idx:query_end_idx]

        # Iterate over chunks of the corpus
        for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
            corpus_end_idx = min(corpus_start_idx + corpus_chunk_size, len(corpus_embeddings))
            if corpus_embeddings.is_sparse:
                indices = torch.arange(corpus_start_idx, corpus_end_idx, device=corpus_embeddings.device)
                corpus_chunk = corpus_embeddings.index_select(0, indices)
            else:
                corpus_chunk = corpus_embeddings[corpus_start_idx:corpus_end_idx]

            # Compute the similarities
            cos_scores = score_function(query_chunk, corpus_chunk)

            # Get the top-k scores for each query in this chunk
            cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
                cos_scores, min(top_k, len(cos_scores[0])), dim=1, largest=True, sorted=False
            )
            cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
            cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()

            for query_itr in range(len(cos_scores)):
                for sub_corpus_id, score in zip(cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]):
                    corpus_id = corpus_start_idx + sub_corpus_id
                    query_id = query_start_idx + query_itr
                    if len(queries_result_list[query_id]) < top_k:
                        # heapq keeps the smallest first tuple element at the root, so the worst hit is evicted first
                        heapq.heappush(queries_result_list[query_id], (score, corpus_id))
                    else:
                        heapq.heappushpop(queries_result_list[query_id], (score, corpus_id))

    # Change the data format and sort by decreasing score
    for query_id in range(len(queries_result_list)):
        for doc_itr in range(len(queries_result_list[query_id])):
            score, corpus_id = queries_result_list[query_id][doc_itr]
            queries_result_list[query_id][doc_itr] = {"corpus_id": corpus_id, "score": score}
        queries_result_list[query_id] = sorted(queries_result_list[query_id], key=lambda x: x["score"], reverse=True)

    return queries_result_list


def community_detection(
    embeddings: torch.Tensor | np.ndarray,
    threshold: float = 0.75,
    min_community_size: int = 10,
    batch_size: int = 1024,
    show_progress_bar: bool = False,
) -> list[list[int]]:
    """
Function for Fast Community Detection.

Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
Returns only communities that are larger than min_community_size. The communities are returned
in decreasing order. The first element in each list is the central point in the community.

Args:
    embeddings (torch.Tensor or numpy.ndarray): The input embeddings.
    threshold (float): The threshold for determining if two embeddings are close. Defaults to 0.75.
    min_community_size (int): The minimum size of a community to be considered. Defaults to 10.
    batch_size (int): The batch size for computing cosine similarity scores. Defaults to 1024.
    show_progress_bar (bool): Whether to show a progress bar during computation. Defaults to False.
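
Example:
    A minimal sketch; the model name, threshold and community size are only illustrative choices::

        from sentence_transformers import SentenceTransformer
        from sentence_transformers.util import community_detection

        model = SentenceTransformer("all-MiniLM-L6-v2")
        sentences = [
            "The cat sits outside",
            "The cat plays in the garden",
            "A man is playing guitar",
        ] * 5
        embeddings = model.encode(sentences, convert_to_tensor=True)
        communities = community_detection(embeddings, threshold=0.7, min_community_size=3)
        for community in communities:
            print([sentences[idx] for idx in community])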

Returns:
    List[List[int]]: A list of communities, where each community is represented as a list of indices.
    """
    if not isinstance(embeddings, torch.Tensor):
        embeddings = torch.tensor(embeddings)

    threshold = torch.tensor(threshold, device=embeddings.device)
    embeddings = normalize_embeddings(embeddings)

    extracted_communities = []

    # Maximum size for communities
    min_community_size = min(min_community_size, len(embeddings))
    sort_max_size = min(max(2 * min_community_size, 50), len(embeddings))

    for start_idx in tqdm(
        range(0, len(embeddings), batch_size), desc="Finding clusters", disable=not show_progress_bar
    ):
        # Compute cosine similarity scores
        cos_scores = embeddings[start_idx : start_idx + batch_size] @ embeddings.T

        # Use a torch-heavy approach on accelerators, otherwise a loop-heavy one
        if embeddings.device.type in ["cuda", "npu"]:
            # Threshold the cos scores and determine how many close embeddings exist per embedding
            threshold_mask = cos_scores >= threshold
            row_wise_count = threshold_mask.sum(1)

            # Only consider embeddings with enough close other embeddings
            large_enough_mask = row_wise_count >= min_community_size
            if not large_enough_mask.any():
                continue

            row_wise_count = row_wise_count[large_enough_mask]
            cos_scores = cos_scores[large_enough_mask]

            # The max is the largest potential community, so we use that in topk
            k = row_wise_count.max()
            _, top_k_indices = cos_scores.topk(k=k, largest=True)

            # Use the row-wise count to slice the indices
            for count, indices in zip(row_wise_count, top_k_indices):
                extracted_communities.append(indices[:count].tolist())
        else:
            # Minimum size for a community
            top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

            # Filter for rows >= min_threshold
            for i in range(len(top_k_values)):
                if top_k_values[i][-1] >= threshold:
                    # Only check the top k most similar entries
                    top_val_large, top_idx_large = cos_scores[i].topk(k=sort_max_size, largest=True)

                    # Check if we need to increase sort_max_size
                    while top_val_large[-1] > threshold and sort_max_size < len(embeddings):
                        sort_max_size = min(2 * sort_max_size, len(embeddings))
                        top_val_large, top_idx_large = cos_scores[i].topk(k=sort_max_size, largest=True)

                    extracted_communities.append(top_idx_large[top_val_large >= threshold].tolist())

    # Largest cluster first
    extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for cluster_id, community in enumerate(extracted_communities):
        non_overlapped_community = []
        for idx in community:
            if idx not in extracted_ids:
                non_overlapped_community.append(idx)

        if len(non_overlapped_community) >= min_community_size:
            unique_communities.append(non_overlapped_community)
            extracted_ids.update(non_overlapped_community)

    unique_communities = sorted(unique_communities, key=lambda x: len(x), reverse=True)

    return unique_communities