o
    
shx                     @   s   d dl Z d dlZd dlmZmZmZmZ d dlZddl	m
Z
 ddlmZmZmZmZ ddlmZmZmZmZ e rCd dlZddlmZ e rPd dlZdd	lmZ G d
d deZG dd deZeedddG dd deZeZdS )    N)AnyOptionalUnionoverload   )BasicTokenizer)ExplicitEnumadd_end_docstringsis_tf_availableis_torch_available   )ArgumentHandlerChunkPipelineDatasetbuild_pipeline_init_args)/TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES),MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMESc                   @   s*   e Zd ZdZdeeee f fddZdS )"TokenClassificationArgumentHandlerz5
    Handles arguments for token classification.
    inputsc                 K   s   | dd}| d}|d ur%t|ttfr%t|dkr%t|}t|}n$t|tr0|g}d}ntd ur9t|ts?t|tjrE||d |fS t	d| d}|rit|tr_t|d tr_|g}t||krit	d||||fS )	Nis_split_into_wordsF	delimiterr   r   zAt least one input is required.offset_mappingz;offset_mapping should have the same batch size as the input)
get
isinstancelisttuplelenstrr   typesGeneratorType
ValueError)selfr   kwargsr   r   
batch_sizer    r$   i/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/transformers/pipelines/token_classification.py__call__    s$   
"


z+TokenClassificationArgumentHandler.__call__N)__name__
__module____qualname____doc__r   r   r   r&   r$   r$   r$   r%   r      s    r   c                   @   s$   e Zd ZdZdZdZdZdZdZdS )AggregationStrategyzDAll the valid aggregation strategies for TokenClassificationPipelinenonesimplefirstaveragemaxN)	r'   r(   r)   r*   NONESIMPLEFIRSTAVERAGEMAXr$   r$   r$   r%   r+   8   s    r+   T)has_tokenizera
  
        ignore_labels (`list[str]`, defaults to `["O"]`):
            A list of labels to ignore.
        grouped_entities (`bool`, *optional*, defaults to `False`):
            DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
            same entity together in the predictions or not.
        stride (`int`, *optional*):
            If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
            model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The
            value of this argument defines the number of overlapping tokens between chunks. In other words, the model
            will shift forward by `tokenizer.model_max_length - stride` tokens each step.
        aggregation_strategy (`str`, *optional*, defaults to `"none"`):
            The strategy to fuse (or not) tokens based on the model prediction.

                - "none" : Will simply not do any aggregation and simply return raw results from the model
                - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
                  I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D",
                  "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as
                  different entities. On word based languages, we might end up splitting words undesirably : Imagine
                  Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity":
                  "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages
                  that support that meaning, which is basically tokens separated by a space). These mitigations will
                  only work on real words, "New york" might still be tagged with two different entities.
                - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
                  end up with different tags. Words will simply use the tag of the first token of the word when there
                  is ambiguity.
                - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
                  cannot end up with different tags. scores will be averaged first across tokens, and then the maximum
                  label is applied.
                - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
                  end up with different tags. Word entity will simply be the token with the maximum score.c                       sl  e Zd ZdZdZdZdZdZdZe	 f fdd	Z
								d7dee d	ee d
ee deeeeef   dee dee dee fddZedededeeeef  fddZedee dedeeeeef   fddZdeeee f dedeeeeef  eeeeef   f f fddZd8ddZdd ZejdfddZdd Z		d9d ed!ejd"ejdeeeeef   d#ejd
ed$eeee   d%eeeeef   dee fd&d'Zd(ee d
edee fd)d*Z d+ee d
edefd,d-Z!d+ee d
edee fd.d/Z"d+ee defd0d1Z#d2edeeef fd3d4Z$d+ee dee fd5d6Z%  Z&S ):TokenClassificationPipelineuv	  
    Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
    examples](../task_summary#named-entity-recognition) for more information.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
    >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal"
    >>> tokens = token_classifier(sentence)
    >>> tokens
    [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}]

    >>> token = tokens[0]
    >>> # Start and end provide an easy way to highlight words in the original text.
    >>> sentence[token["start"] : token["end"]]
    ' jean-baptiste'

    >>> # Some models use the same idea to do part of speech.
    >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple")
    >>> syntaxer("My name is Sarah and I live in London")
    [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous).

    The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
    	sequencesFTc                    s@   t  j|i | | | jdkrtnt tdd| _|| _d S )NtfF)do_lower_case)	super__init__check_model_type	frameworkr   r   r   _basic_tokenizer_args_parser)r!   args_parserargsr"   	__class__r$   r%   r<      s   

z$TokenClassificationPipeline.__init__Ngrouped_entitiesignore_subwordsaggregation_strategyr   r   strider   c	                 C   sb  i }	||	d< |r|d u rdn||	d< |d ur||	d< i }
|d us$|d urQ|r,|r,t j}n|r4|s4t j}nt j}|d urDtd| d |d urQtd| d |d urvt|tr`t |  }|t jt j	t j
hv rr| jjsrtd||
d	< |d ur~||
d
< |d ur|| jjkrtd|t jkrtd| d| jjrdd|d}||	d< ntd|	i |
fS )Nr    r   r   zl`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="z"` instead.zk`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="z{Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option to `"simple"` or use a fast tokenizer.rG   ignore_labelszl`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)zI`stride` was provided to process all the text but `aggregation_strategy="z&"`, please select another one instead.T)return_overflowing_tokenspaddingrH   tokenizer_paramszm`stride` was provided to process all the text but you're using a slow tokenizer. Please use a fast tokenizer.)r+   r3   r2   r1   warningswarnr   r   upperr5   r4   	tokenizeris_fastr    model_max_length)r!   rJ   rE   rF   rG   r   r   rH   r   preprocess_paramspostprocess_paramsrM   r$   r$   r%   _sanitize_parameters   sx   



z0TokenClassificationPipeline._sanitize_parametersr   r"   returnc                 K      d S Nr$   r!   r   r"   r$   r$   r%   r&         z$TokenClassificationPipeline.__call__c                 K   rX   rY   r$   rZ   r$   r$   r%   r&      r[   c                    sv   | j |fi |\}}}}||d< ||d< |r+tdd |D s+t j|gfi |S |r1||d< t j|fi |S )a  
        Classify each token of the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of texts) for token classification. Can be pre-tokenized when
                `is_split_into_words=True`.

        Return:
            A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the
            corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy) with
            the following keys:

            - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
              want to have the exact string in the original sentence, use `start` and `end`.
            - **score** (`float`) -- The corresponding probability for `entity`.
            - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
              *aggregation_strategy* is not `"none"`.
            - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
              token in the sentence.
            - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
            - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
        r   r   c                 s   s    | ]}t |tV  qd S rY   )r   r   ).0inputr$   r$   r%   	<genexpr>  s    z7TokenClassificationPipeline.__call__.<locals>.<genexpr>r   )r@   allr;   r&   )r!   r   r"   _inputsr   r   r   rC   r$   r%   r&      s   c                 +   s   | di }| jjo| jjdk}d }|d }|rT|d }t|ts&td|}	||	}g }t|}
d}|	D ]}|||t| f |t||
 7 }q7|	}d|d< nt|t	s]td|}| j|f| j
|d| jjd|}|rz| jjsztd	| d
d  t|d }t|D ]J | j
dkr fdd| D }n fdd| D }|d ur||d<  dkr|nd |d<  |d k|d< |d ur| |d< ||d< |V  qd S )NrM   r   r   r   zEWhen `is_split_into_words=True`, `sentence` must be a list of tokens.TzKWhen `is_split_into_words=False`, `sentence` must be an untokenized string.)return_tensors
truncationreturn_special_tokens_maskreturn_offsets_mappingz@is_split_into_words=True is only supported with fast tokenizers.overflow_to_sample_mapping	input_idsr9   c                    s"   i | ]\}}|t |  d qS r   )r9   expand_dimsr\   kvir$   r%   
<dictcomp>F  s   " z:TokenClassificationPipeline.preprocess.<locals>.<dictcomp>c                    s    i | ]\}}||   d qS rg   )	unsqueezeri   rl   r$   r%   rn   H  s     r   sentencer   is_lastword_idsword_to_chars_map)poprQ   rS   r   r   r    joinr   appendr   r>   rR   rangeitemsrr   )r!   rp   r   rT   rM   rb   rs   r   r   wordsdelimiter_lenchar_offsetwordtext_to_tokenizer   
num_chunksmodel_inputsr$   rl   r%   
preprocess  sb   



	
z&TokenClassificationPipeline.preprocessc           
      C   s   | d}| dd }| d}| d}| dd }| dd }| jdkr1| jdi |d }n| jdi |}	t|	trB|	d	 n|	d }|||||||d
|S )Nspecial_tokens_maskr   rp   rq   rr   rs   r9   r   logits)r   r   r   rp   rq   rr   rs   r$   )rt   r>   modelr   dict)
r!   r   r   r   rp   rq   rr   rs   r   outputr$   r$   r%   _forwardT  s(   



z$TokenClassificationPipeline._forwardc                    s|   d u rdg g }|d  d}|D ]}| jdkr3|d d jtjtjfv r3|d d tj }n|d d  }|d d }|d d }	|d d urS|d d nd }
|d	 d  }| d
}t	j
|ddd}t	|| }||jddd }| jdkr|	 }	|
d ur|
 nd }
| j||	||
||||d}| ||} fdd|D }|| qt|}|dkr| |}|S )NOr   rs   ptr   rp   rf   r   r   rr   T)axiskeepdimsr9   )rr   rs   c                    s0   g | ]}| d d vr| dd vr|qS )entityNentity_group)r   r\   r   rJ   r$   r%   
<listcomp>  s    z;TokenClassificationPipeline.postprocess.<locals>.<listcomp>r   )r   r>   dtypetorchbfloat16float16tofloat32numpynpr0   expsumgather_pre_entities	aggregateextendr   aggregate_overlapping_entities)r!   all_outputsrG   rJ   all_entitiesrs   model_outputsr   rp   rf   r   r   rr   maxesshifted_expscorespre_entitiesrE   entitiesr~   r$   r   r%   postprocessn  sN   $




z'TokenClassificationPipeline.postprocessc                 C   s   t |dkr|S t|dd d}g }|d }|D ]>}|d |d   kr*|d k rOn n#|d |d  }|d |d  }||ksL||krN|d |d krN|}q|| |}q|| |S )Nr   c                 S   s   | d S )Nstartr$   )xr$   r$   r%   <lambda>  s    zLTokenClassificationPipeline.aggregate_overlapping_entities.<locals>.<lambda>keyr   endscore)r   sortedrv   )r!   r   aggregated_entitiesprevious_entityr   current_lengthprevious_lengthr$   r$   r%   r     s$   $

z:TokenClassificationPipeline.aggregate_overlapping_entitiesrp   rf   r   r   rr   rs   c	                 C   sp  g }	t |D ]\}
}||
 rq| jt||
 }|dur||
 \}}|durA|durA||
 }|durA|| \}}||7 }||7 }t|tsS| jdkrS| }| }||| }t| jddrrt| jjj	ddrrt
|t
|k}n |tjtjtjhv rtdt |dkod||d |d  v}t||
 | jjkr|}d	}nd}d}d	}|||||
|d
}|	| q|	S )zTFuse various numpy arrays into dicts with all the information needed for aggregationNr   
_tokenizercontinuing_subword_prefixz?Tokenizer does not support real words, using fallback heuristicr   rI   r   F)r|   r   r   r   index
is_subword)	enumeraterQ   convert_ids_to_tokensintr   r>   itemgetattrr   r   r   r+   r3   r4   r5   rN   rO   UserWarningunk_token_idrv   )r!   rp   rf   r   r   r   rG   rr   rs   r   idxtoken_scoresr|   	start_indend_ind
word_index
start_char_word_refr   
pre_entityr$   r$   r%   r     s`   

 z/TokenClassificationPipeline.gather_pre_entitiesr   c                 C   s   |t jt jhv r7g }|D ])}|d  }|d | }| jjj| ||d |d |d |d d}|| qn| ||}|t jkrD|S | 	|S )Nr   r   r|   r   r   )r   r   r   r|   r   r   )
r+   r1   r2   argmaxr   configid2labelrv   aggregate_wordsgroup_entities)r!   r   rG   r   r   
entity_idxr   r   r$   r$   r%   r     s$   

z%TokenClassificationPipeline.aggregater   c                 C   s  | j dd |D }|tjkr&|d d }| }|| }| jjj| }nK|tjkrGt	|dd d}|d }| }|| }| jjj| }n*|tj
krmtdd |D }tj|dd	}	|	 }
| jjj|
 }|	|
 }ntd
||||d d |d d d}|S )Nc                 S      g | ]}|d  qS r|   r$   r   r$   r$   r%   r         z>TokenClassificationPipeline.aggregate_word.<locals>.<listcomp>r   r   c                 S   s   | d   S )Nr   )r0   )r   r$   r$   r%   r   !  s    z<TokenClassificationPipeline.aggregate_word.<locals>.<lambda>r   c                 S   r   )r   r$   r   r$   r$   r%   r   '  r   )r   zInvalid aggregation_strategyr   r   r   )r   r   r|   r   r   )rQ   convert_tokens_to_stringr+   r3   r   r   r   r   r5   r0   r4   r   stacknanmeanr    )r!   r   rG   r|   r   r   r   r   
max_entityaverage_scoresr   
new_entityr$   r$   r%   aggregate_word  s4   





z*TokenClassificationPipeline.aggregate_wordc                 C   s   |t jt jhv rtdg }d}|D ] }|du r|g}q|d r&|| q|| || |g}q|dur@|| || |S )z
        Override tokens from a given word that disagree to force agreement on word boundaries.

        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
        company| B-ENT I-ENT
        z;NONE and SIMPLE strategies are invalid for word aggregationNr   )r+   r1   r2   r    rv   r   )r!   r   rG   word_entities
word_groupr   r$   r$   r%   r   7  s"   z+TokenClassificationPipeline.aggregate_wordsc                 C   sl   |d d  ddd }tdd |D }dd |D }|t|| j||d d	 |d d
 d}|S )z
        Group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        r   r   -r   r   c                 S   r   )r   r$   r   r$   r$   r%   r   \  r   zBTokenClassificationPipeline.group_sub_entities.<locals>.<listcomp>c                 S   r   r   r$   r   r$   r$   r%   r   ]  r   r   r   )r   r   r|   r   r   )splitr   r   meanrQ   r   )r!   r   r   r   tokensr   r$   r$   r%   group_sub_entitiesS  s   


z.TokenClassificationPipeline.group_sub_entitiesentity_namec                 C   sT   | drd}|dd  }||fS | dr"d}|dd  }||fS d}|}||fS )NzB-Br   zI-I)
startswith)r!   r   bitagr$   r$   r%   get_tagh  s   
	
z#TokenClassificationPipeline.get_tagc           	      C   s   g }g }|D ]7}|s| | q| |d \}}| |d d \}}||kr2|dkr2| | q| | | |g}q|rH| | | |S )z
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        r   r   r   )rv   r   r   )	r!   r   entity_groupsentity_group_disaggr   r   r   last_bilast_tagr$   r$   r%   r   v  s   
z*TokenClassificationPipeline.group_entities)NNNNNFNNrY   )NN)'r'   r(   r)   r*   default_input_names_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r<   r   boolr+   r   r   r   r   rV   r   r   r   r&   r   r   r   r1   r   r   r   ndarrayr   r   r   r   r   r   r   __classcell__r$   r$   rC   r%   r7   B   s    #$	
P$,&
';6	

I"r7   ) r   rN   typingr   r   r   r   r   r   models.bert.tokenization_bertr   utilsr   r	   r
   r   baser   r   r   r   
tensorflowr9   models.auto.modeling_tf_autor   r   models.auto.modeling_autor   r   r+   r7   NerPipeliner$   r$   r$   r%   <module>   s2    
"    <