from typing import List, Optional, Tuple

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_herbert import HerbertTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}


class HerbertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).

    Peculiarities:

    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
      a punctuation character will be treated separately.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the methods. Users should refer
    to the superclass for more information regarding methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
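
    Examples (an illustrative sketch; the `"allegro/herbert-base-cased"` checkpoint name is an assumption of this
    example, not something guaranteed by this module):

    ```python
    >>> from transformers import HerbertTokenizerFast

    >>> tokenizer = HerbertTokenizerFast.from_pretrained("allegro/herbert-base-cased")
    >>> # Single sentences are wrapped as `<s> ... </s>`, pairs as `<s> A </s> B </s>`.
    >>> encoding = tokenizer("Kto ma lepszą sztukę, ma lepszy rząd.")
    >>> pair_encoding = tokenizer("Kto ma lepszą sztukę,", "ma lepszy rząd.")
    ```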
    N<s><unk><pad><mask></s>c	           
   	      s(   t  j||f||||||d|	 d S )N)r   	cls_token	unk_token	pad_token
mask_token	sep_token)super__init__)
selfr
   r   r   r   r   r   r   r   kwargs	__class__ Y/tmp/pip-unpacked-wheel-zw5xktn0/transformers/models/herbert/tokenization_herbert_fast.pyr   2   s    	zHerbertTokenizerFast.__init__)token_ids_0token_ids_1returnc                 C   s8   | j g}| jg}|dkr$|| | S || | | | S )a  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A HerBERT sequence, like a BERT sequence, has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
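
        Example (illustrative; assumes a tokenizer instance has already been loaded, e.g. via `from_pretrained`):

        ```python
        >>> ids = tokenizer.encode("Ala ma kota", add_special_tokens=False)
        >>> tokenizer.build_inputs_with_special_tokens(ids)  # -> [cls_token_id] + ids + [sep_token_id]
        ```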
        N)cls_token_idsep_token_id)r   r    r!   clssepr   r   r    build_inputs_with_special_tokensJ   s
    z5HerbertTokenizerFast.build_inputs_with_special_tokensF)r    r!   already_has_special_tokensr"   c                    sf   |rt  j||ddS |dkr8dgdgt|  dg S dgdgt|  dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
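
        Example (illustrative; assumes a tokenizer instance and uses arbitrary placeholder ids):

        ```python
        >>> tokenizer.get_special_tokens_mask([11, 22, 33])
        [1, 0, 0, 0, 1]
        ```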
        T)r    r!   r(   Nr   r   )r   get_special_tokens_masklen)r   r    r!   r(   r   r   r   r)   e   s      z,HerbertTokenizerFast.get_special_tokens_maskc                 C   sV   | j g}| jg}|dkr.t|| | dg S t|| | dg t|| dg  S )a{  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A HerBERT
        sequence pair mask, like a BERT sequence pair mask, has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
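
        Example (illustrative; assumes a tokenizer instance and uses arbitrary placeholder ids):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([11, 22], [33])
        [0, 0, 0, 0, 1, 1]
        ```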
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(sep + token_ids_1) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
