"""Tokenization class for model ByT5."""

import warnings
from typing import List, Optional, Tuple

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class ByT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Adds a number of extra ids to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary down to the beginning ("<extra_id_0>" is the last token in the
            vocabulary, as in ByT5 preprocessing; see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
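
    Example (an illustrative sketch, assuming the default special tokens: ids are the raw UTF-8 bytes of the
    text shifted by the 3 leading special tokens `<pad>`/`</s>`/`<unk>`, with the EOS id `1` appended):

    ```python
    >>> from transformers import ByT5Tokenizer

    >>> tokenizer = ByT5Tokenizer()
    >>> tokenizer("Hello").input_ids
    [75, 104, 111, 111, 114, 1]
    ```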
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=125,
        additional_special_tokens=None,
        **kwargs,
    ) -> None:
        # Add the extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to ByT5Tokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        # Left and right stripping is forced for backward compatibility
        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token
        # The unk token needs to be in the vocab with the correct index
        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits
        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=0,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
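
    # Resulting id layout (a sketch of what the constructor above sets up): ids 0-2 are
    # the pad/eos/unk tokens seeded into `_added_tokens_decoder`, ids 3-258 cover the 256
    # raw byte values, and the sentinel tokens created from `extra_ids` are appended
    # after that by the base class.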
zByT5Tokenizer.__init__c                 C   s   | j S )N)r,   r/   r   r   r   
vocab_sizee   s    zByT5Tokenizer.vocab_sizec                    s.    fddt  j j D }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokensr   r3   r   r   
<dictcomp>j   s      z+ByT5Tokenizer.get_vocab.<locals>.<dictcomp>)r%   r4   r+   updateadded_tokens_encoder)r/   Zvocabr   r3   r   	get_vocabi   s    zByT5Tokenizer.get_vocabF)token_ids_0token_ids_1already_has_special_tokensr   c                    sZ   |rt  j||ddS |dkr2dgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
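
        Example (an illustrative sketch with the default flags: only the EOS slot that
        `build_inputs_with_special_tokens` appends is marked):

        ```python
        >>> ByT5Tokenizer().get_special_tokens_mask([75, 104, 111])
        [0, 0, 0, 1]
        ```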
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Normal case: mark the EOS token closing each sequence
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
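
        Example (an illustrative sketch: one zero per token, counting the EOS appended to each sequence):

        ```python
        >>> ByT5Tokenizer().create_token_type_ids_from_sequences([75, 104], [111])
        [0, 0, 0, 0, 0]
        ```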
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
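
        Example (an illustrative sketch, assuming the default vocabulary where `1` is the EOS id):

        ```python
        >>> ByT5Tokenizer().build_inputs_with_special_tokens([75, 104], [111])
        [75, 104, 1, 111, 1]
        ```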
        N)rB   )r/   r9   r:   r   r   r    build_inputs_with_special_tokens   s
    

z.ByT5Tokenizer.build_inputs_with_special_tokens)textr   c                 C   s   dd | dD }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 S   s   g | ]}t |qS r   )chrr   r   r   r   r      s     z+ByT5Tokenizer._tokenize.<locals>.<listcomp>utf-8)encode)r/   rE   tokensr   r   r   	_tokenize   s    zByT5Tokenizer._tokenizec                 C   s$   t |dkrd}nt|| j }|S )z0Converts a token (str) in an id using the vocab.r   N)r&   ordr+   )r/   tokenZtoken_idr   r   r   _convert_token_to_id   s    z"ByT5Tokenizer._convert_token_to_idc                 C   s   t || j }|S )z=Converts an index (integer) in a token (str) using the vocab.)rF   r+   )r/   indexrL   r   r   r   _convert_id_to_token   s    z"ByT5Tokenizer._convert_id_to_tokenc                 C   sh   d}|D ]L}|| j kr(| j | d}n$|| jkr>|d}ntt|g}||7 }q|jddd}|S )z:Converts a sequence of tokens (string) in a single string.r   rG   ignore)errors)Zadded_tokens_decoderrH   r7   bytesrK   decode)r/   rI   bstringrL   Z
tok_stringstringr   r   r   convert_tokens_to_string   s    


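
    # ByT5 has no vocabulary file to save: the byte vocabulary is implicit in the id
    # arithmetic above, so save_vocabulary has nothing to write to disk.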
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        return ()