import os
from typing import List, Union

import tensorflow as tf
from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs

from ...modeling_tf_utils import keras
from .tokenization_bert import BertTokenizer


class TFBertTokenizer(keras.layers.Layer):
    """
    This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs.

    Args:
        vocab_list (`list`):
            List containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        cls_token_id (`int`, *optional*, defaults to the id of `"[CLS]"` in `vocab_list`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        sep_token_id (`int`, *optional*, defaults to the id of `"[SEP]"` in `vocab_list`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token_id (`int`, *optional*, defaults to the id of `"[PAD]"` in `vocab_list`):
            The token used for padding, for example when batching sequences of different lengths.
        padding (`str`, defaults to `"longest"`):
            The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch,
            or `"max_length", to pad all inputs to the maximum length supported by the tokenizer.
        truncation (`bool`, *optional*, defaults to `True`):
            Whether to truncate the sequence to the maximum length.
        max_length (`int`, *optional*, defaults to `512`):
            The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if
            `truncation` is `True`).
        pad_to_multiple_of (`int`, *optional*, defaults to `None`):
            If set, the sequence will be padded to a multiple of this value.
        return_token_type_ids (`bool`, *optional*, defaults to `True`):
            Whether to return token_type_ids.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention_mask.
        use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`):
            If True, will use the FastBertTokenizer class from Tensorflow Text. If False, will use the BertTokenizer
            class instead. BertTokenizer supports some additional options, but is slower and cannot be exported to
            TFLite.
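
    Example:

    A minimal end-to-end sketch. The checkpoint name and the use of `TFAutoModel` here are illustrative
    assumptions; any TF BERT checkpoint with a matching vocabulary should work the same way.

    ```python
    import tensorflow as tf

    from transformers import TFAutoModel, TFBertTokenizer

    tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")

    # Tokenization happens inside the TF graph, so the combined model accepts raw strings
    string_input = tf.keras.Input(shape=(), dtype=tf.string)
    tokenized = tokenizer(string_input)
    outputs = model(tokenized)
    end_to_end = tf.keras.Model(string_input, outputs)

    end_to_end(tf.constant(["The quick brown fox jumped over the lazy dog."]))
    ```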
    NlongestT   )
vocab_listdo_lower_casecls_token_idsep_token_idpad_token_idpadding
truncation
max_lengthpad_to_multiple_ofreturn_token_type_idsreturn_attention_maskuse_fast_bert_tokenizerc              	      s  t    |r*t|ftj|d|| _nVtjjtjj|tj	tj
tj|tjdtjdtjddd}t|ftj|d|| _|| _|| _|d kr|dn|| _|d kr|d	n|| _|d kr|d
n|| _t|d dd| _|| _|| _|| _|	| _|
| _|| _d S )N)token_out_typeZlower_case_nfd_strip_accents)Zout_type)Zdtype)keysZ	key_dtypevaluesZvalue_dtyper   )Znum_oov_buckets)r   Z
lower_casez[CLS]z[SEP]z[PAD]r
   Zaxis)super__init__r   tfZint64tf_tokenizerlookupZStaticVocabularyTableZKeyValueTensorInitializerstringrangesizeBertTokenizerLayerr   r   indexr   r   r   r   paired_trimmerr   r   r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   Ztokenizer_kwargsZlookup_table	__class__ Q/tmp/pip-unpacked-wheel-zw5xktn0/transformers/models/bert/tokenization_bert_tf.pyr!   9   sN    
 
	 zTFBertTokenizer.__init__ZPreTrainedTokenizerBase)	tokenizerc           	      K   s   | dd}|dkr|jn|}| dd}|dkr8|jn|}| dd}|dkrV|jn|}| dd}|dkrt|jn|}| }t| dd d}d	d
 |D }| f |||||d|S )a  
        Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

        Args:
            tokenizer (`PreTrainedTokenizerBase`):
                The tokenizer to use to initialize the `TFBertTokenizer`.

        Examples:

        ```python
        from transformers import AutoTokenizer, TFBertTokenizer

        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
        ```
        """
        do_lower_case = kwargs.pop("do_lower_case", None)
        do_lower_case = tokenizer.do_lower_case if do_lower_case is None else do_lower_case
        cls_token_id = kwargs.pop("cls_token_id", None)
        cls_token_id = tokenizer.cls_token_id if cls_token_id is None else cls_token_id
        sep_token_id = kwargs.pop("sep_token_id", None)
        sep_token_id = tokenizer.sep_token_id if sep_token_id is None else sep_token_id
        pad_token_id = kwargs.pop("pad_token_id", None)
        pad_token_id = tokenizer.pad_token_id if pad_token_id is None else pad_token_id

        vocab = tokenizer.get_vocab()
        vocab = sorted(vocab.items(), key=lambda x: x[1])  # Sort tokens by id so the list index matches the token id
        vocab_list = [entry[0] for entry in vocab]
        return cls(
            vocab_list=vocab_list,
            do_lower_case=do_lower_case,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        """
        Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name or path to the pre-trained tokenizer.

        Examples:

        ```python
        from transformers import TFBertTokenizer

        tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        ```
        """
        try:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        except:  # noqa: E722
            from .tokenization_bert_fast import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        return cls.from_tokenizer(tokenizer, **kwargs)

    def unpaired_tokenize(self, texts):
        if self.do_lower_case:
            texts = case_fold_utf8(texts)
        tokens = self.tf_tokenizer.tokenize(texts)
        return tokens.merge_dims(1, -1)

    def call(
        self,
        text,
        text_pair=None,
        padding=None,
        truncation=None,
        max_length=None,
        pad_to_multiple_of=None,
        return_token_type_ids=None,
        return_attention_mask=None,
    ):
        if padding is None:
            padding = self.padding
        if padding not in ("longest", "max_length"):
            raise ValueError("Padding must be either 'longest' or 'max_length'!")
        if max_length is not None and text_pair is not None:
            # The paired trimmer is built in __init__ with a fixed length, so it cannot change per call
            raise ValueError("max_length cannot be overridden at call time when truncating paired texts!")
        if max_length is None:
            max_length = self.max_length
        if truncation is None:
            truncation = self.truncation
        if pad_to_multiple_of is None:
            pad_to_multiple_of = self.pad_to_multiple_of
        if return_token_type_ids is None:
            return_token_type_ids = self.return_token_type_ids
        if return_attention_mask is None:
            return_attention_mask = self.return_attention_mask
        if not isinstance(text, tf.Tensor):
            text = tf.convert_to_tensor(text)
        if text_pair is not None and not isinstance(text_pair, tf.Tensor):
            text_pair = tf.convert_to_tensor(text_pair)
        if text_pair is not None:
            if text.shape.rank > 1:
                raise ValueError("text argument should not be multidimensional when a text pair is supplied!")
            if text_pair.shape.rank > 1:
                raise ValueError("text_pair should not be multidimensional!")
        if text.shape.rank == 2:
            text, text_pair = text[:, 0], text[:, 1]
        text = self.unpaired_tokenize(text)
        if text_pair is None:  # Unpaired text
            if truncation:
                text = text[:, : max_length - 2]  # Allow room for special tokens
            input_ids, token_type_ids = combine_segments(
                (text,), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id
            )
        else:  # Paired text
            text_pair = self.unpaired_tokenize(text_pair)
            if truncation:
                text, text_pair = self.paired_trimmer.trim([text, text_pair])
            input_ids, token_type_ids = combine_segments(
                (text, text_pair), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id
            )
        if padding == "longest":
            pad_length = input_ids.bounding_shape(axis=1)
            if pad_to_multiple_of is not None:
                # No ceiling division in TensorFlow, so we negate floordiv instead
                pad_length = pad_to_multiple_of * (-tf.math.floordiv(-pad_length, pad_to_multiple_of))
        else:
            pad_length = max_length

        input_ids, attention_mask = pad_model_inputs(
            input_ids, max_seq_length=pad_length, pad_value=self.pad_token_id
        )
        output = {"input_ids": input_ids}
        if return_attention_mask:
            output["attention_mask"] = attention_mask
        if return_token_type_ids:
            token_type_ids, _ = pad_model_inputs(
                token_type_ids, max_seq_length=pad_length, pad_value=self.pad_token_id
            )
            output["token_type_ids"] = token_type_ids
        return output

    def get_config(self):
        return {
            "vocab_list": self.vocab_list,
            "do_lower_case": self.do_lower_case,
            "cls_token_id": self.cls_token_id,
            "sep_token_id": self.sep_token_id,
            "pad_token_id": self.pad_token_id,
        }