U
    4Af6                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZ ddlm	Z	m
Z
mZ eeZeG dd dZedd	G d
d dZG dd dZG dd deZdS )    N)	dataclass)ListOptionalUnion   )is_tf_availableis_torch_availableloggingc                   @   sJ   e Zd ZU dZeed< eed< dZee ed< dZee ed< dd Z	dS )	InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 C   s   t jt| ddd S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesZasdictself r   F/tmp/pip-unpacked-wheel-zw5xktn0/transformers/data/processors/utils.pyto_json_string1   s    zInputExample.to_json_string)
__name__
__module____qualname____doc__str__annotations__r   r   r   r   r   r   r   r   r
      s   
r
   T)frozenc                   @   sf   e Zd ZU dZee ed< dZeee  ed< dZ	eee  ed< dZ
eeeef  ed< dd ZdS )	InputFeaturesa  
    A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr   c                 C   s   t t| d S )r   r   r   r   r   r   r   r   K   s    zInputFeatures.to_json_string)r   r   r   r   r   intr!   r%   r   r&   r   r   floatr   r   r   r   r   r#   6   s   
r#   c                   @   sN   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
dddZdS )DataProcessorzEBase class for data converters for sequence classification data sets.c                 C   s
   t  dS )z
        Gets an example from a dict with tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NNotImplementedError)r   Ztensor_dictr   r   r   get_example_from_tensor_dictS   s    z*DataProcessor.get_example_from_tensor_dictc                 C   s
   t  dS )z8Gets a collection of [`InputExample`] for the train set.Nr*   r   data_dirr   r   r   get_train_examples]   s    z DataProcessor.get_train_examplesc                 C   s
   t  dS )z6Gets a collection of [`InputExample`] for the dev set.Nr*   r-   r   r   r   get_dev_examplesa   s    zDataProcessor.get_dev_examplesc                 C   s
   t  dS )z7Gets a collection of [`InputExample`] for the test set.Nr*   r-   r   r   r   get_test_examplese   s    zDataProcessor.get_test_examplesc                 C   s
   t  dS )z*Gets the list of labels for this data set.Nr*   r   r   r   r   
get_labelsi   s    zDataProcessor.get_labelsc                 C   s(   t |  dkr$|  t|j |_|S )z
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
           )lenr2   r'   r   )r   exampler   r   r   tfds_mapm   s    zDataProcessor.tfds_mapNc              
   C   s:   t |ddd"}ttj|d|dW  5 Q R  S Q R X dS )z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openlistcsvreader)clsZ
input_filer;   fr   r   r   	_read_tsvv   s    zDataProcessor._read_tsv)N)r   r   r   r   r,   r/   r0   r1   r2   r6   classmethodrB   r   r   r   r   r)   P   s   
	r)   c                   @   sd   e Zd ZdZdddZdd Zd	d
 ZedddZedddZ	dddZ
dddZdddZdS )%SingleSentenceClassificationProcessorz@Generic processor for a single sentence classification data set.NclassificationFc                 C   s4   |d krg n|| _ |d krg n|| _|| _|| _d S N)labelsexamplesmodeverbose)r   rG   rH   rI   rJ   r   r   r   __init__   s    z.SingleSentenceClassificationProcessor.__init__c                 C   s
   t | jS rF   )r4   rH   r   r   r   r   __len__   s    z-SingleSentenceClassificationProcessor.__len__c                 C   s(   t |trt| j| j| dS | j| S )N)rG   rH   )
isinstanceslicerD   rG   rH   )r   idxr   r   r   __getitem__   s    
z1SingleSentenceClassificationProcessor.__getitem__ r   r3   c           	   
   K   s(   | f |}|j ||||||ddd |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examples)add_examples_from_csv)	r@   	file_namerR   rS   rT   rU   rV   kwargs	processorr   r   r   create_from_csv   s    

z5SingleSentenceClassificationProcessor.create_from_csvc                 K   s   | f |}|j ||d |S )N)rG   )add_examples)r@   texts_or_text_and_labelsrG   r[   r\   r   r   r   create_from_examples   s    
z:SingleSentenceClassificationProcessor.create_from_examplesc	                 C   s   |  |}	|r|	dd  }	g }
g }g }t|	D ]`\}}|
||  |||  |d k	rj|||  q.|r|| d| nt|}|| q.| j|
||||dS )Nr3   -)rW   rX   )rB   	enumerateappendr    r^   )r   rZ   rR   rS   rT   rU   rV   rW   rX   linesZtextsrG   idsiliner   r   r   r   rY      s(    
    z;SingleSentenceClassificationProcessor.add_examples_from_csvc              	   C   sB  |d k	r4t |t |kr4tdt | dt | |d k	rht |t |krhtdt | dt | |d kr~d gt | }|d krd gt | }g }t }t|||D ]N\}}	}
t|ttfr|	d kr|\}}	n|}||	 |t	|
|d |	d q|r|| _
n| j
| |r&t|| _ntt| j|| _| j
S )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r   r   r   r   )r4   
ValueErrorsetziprM   tupler=   addrc   r
   rH   extendrG   union)r   r_   rG   re   rW   rX   rH   Zadded_labelsZtext_or_text_and_labelr   r   textr   r   r   r^      s2    

z2SingleSentenceClassificationProcessor.add_examplesTc                    s  |dkr|j }dd t| jD }g }t| jD ]H\}	}
|	d dkrTtd|	  |j|
jdt||j d}|	| q0t
d	d
 |D }g  tt|| jD ]\}	\}}
|	d dkrtd|	 dt| j  |rdndgt| }|t| }|r&|g| | }|rdndg| | }n&||g|  }||r@dndg|  }t||krrtdt| d| t||krtdt| d| | jdkr||
j }n"| jdkrt|
j}n
t| j|	dk rZ| jrZtd td|
j  tdddd |D   tdddd |D   td|
j d| d  	t|||d q|dkr~ S |dkrt std ddl} fd!d"}|jj||j|jd#|jf|dg|dgd#|g f}|S |d$krt std%ddl}dd&l m!} |j"d'd  D |j#d(}|j"d)d  D |j#d(}| jdkr||j"d*d  D |j#d(}n&| jdkr|j"d+d  D |jd(}||||}|S td,dS )-a  
        Convert examples in a list of `InputFeatures`

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
                values)

        Returns:
            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
            `InputFeatures` which can be fed to the model.

        Nc                 S   s   i | ]\}}||qS r   r   ).0rf   r   r   r   r   
<dictcomp>  s      zFSingleSentenceClassificationProcessor.get_features.<locals>.<dictcomp>i'  r   zTokenizing example T)Zadd_special_tokens
max_lengthc                 s   s   | ]}t |V  qd S rF   )r4   )rp   r$   r   r   r   	<genexpr>  s     zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>zWriting example /r3   zError with input length z vs rE   Z
regression   z*** Example ***zguid: zinput_ids:  c                 S   s   g | ]}t |qS r   r    rp   xr   r   r   
<listcomp>6  s     zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>zattention_mask: c                 S   s   g | ]}t |qS r   rw   rx   r   r   r   rz   7  s     zlabel: z (id = )r$   r%   r   tfz?return_tensors set to 'tf' but TensorFlow 2.0 can't be importedc                  3   s$    D ]} | j | jd| jfV  qd S )Nr$   r%   r|   )exfeaturesr   r   genC  s    z?SingleSentenceClassificationProcessor.get_features.<locals>.genr~   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDatasetc                 S   s   g | ]
}|j qS r   )r$   rp   rA   r   r   r   rz   S  s     )Zdtypec                 S   s   g | ]
}|j qS r   )r%   r   r   r   r   rz   T  s     c                 S   s   g | ]
}|j qS r   r   r   r   r   r   rz   V  s     c                 S   s   g | ]
}|j qS r   r   r   r   r   r   rz   X  s     z,return_tensors should be one of 'tf' or 'pt')$max_lenrb   rG   rH   loggerinfoencoder   minrc   maxrj   r4   rh   rI   r   r(   rJ   r   joinr#   r   RuntimeErrorZ
tensorflowdataZDatasetZfrom_generatorZint32Zint64ZTensorShaper   torchZtorch.utils.datar   Ztensorlong)r   	tokenizerrr   Zpad_on_leftZ	pad_tokenZmask_padding_with_zeroZreturn_tensorsZ	label_mapZall_input_idsZex_indexr5   r$   Zbatch_lengthr%   Zpadding_lengthr   r}   r   Zdatasetr   r   Zall_attention_maskZ
all_labelsr   r   r   get_features   s    


  

"
z2SingleSentenceClassificationProcessor.get_features)NNrE   F)rQ   r   r3   NF)N)rQ   r   r3   NFFF)NNFF)NFr   TN)r   r   r   r   rK   rL   rP   rC   r]   r`   rY   r^   r   r   r   r   r   rD   }   s@   
                
       
(     rD   )r>   r   r   r   typingr   r   r   utilsr   r   r	   Z
get_loggerr   r   r
   r#   r)   rD   r   r   r   r   <module>   s   
-