U
    4Af\                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ eeZdZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZdS )    N)DictListOptional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c                   @   sB   e Zd ZdZdeeeee dddZdd Z	e
jd	d
dZdS )TextDatasetH
    This will be superseded by a framework-agnostic approach soon.
    FN)	tokenizer	file_path
block_size	cache_dirc              
   C   s  t tdt tj|dkr2td| d||j	dd }tj
|\}}tj|d k	rd|n|d|jj d| d| }|d }	t|	6 tj|r|st }
t|d	}t|| _W 5 Q R X td
| dt |
  ntd|  g | _t|dd}| }W 5 Q R X |||}tdt|| d |D ]$}| j|||||   qLt }
t|d}tj| j|tjd W 5 Q R X td| dt |
 dd W 5 Q R X d S )Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpairZ
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftextZtokenized_texti rP   P/tmp/pip-unpacked-wheel-zw5xktn0/transformers/data/datasets/language_modeling.py__init__-   sN    
 
zTextDataset.__init__c                 C   s
   t | jS Nr@   r9   rE   rP   rP   rQ   __len__j   s    zTextDataset.__len__returnc                 C   s   t j| j| t jdS )NZdtype)torchtensorr9   longrE   rO   rP   rP   rQ   __getitem__m   s    zTextDataset.__getitem__)FN)r3   
__module____qualname____doc__r   strintr   rR   rV   rZ   ZTensorr^   rP   rP   rP   rQ   r
   (   s   	  =r
   c                   @   sB   e Zd ZdZeeedddZdd Ze	ee
jf ddd	Zd
S )LineByLineTextDatasetr   r   r   r   c              	   C   s   t tdt tj|dkr2td| dt	
d|  t|dd}dd	 |  D }W 5 Q R X ||d
d
|d}|d | _dd	 | jD | _d S )Nr   Fr   r   r   r   r   c                 S   s$   g | ]}t |d kr| s|qS r   )r@   isspace.0linerP   rP   rQ   
<listcomp>   s       z2LineByLineTextDataset.__init__.<locals>.<listcomp>TZadd_special_tokensZ
truncation
max_length	input_idsc                 S   s    g | ]}d t j|t jdiqS rn   rY   rZ   r[   r\   ri   erP   rP   rQ   rk      s     )r&   r'   r(   r)   r*   r+   r,   r-   r.   r:   r;   r6   r<   
splitlinesr9   )rE   r   r   r   rM   linesbatch_encodingrP   rP   rQ   rR   v   s     
zLineByLineTextDataset.__init__c                 C   s
   t | jS rS   rT   rU   rP   rP   rQ   rV      s    zLineByLineTextDataset.__len__rW   c                 C   s
   | j | S rS   r9   r]   rP   rP   rQ   r^      s    z!LineByLineTextDataset.__getitem__Nr3   r_   r`   ra   r   rb   rc   rR   rV   r   rZ   r[   r^   rP   rP   rP   rQ   rd   q   s   rd   c                   @   sD   e Zd ZdZeeeedddZdd Ze	ee
jf ddd	Zd
S )LineByLineWithRefDatasetr   )r   r   r   ref_pathc              
   C   sp  t tdt tj|dkr2td| dtj|dkrRtd| dt	
d|  t	
d|  t|dd	}| }W 5 Q R X d
d |D }t|dd	}dd |  D }W 5 Q R X t|t|krtd| dt| d| dt| ||dd|d}|d | _dd | jD | _t| j}	t|	D ]$}
tj||
 tjd| j|
 d< qFd S )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r   c                 S   s(   g | ] }t |d kr| s| qS rf   )r@   rg   striprh   rP   rP   rQ   rk      s       z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>c                 S   s*   g | ]"}t |d kr| st|qS rf   )r@   rg   jsonloadsrh   rP   rP   rQ   rk      s       zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trl   rn   c                 S   s    g | ]}d t j|t jdiqS ro   rp   rq   rP   rP   rQ   rk      s     rY   Zchinese_ref)r&   r'   r(   r)   r*   r+   r,   r-   r.   r:   r;   r6   	readlinesr<   rs   r@   r9   r?   rZ   r[   r\   )rE   r   r   r   ry   rM   datarefru   nrO   rP   rP   rQ   rR      s6     "

z!LineByLineWithRefDataset.__init__c                 C   s
   t | jS rS   rT   rU   rP   rP   rQ   rV      s    z LineByLineWithRefDataset.__len__rW   c                 C   s
   | j | S rS   rv   r]   rP   rP   rQ   r^      s    z$LineByLineWithRefDataset.__getitem__Nrw   rP   rP   rP   rQ   rx      s   $rx   c                   @   sL   e Zd ZdZeeedddZdddZdd	 Z	e
eejf d
ddZdS )LineByLineWithSOPTextDatasetzY
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    )r   file_dirr   c              
      s$  t tdt tj|dkr0t| dt	
d|  g | _t|D ]}tj||}tj|dkrt| dd}t|dd~}| }g }	|D ]f}
d|
krd	}qd
|
krd} fdd|	dd  D }| || }| j| g }	q|r|	|
 qW 5 Q R X qPt	
d d S )Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r   z<doc id=Tz</doc>c                    s0   g | ](}t |d kr| s  |qS rf   )r@   rg   r=   r>   rh   r   rP   rQ   rk      s    z9LineByLineWithSOPTextDataset.__init__.<locals>.<listcomp>r   zDataset parse finished.)r&   r'   r(   r)   r*   r+   r,   isdirr.   r:   r;   r9   listdirr1   r-   r6   r}   create_examples_from_documentextendrA   )rE   r   r   r   	file_namer   Zarticle_openrM   Zoriginal_linesZarticle_linesrj   documentr9   rP   r   rQ   rR      s@    

z%LineByLineWithSOPTextDataset.__init__皙?c                 C   s  ||j dd }|}t |k r,td|}g }g }d}	d}
|
t|k r||
 }|s`|
d7 }
q<|| |	t|7 }	|
t|d ks|	|kr|rd}t|dkrtdt|d }g }t|D ]}|||  qg }t|t|D ]}|||  qt|dks<t|dkrq<t dk r:d}|| }}nd}dd	 }|||| t|dksttd
t| dt|dkstdt| d|||}|	||}t
j|t
jdt
j|t
jdt
j|rdndt
jdd}|| g }d}	|
d7 }
q<|S )'Creates examples for a single document.Tr      r   r         ?Fc                 S   sh   t | t | }||krqdt | t |kr.| n|}t |dksFtdt dk rZ|d= q |  q dS )z;Truncates a pair of sequences to a maximum sequence length.r   z8Sequence length to be truncated must be no less than oner   r   N)r@   r.   randompop)tokens_atokens_bmax_num_tokenstotal_lengthZtrunc_tokensrP   rP   rQ   truncate_seq_pair-  s    zULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pairLength of sequence a is  which must be no less than 1Length of sequence b is rY   )rn   token_type_idsZsentence_order_label)r/   r   randintr@   rA   r?   r   r.   rB   $create_token_type_ids_from_sequencesrZ   r[   r\   )rE   r   r   r   Zshort_seq_probr   target_seq_lengthr9   current_chunkcurrent_lengthrO   segmenta_endr   jr   Zis_nextr   rn   r   examplerP   rP   rQ   r      sb    	


z:LineByLineWithSOPTextDataset.create_examples_from_documentc                 C   s
   t | jS rS   rT   rU   rP   rP   rQ   rV   S  s    z$LineByLineWithSOPTextDataset.__len__rW   c                 C   s
   | j | S rS   rv   r]   rP   rP   rQ   r^   V  s    z(LineByLineWithSOPTextDataset.__getitem__N)r   )r3   r_   r`   ra   r   rb   rc   rR   r   rV   r   rZ   r[   r^   rP   rP   rP   rQ   r      s
   )
cr   c                   @   sN   e Zd ZdZdeeedddZeee  eedd	d
Z	dd Z
dd ZdS )$TextDatasetForNextSentencePredictionr   Fr   r   re   c              
   C   s$  t tdt tj|s.td| d|| _	|| _
tj|\}}tj|d|jj d| d| }	|| _|	d }
t|
 tj|	r|st }t|	d}t|| _W 5 Q R X td|	 d	t |  n4td
|  g g| _t|ddt}| }|sqv| }|sHt| jd dkrH| jg  ||}||}|r| jd | qW 5 Q R X tdt| j d g | _t | jD ]\}}| !||| qt }t|	d}tj"| j|tj#d W 5 Q R X td|	 dt | dd W 5 Q R X d S )Nr   r   r   Zcached_nsp_r   r   r   r   r   r   r   r   r   zCreating examples from z documents.r   r    r"   r#   r$   r%   )$r&   r'   r(   r)   r*   r+   r,   r-   r.   short_seq_probabilitynsp_probabilityr0   r1   r2   r3   r   r   r4   r5   r6   r7   r8   r9   r:   r;   	documentsreadlinerz   r@   rA   r>   r=   	enumerater   rC   rD   )rE   r   r   r   rF   r   r   rG   rH   rI   rJ   rK   rL   rM   rj   tokens	doc_indexr   rP   rP   rQ   rR   _  sb    	
 


z-TextDatasetForNextSentencePrediction.__init__)r   r   r   c                 C   s  || j jdd }|}t | jk r0td|}g }d}d}|t|k r~|| }	||	 |t|	7 }|t|d ks||krt|rld}
t|dkrtdt|d }
g }t|
D ]}|||  qg }t|dkst | j	k rd}|t| }tdD ],}tdt| j
d }||kr q2q| j
| }tdt|d }t|t|D ](}|||  t||kr^ qq^t||
 }||8 }n(d}t|
t|D ]}|||  qt|dkstdt| d	t|dks
td
t| d	| j ||}| j ||}tj|tjdtj|tjdtj|rPdndtjdd}| j| g }d}|d7 }q<dS )r   Tr   r   r   r   
   Fr   r   r   rY   )rn   r   Znext_sentence_labelN)r   r/   r   r   r   r@   rA   r?   r   r   r   r.   rB   r   rZ   r[   r\   r9   )rE   r   r   r   r   r   r   r   rO   r   r   r   r   r   Zis_random_nextZtarget_b_lengthr   Zrandom_document_indexZrandom_documentZrandom_startZnum_unused_segmentsrn   r   r   rP   rP   rQ   r     sh    	





zBTextDatasetForNextSentencePrediction.create_examples_from_documentc                 C   s
   t | jS rS   rT   rU   rP   rP   rQ   rV     s    z,TextDatasetForNextSentencePrediction.__len__c                 C   s
   | j | S rS   rv   r]   rP   rP   rQ   r^     s    z0TextDatasetForNextSentencePrediction.__getitem__N)Fr   r   )r3   r_   r`   ra   r   rb   rc   rR   r   r   rV   r^   rP   rP   rP   rQ   r   Z  s   	   UZr   )r{   r+   r7   r   r5   r&   typingr   r   r   rZ   Zfilelockr   Ztorch.utils.datar   Ztokenization_utilsr   utilsr	   Z
get_loggerr3   r:   r(   r
   rd   rx   r   r   rP   rP   rP   rQ   <module>   s(   
I!0 