U
    4A·fä  ã                   @   sŠ   d Z ddlZddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ dZe e¡Zeeejd	œd
d„ZG dd„ dƒZG dd„ dƒZdS )z%REALM Retriever model implementation.é    N)ÚOptionalÚUnion)Úhf_hub_downloadé   )ÚAutoTokenizer)Úloggingzblock_records.npy)Úblock_records_pathÚnum_block_recordsÚreturnc                 C   sF   dd l m  m} |jj| dd}|j|dd}t| d¡ ¡ ƒ}|S )Nr   i    )Úbuffer_sizeT)Zdrop_remainderé   )	Ztensorflow.compat.v1ÚcompatZv1ÚdataZTFRecordDatasetÚbatchÚnextÚtakeZas_numpy_iterator)r   r	   ÚtfZblocks_datasetZ	np_record© r   úX/tmp/pip-unpacked-wheel-zw5xktn0/transformers/models/deprecated/realm/retrieval_realm.pyÚconvert_tfrecord_to_np!   s
    r   c                   @   s"   e Zd ZdZddd„Zdd	„ Zd
S )ÚScaNNSearcherztNote that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included.é   éè  éd   é † c           	      C   sD   ddl m} |||dd}|j|||d}|j|d}| ¡ | _dS )zBuild scann searcher.r   )ÚbuilderZdot_product)ÚdbÚnum_neighborsZdistance_measure)Ú
num_leavesÚnum_leaves_to_searchÚtraining_sample_size)Údimensions_per_blockN)Z#scann.scann_ops.py.scann_ops_pybindr   ÚtreeZscore_ahÚbuildÚsearcher)	Úselfr   r   r!   r   r   r    ZBuilderr   r   r   r   Ú__init__.   s      ÿzScaNNSearcher.__init__c                 C   s"   | j  | ¡  ¡ ¡\}}| d¡S )NZint64)r$   Úsearch_batchedÚdetachÚcpuZastype)r%   Zquestion_projectionÚretrieved_block_idsÚ_r   r   r   r'   C   s    zScaNNSearcher.search_batchedN)r   r   r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r&   r'   r   r   r   r   r   +   s       ù
r   c                       sZ   e Zd ZdZ‡ fdd„Zddd„Zeeee	e
jf  dœd	d
„ƒZdd„ Zdd„ Z‡  ZS )ÚRealmRetrieverah  The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer
    positions."

        Parameters:
            block_records (`np.ndarray`):
                A numpy array which cantains evidence texts.
            tokenizer ([`RealmTokenizer`]):
                The tokenizer to encode retrieved texts.
    c                    s   t ƒ  ¡  || _|| _d S ©N)Úsuperr&   Úblock_recordsÚ	tokenizer)r%   r3   r4   ©Ú	__class__r   r   r&   S   s    
zRealmRetriever.__init__NÚptc                 C   sš   t j| j|dd}| jj|d dd}g }g }	|D ]}
| |¡ |	 |
 ¡ ¡ q2| j||	ddd|d}| |¡}|d k	rŠ|  ||¡|f S d d d |fS d S )Nr   )ÚindicesZaxisT)Zskip_special_tokens)ÚpaddingZ
truncationZreturn_special_tokens_maskÚ
max_length)Únpr   r3   r4   ÚdecodeÚappendZconvert_to_tensorsÚblock_has_answer)r%   r*   Zquestion_input_idsÚ
answer_idsr:   Zreturn_tensorsZretrieved_blocksZquestionÚtextZ	text_pairZretrieved_blockÚconcat_inputsZconcat_inputs_tensorsr   r   r   Ú__call__X   s&    
     ÿ
zRealmRetriever.__call__)Úpretrained_model_name_or_pathc                 O   sZ   t j |¡rt j |t¡}ntf |tdœ|—Ž}tj|dd}tj	|f|ž|Ž}| ||ƒS )N)Zrepo_idÚfilenameT)Zallow_pickle)
ÚosÚpathÚisdirÚjoinÚ_REALM_BLOCK_RECORDS_FILENAMEr   r;   Úloadr   Úfrom_pretrained)ÚclsrC   Zinit_inputsÚkwargsr   r3   r4   r   r   r   rK   m   s     ÿÿzRealmRetriever.from_pretrainedc                 C   s(   t  tj |t¡| j¡ | j |¡ d S r1   )	r;   ÚsaverE   rF   rH   rI   r3   r4   Úsave_pretrained)r%   Zsave_directoryr   r   r   rO   {   s    zRealmRetriever.save_pretrainedc                 C   sj  g }g }g }d}|j D ] }| ¡ }| | jj¡}	|	d ||	d d…  | jj¡ }
| g ¡ | g ¡ |D ]h}t|	d |
ƒD ]T}|d || kr~|||t|ƒ … |kr~|d  |¡ |d  |t|ƒ d ¡ q~qlt|d ƒdkrò| d¡ q| d¡ t|d ƒ|krt|d ƒ}qt||ƒD ]:\}}t|ƒ|k r$dg|t|ƒ  }||7 }||7 }q$|||fS )z&check if retrieved_blocks has answers.r   r   NéÿÿÿÿFT)	Z	input_idsÚtolistÚindexr4   Zsep_token_idr=   ÚrangeÚlenÚzip)r%   rA   r?   Zhas_answersZ	start_posZend_posZmax_answersZinput_idZinput_id_listZfirst_sep_idxZsecond_sep_idxZanswerÚidxZ
start_pos_Zend_pos_Zpaddedr   r   r   r>      s6    "


zRealmRetriever.block_has_answer)Nr7   )r,   r-   r.   r/   r&   rB   Úclassmethodr   r   ÚstrrE   ÚPathLikerK   rO   r>   Ú__classcell__r   r   r5   r   r0   H   s   

r0   )r/   rE   Útypingr   r   Znumpyr;   Zhuggingface_hubr   Ú r   Úutilsr   rI   Z
get_loggerr,   ÚloggerrX   ÚintZndarrayr   r   r0   r   r   r   r   Ú<module>   s   

