U
    <Af                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlZd dl	m
Z d dl	mZ d dlmZ d dlmZ d dlmZmZmZ dd	lmZmZ eeZG d
d dejZdS )    N)List)	load_file)	save_file)nn)tqdm)fullnamehttp_getimport_from_string   )WhitespaceTokenizerWordTokenizerc                   @   s   e Zd ZdeeedddZdd Zee	 dd	d
Z
edddZde	edddZdd Zee	dddZedde dfe	ee	edddZdS )WordEmbeddingsF@B )	tokenizerupdate_embeddingsmax_seq_lengthc                 C   s   t j|  t|tr t|}t|tjr6t	|}|
 \}}|| _t ||| _| jd|i || jj_|| _|| _|| _d S )Nweight)r   Module__init__
isinstancelistnpasarrayZndarraytorchZ
from_numpysizeembeddings_dimensionZ	Embedding	emb_layerZload_state_dictr   Zrequires_gradr   r   r   )selfr   embedding_weightsr   r   Znum_embeddingsr    r   O/tmp/pip-unpacked-wheel-i7fohqg6/sentence_transformers/models/WordEmbeddings.pyr      s    



zWordEmbeddings.__init__c                 C   s,   |  |d }d }||||d d |S )N	input_idsattention_mask)token_embeddingsZcls_token_embeddingsr"   )r   update)r   featuresr#   Z
cls_tokensr   r   r    forward-   s    zWordEmbeddings.forward)textsc                    s    fdd|D }dd |D }t |}g }g }|D ]<}dg|t|  }	|||	  |dgt| |	  q6tj|tjdtj|tjdtj|tjdd}
|
S )Nc                    s   g | ]}j j|f qS r   )r   tokenize).0textkwargsr   r   r    
<listcomp>:   s     z+WordEmbeddings.tokenize.<locals>.<listcomp>c                 S   s   g | ]}t |qS r   )len)r)   tokensr   r   r    r-   ;   s     r   r
   )Zdtype)r!   r"   sentence_lengths)maxr.   appendr   Ztensorlong)r   r'   r,   Ztokenized_textsr0   max_lenr!   Zattention_masksr/   paddingoutputr   r+   r    r(   9   s    zWordEmbeddings.tokenize)returnc                 C   s   | j S )N)r   r   r   r   r    get_word_embedding_dimensionM   s    z+WordEmbeddings.get_word_embedding_dimensionT)output_pathsafe_serializationc              	   C   s|   t tj|dd}tj|  |dd W 5 Q R X |rRt|  tj|d nt	
|  tj|d | j
| d S )Nwordembedding_config.jsonw   )indentmodel.safetensorspytorch_model.bin)openospathjoinjsondumpget_config_dictsave_safetensors_fileZ
state_dictr   saver   )r   r:   r;   ZfOutr   r   r    rJ   P   s    zWordEmbeddings.savec                 C   s   t | j| j| jdS )N)tokenizer_classr   r   )r   r   r   r   r8   r   r   r    rH   Z   s    zWordEmbeddings.get_config_dict)
input_pathc              	   C   s   t tj| dd}t|}W 5 Q R X t|d }|| }tjtj| drjttj| d}nt	jtj| dt	
dd}|d }t|||d	 d
}|S )Nr<   rrK   r@   rA   cpu)Zmap_locationzemb_layer.weightr   r   r   r   )rB   rC   rD   rE   rF   loadr	   existsload_safetensors_filer   Zdevicer   )rL   fInconfigrK   r   weightsr   modelr   r   r    rP   a   s    
  zWordEmbeddings.load N)embeddings_file_pathr   item_separatormax_vocab_sizec              
   C   s  t d|  tj| s\t d|  d| ks<d| krJtd| d|  }t||  d }g }g }| drt	j
| dd	d
n
t
| d	d
}	t|	ddd}
|
D ]}| |}|st|dkrq|d }|d krt|d }|d |t| t|d |krt d qtdd |dd  D }|| || |d k	r|dkrt||kr qpqt|}|| t|||dW  5 Q R  S Q R X d S )NzRead in embeddings file {}z.{} does not exist, try to download from server/\zEmbeddings file not found: {}zAhttps://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/z.gzrtutf8)encodingzLoad Word EmbeddingsZ
Embeddings)descunitr>   r   r
   ZPADDING_TOKENz\ERROR: A line in the embeddings file had more or less  dimensions than expected. Skip token.c                 S   s   g | ]}t |qS r   )float)r)   numr   r   r    r-      s     z1WordEmbeddings.from_text_file.<locals>.<listcomp>rO   )loggerinfoformatrC   rD   rQ   
ValueErrorr   endswithgziprB   r   rstripsplitr.   r2   r   zeroserrorarrayr   Z	set_vocabr   )rX   r   rY   r   rZ   urlr   ZvocabZ
embeddingsrS   iteratorlinerk   wordZvectorr   r   r    from_text_filer   sX    
 





  zWordEmbeddings.from_text_file)Fr   )T)__name__
__module____qualname__r   boolintr   r&   r   strr(   r9   rJ   rH   staticmethodrP   r   rs   r   r   r   r    r      s2     
r   )ri   rF   loggingrC   typingr   Znumpyr   r   Zsafetensors.torchr   rR   r   rI   r   r   Zsentence_transformers.utilr   r   r	   r   r   r   	getLoggerrt   rd   r   r   r   r   r   r    <module>   s   
