U
    <Af`                     @   sp   d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlmZm	Z	 ddl
mZ eeZG dd de	jZdS )    N)DictListLiteral)Tensornn   )WhitespaceTokenizerc                       s   e Zd ZdZi ddfee eeef eed fddZ	eee
f ddd	Zee ee d
ddZdd Zdeee  eeed ej
f dddZdd Zdd Zedd Z  ZS )BoWzImplements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    r   T)vocabword_weightsunknown_word_weightcumulative_term_frequencyc                    s   t t|   tt|}ddddg| _|| _|| _|| _|| _	g | _
d}|D ]H}|}||krh|| }n"| |kr||  }n|d7 }| j
| qNtd|t|| t|t dd	| _t|| _d S )
Nr
   r   r   r   r   r   z>{} out of {} words without a weighting value. Set weight to {}F)Z
stop_wordsZdo_lower_case)superr	   __init__listsetconfig_keysr
   r   r   r   weightslowerappendloggerinfoformatlenr   	tokenizersentence_embedding_dimension)selfr
   r   r   r   Znum_unknown_wordswordZweight	__class__ D/tmp/pip-unpacked-wheel-i7fohqg6/sentence_transformers/models/BoW.pyr      s4    
  zBoW.__init__)featuresc                 C   s   |S Nr    )r   r"   r    r    r!   forward9   s    zBoW.forward)textsreturnc                    s    fdd|D } |S )Nc                    s   g | ]}j j|f qS r    )r   tokenize).0textkwargsr   r    r!   
<listcomp>>   s     z BoW.tokenize.<locals>.<listcomp>)get_sentence_features)r   r%   r+   Z	tokenizedr    r*   r!   r'   =   s    zBoW.tokenizec                 C   s   | j S r#   )r   r   r    r    r!    get_sentence_embedding_dimensionA   s    z$BoW.get_sentence_embedding_dimensionr   sentence_embedding)tokenized_textspad_seq_lengthr&   c                 C   sp   g }|D ]X}t j|  t jd}|D ]0}| jrF||  | j| 7  < q$| j| ||< q$|| qdt |iS )N)Zdtyper0   )torchzerosr/   Zfloat32r   r   r   stack)r   r1   r2   ZvectorstokensZvectortokenr    r    r!   r-   D   s    zBoW.get_sentence_featuresc                    s    fdd j D S )Nc                    s   i | ]}| j | qS r    )__dict__)r(   keyr.   r    r!   
<dictcomp>U   s      z'BoW.get_config_dict.<locals>.<dictcomp>)r   r.   r    r.   r!   get_config_dictT   s    zBoW.get_config_dictc              	   C   s8   t tj|dd}tj|  |dd W 5 Q R X d S )Nconfig.jsonw   )indent)openospathjoinjsondumpr;   )r   Zoutput_pathZfOutr    r    r!   saveW   s    zBoW.savec              	   C   s2   t tj| d}t|}W 5 Q R X tf |S )Nr<   )r@   rA   rB   rC   rD   loadr	   )Z
input_pathZfInconfigr    r    r!   rG   [   s    zBoW.load)r   )__name__
__module____qualname____doc__r   strr   floatboolr   r   r$   intr'   r/   r   r3   r-   r;   rF   staticmethodrG   __classcell__r    r    r   r!   r	      s.   
% 
 r	   )rD   loggingrA   typingr   r   r   r3   r   r   r   r   	getLoggerrI   r   Moduler	   r    r    r    r!   <module>   s   
