U
    <Afe#                     @   sv   d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZmZmZ G dd de
jZdS )    N)AnyDictListOptionalTupleUnion)nn)
AutoConfig	AutoModelAutoTokenizer	MT5ConfigT5Configc                       sP  e Zd ZdZd!eee eeeef  eeeef  eeeef  ee e	edd	 fddZ
dddd	Zddd
dZddddZedddZeeejf eeejf dddZedddZd"eee ee eeeef  f eee	f eeejf dddZeeef dddZd#ee	ddddZeed ddd Z  ZS )$Transformera  Huggingface AutoModel to generate token embeddings.
    Loads the correct class, e.g. BERT / RoBERTa etc.

    Args:
        model_name_or_path: Huggingface models name
            (https://huggingface.co/models)
        max_seq_length: Truncate any inputs longer than max_seq_length
        model_args: Keyword arguments passed to the Huggingface
            Transformers model
        tokenizer_args: Keyword arguments passed to the Huggingface
            Transformers tokenizer
        config_args: Keyword arguments passed to the Huggingface
            Transformers config
        cache_dir: Cache dir for Huggingface Transformers to store/load
            models
        do_lower_case: If true, lowercases the input (independent if the
            model is cased or not)
        tokenizer_name_or_path: Name or path of the tokenizer. When
            None, then model_name_or_path is used
    NF)	model_name_or_pathmax_seq_length
model_argstokenizer_argsconfig_args	cache_dirdo_lower_casetokenizer_name_or_pathreturnc	           
         s  t t|   ddg| _|| _|d kr*i }|d kr6i }|d krBi }tj|f|d|i}	| j||	|f| |d k	rd|kr||d< tj|d k	r|n|fd|i|| _	|d krt
| jdrt
| jjdrt
| j	drt| jjj| j	j}|| _|d k	r| j	jj| jj_d S )Nr   r   r   model_max_lengthconfigmax_position_embeddings)superr   __init__config_keysr   r	   from_pretrained_load_modelr   	tokenizerhasattr
auto_modelr   minr   r   r   	__class____name__Ztokenizer_class)
selfr   r   r   r   r   r   r   r   r   r$    L/tmp/pip-unpacked-wheel-i7fohqg6/sentence_transformers/models/Transformer.pyr       s>    



zTransformer.__init__)r   c                 K   sZ   t |tr| j|||f| n8t |tr<| j|||f| ntj|f||d|| _dS )zLoads the transformer modelr   r   N)
isinstancer   _load_t5_modelr   _load_mt5_modelr
   r   r"   )r&   r   r   r   r   r(   r(   r)   r   N   s    

 zTransformer._load_modelc                 K   s2   ddl m} dg|_|j|f||d|| _dS )Loads the encoder model from T5r   )T5EncoderModel	decoder.*r*   N)transformersr/   "_keys_to_ignore_on_load_unexpectedr   r"   )r&   r   r   r   r   r/   r(   r(   r)   r,   Y   s     zTransformer._load_t5_modelc                 K   s2   ddl m} dg|_|j|f||d|| _dS )r.   r   )MT5EncoderModelr0   r*   N)r1   r3   r2   r   r"   )r&   r   r   r   r   r3   r(   r(   r)   r-   b   s     zTransformer._load_mt5_modelc                 C   s   d |  | jjjS )Nz+Transformer({}) with Transformer model: {} )formatget_config_dictr"   r$   r%   r&   r(   r(   r)   __repr__k   s     zTransformer.__repr__)featuresr   c                 C   s   |d |d d}d|kr&|d |d< | j f |ddi}|d }|||d d | j jjrd	}t|d
k rtd}|| }|d|i |S )z#Returns token_embeddings, cls_token	input_idsattention_mask)r9   r:   Ztoken_type_idsZreturn_dictFr   )Ztoken_embeddingsr:            Zall_layer_embeddings)r"   updater   Zoutput_hidden_stateslen)r&   r8   Ztrans_featuresZoutput_statesZoutput_tokensZall_layer_idxZhidden_statesr(   r(   r)   forwardp   s    
zTransformer.forwardc                 C   s
   | j jjS )N)r"   r   Zhidden_sizer6   r(   r(   r)   get_word_embedding_dimension   s    z(Transformer.get_word_embedding_dimensionT)textspaddingr   c              	   C   s   i }t |d tr|g}nt |d trrg }g |d< |D ]0}tt| \}}|| |d | q8|g}n8g g  }}	|D ] }
||
d  |	|
d  q||	g}dd |D }| jrdd |D }|| j	||dd| j
d	 |S )
z-Tokenizes a text and maps tokens to token-idsr   Z	text_keysr=   c                 S   s   g | ]}d d |D qS )c                 S   s   g | ]}t | qS r(   )strstrip.0sr(   r(   r)   
<listcomp>   s     3Transformer.tokenize.<locals>.<listcomp>.<listcomp>r(   rG   colr(   r(   r)   rI      s     z(Transformer.tokenize.<locals>.<listcomp>c                 S   s   g | ]}d d |D qS )c                 S   s   g | ]}|  qS r(   )lowerrF   r(   r(   r)   rI      s     rJ   r(   rK   r(   r(   r)   rI      s     Zlongest_firstpt)rC   Z
truncationZreturn_tensors
max_length)r+   rD   dictnextiteritemsappendr   r>   r    r   )r&   rB   rC   outputZto_tokenizelookupZtext_keytextZbatch1Zbatch2Z
text_tupler(   r(   r)   tokenize   s:    

	zTransformer.tokenizec                    s    fdd j D S )Nc                    s   i | ]}| j | qS r(   )__dict__)rG   keyr6   r(   r)   
<dictcomp>   s      z/Transformer.get_config_dict.<locals>.<dictcomp>)r   r6   r(   r6   r)   r5      s    zTransformer.get_config_dict)output_pathsafe_serializationr   c              	   C   sT   | j j||d | j| ttj|dd}tj| 	 |dd W 5 Q R X d S )N)r]   sentence_bert_config.jsonwr;   )indent)
r"   Zsave_pretrainedr    openospathjoinjsondumpr5   )r&   r\   r]   ZfOutr(   r(   r)   save   s    zTransformer.save)
input_pathr   c              	   C   s   dD ]"}t j| |}t j|r q(qt|}t|}W 5 Q R X d|krhd|d krh|d d d|krd|d kr|d d d|krd|d kr|d d tf d| i|S )N)r^   zsentence_roberta_config.jsonzsentence_distilbert_config.jsonzsentence_camembert_config.jsonzsentence_albert_config.jsonz sentence_xlm-roberta_config.jsonzsentence_xlnet_config.jsonr   Ztrust_remote_coder   r   r   )	rb   rc   rd   existsra   re   loadpopr   )rh   Zconfig_nameZsbert_config_pathZfInr   r(   r(   r)   rj      s    	
zTransformer.load)NNNNNFN)T)T)r%   
__module____qualname____doc__rD   r   intr   r   boolr   r   r,   r-   r7   torchZTensorr@   rA   r   r   r   rX   r5   rg   staticmethodrj   __classcell__r(   r(   r'   r)   r   
   sF          .		$   
(r   )re   rb   typingr   r   r   r   r   r   rq   r   r1   r	   r
   r   r   r   Moduler   r(   r(   r(   r)   <module>   s    