U
    <Af                     @   sN   d dl mZmZ d dlZd dlZd dlmZ d dlmZ G dd dejZ	dS )    )DictUnionN)Image)nnc                       s   e Zd Zdedd fddZedddZeeejf eeejf d	d
dZ	de
eef eeejf dddZeejdddZeddddZeed dddZ  ZS )	CLIPModelopenai/clip-vit-base-patch32N)
model_namereturnc                    s:   t t|   |d kr|}tj|| _tj|| _d S N)superr   __init__transformersZfrom_pretrainedmodelCLIPProcessor	processor)selfr   Zprocessor_name	__class__ J/tmp/pip-unpacked-wheel-i7fohqg6/sentence_transformers/models/CLIPModel.pyr   
   s
    zCLIPModel.__init__)r	   c                 C   s   dS )NzCLIPModel()r   r   r   r   r   __repr__   s    zCLIPModel.__repr__)featuresr	   c              	   C   s   g }g }d|kr2| j j|d d}| j |d }d|kr| j j|d|dd |dd |dd |dd d	}| j |d }g }t|}t|}t|d
 D ].\}	}
|
dkr|t	| q|t	| qt
| |d< |S )Npixel_values)r      	input_idsattention_maskposition_idsoutput_attentionsoutput_hidden_states)r   r   r   r   r   image_text_infor   sentence_embedding)r   Zvision_modelZvisual_projectionZ
text_modelgetZtext_projectioniter	enumerateappendnexttorchstackfloat)r   r   Zimage_embedsZtext_embedsZvision_outputsZtext_outputsr!   image_featuresZtext_featuresidxZ
input_typer   r   r   forward   s.    



zCLIPModel.forwardT)paddingr	   c           
      C   s   g }g }g }t |D ]>\}}t|tjr>|| |d q|| |d qi }t|rr| jj|d|d}t|r| jj|dd}	|	j|d< ||d< t	|S )Nr   r   pt)return_tensorsr-   )r/   r   r    )
r$   
isinstancer   r%   lenr   	tokenizerZimage_processorr   dict)
r   Ztextsr-   ZimagesZtexts_valuesr    r+   dataencodingr*   r   r   r   tokenize6   s"    


zCLIPModel.tokenizec                 C   s   | j S r
   )r   r   r   r   r   r2   N   s    zCLIPModel.tokenizer)output_pathr	   c                 C   s   | j | | j| d S r
   )r   Zsave_pretrainedr   )r   r7   r   r   r   saveR   s    zCLIPModel.save)
input_pathr	   c                 C   s
   t | dS )N)r   )r   )r9   r   r   r   loadV   s    zCLIPModel.load)r   N)T)__name__
__module____qualname__strr   r   r   r'   ZTensorr,   r   boolr6   propertyr   r   r2   r8   staticmethodr:   __classcell__r   r   r   r   r   	   s   	$ $r   )
typingr   r   r'   r   ZPILr   r   Moduler   r   r   r   r   <module>   s
   