"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...utils import ModelOutput, logging
from .configuration_idefics import IdeficsVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r    r   r   F/tmp/pip-unpacked-wheel-zw5xktn0/transformers/models/idefics/vision.pyr   "   s
   
r   c                       sR   e Zd Zed fddZejeeejdddZdej	e
ejdd	d
Z  ZS )IdeficsVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)Zin_channelsZout_channelsZkernel_sizeZstrideZbias   r   position_ids)r   )
persistent)super__init__r   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr   Zrandnclass_embeddingZConv2dnum_channelspatch_embeddingnum_patchesnum_positionsZ	Embeddingposition_embeddingZregister_bufferZarangeexpandselfr   	__class__r   r   r%   A   s"    
z IdeficsVisionEmbeddings.__init__)
embeddingsheightwidthreturnc                 C   s  |j d d }| | j}|j d d }||kr<||kr<|S |dddf }|ddddf }|j d }	|| jj }
|| jj }|
d |d  }
}t|}|dt|t||	}|	dddd}|j
tjk}|rtd |tj}tjj||
| || fd	d
d}|r|tj}t|
|j d ksFt||j d krxtdt|
t|f d|j d |j d f d|	dddddd|	}tj|d|fddS )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """
        num_patches = embeddings.shape[1] - 1
        pos_embed = self.position_embedding(self.position_ids)
        num_positions = pos_embed.shape[1] - 1
        if num_patches == num_positions and height == width:
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        embed_dim = embeddings.shape[-1]
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size
        # add a small number to the patch counts to avoid floating point errors in the interpolation
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
        sqrt_num_positions = math.sqrt(num_positions)
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in "
                "nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a "
                "slight overhead."
            )
            patch_pos_embed = patch_pos_embed.to(torch.float)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
            mode="bicubic",
            align_corners=False,
        )
        if fp32_upcasting:
            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size}). "
                "You should try to set `interpolate_pos_encoding=True`"
            )

        target_dtype = self.patch_embedding.weight.dtype
        # [batch, embed_dim, grid, grid] -> [batch, num_patches, embed_dim]
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings
# Adapted from transformers.models.clip.modeling_clip.CLIPAttention
class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # project to queries/keys/values, folding the head dimension into the batch dimension
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal mask first, then the padding mask
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # reshape (and re-view) so the returned attention weights keep their gradient
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


# Adapted from transformers.models.clip.modeling_clip.CLIPMLP
class IdeficsVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


# Adapted from transformers.models.clip.modeling_clip.CLIPEncoderLayer
class IdeficsVisionEncoderLayer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = IdeficsVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = IdeficsVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


# Adapted from transformers.models.clip.modeling_clip.CLIPEncoder
class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class IdeficsVisionTransformer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = IdeficsVisionEmbeddings(config)
        # the misspelling "pre_layrnorm" is kept as-is so checkpoint weight names still match
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    # Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # pool by taking the class token, then layernorm it
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )