"""PyTorch CLIPSeg model."""

import copy
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "CIDAS/clipseg-rd64-refined"


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    # standard InfoNCE-style loss: each row should match the entry with the same index
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
    # symmetric contrastive loss over the text->image and image->text directions
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
class CLIPSegOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class CLIPSegDecoderOutput(ModelOutput):
    """
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class CLIPSegImageSegmentationOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    conditional_embeddings: torch.FloatTensor = None
    pooled_output: torch.FloatTensor = None
    vision_model_output: BaseModelOutputWithPooling = None
    decoder_output: CLIPSegDecoderOutput = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["vision_model_output", "decoder_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class CLIPSegVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_position_embeddings(self, new_size):
        if len(new_size) != 2:
            raise ValueError("new_size should consist of 2 values")

        num_patches_one_direction = int(self.num_patches**0.5)
        # interpolate the (non-CLS) position embeddings on a 2D grid so other image resolutions can be handled
        a = self.position_embedding.weight[1:].T.view(
            1, self.config.hidden_size, num_patches_one_direction, num_patches_one_direction
        )
        b = (
            nn.functional.interpolate(a, new_size, mode="bicubic", align_corners=False)
            .squeeze(0)
            .view(self.config.hidden_size, new_size[0] * new_size[1])
            .T
        )
        result = torch.cat([self.position_embedding.weight[:1], b])

        return result

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        if embeddings.shape[1] != self.num_positions:
            new_shape = int(math.sqrt(embeddings.shape[1] - 1))
            embeddings = embeddings + self.interpolate_position_embeddings((new_shape, new_shape))
            embeddings = embeddings.to(embeddings.dtype)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


class CLIPSegTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class CLIPSegAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # project the inputs to queries, keys and values
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # reshape so the returned attention weights keep their gradient, then reuse the flattened view below
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class CLIPSegMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPSegEncoderLayer(nn.Module):
    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class CLIPSegPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CLIPSegConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPSegTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPSegVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPSegAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPSegMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPSegModel):
            nn.init.normal_(
                module.text_projection.weight, std=module.text_embed_dim**-0.5 * self.config.initializer_factor
            )
            nn.init.normal_(
                module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * self.config.initializer_factor
            )

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


CLIPSEG_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CLIPSegConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CLIPSEG_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CLIPSEG_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CLIPSEG_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class CLIPSegEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states, attention_mask, causal_attention_mask, output_attentions=output_attentions
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class CLIPSegTextTransformer(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPSegTextEmbeddings(config)
        self.encoder = CLIPSegEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # used for the `pooled_output` computation below
        self.eos_token_id = config.eos_token_id

    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # the text model uses a causal mask; prepare it here
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len]
        if attention_mask is not None:
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # legacy behaviour: take features from the position of the highest token id (the EOT token)
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # take features from the first occurrence of the configured `eos_token_id`
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class CLIPSegTextModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegTextConfig

    _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"]

    def __init__(self, config: CLIPSegTextConfig):
        super().__init__(config)
        self.text_model = CLIPSegTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class CLIPSegVisionTransformer(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPSegVisionEmbeddings(config)
        # attribute name (including the typo) matches the released checkpoints
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPSegEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class CLIPSegVisionModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPSegVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@add_start_docstrings(CLIPSEG_START_DOCSTRING)
class CLIPSegModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        if not isinstance(config.text_config, CLIPSegTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type CLIPSegTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPSegVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPSegTextTransformer(text_config)
        self.vision_model = CLIPSegVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use the CLIPSeg model's config for these fields (if specified) instead of those of the text component.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use the CLIPSeg model's config for these fields (if specified) instead of those of the vision component.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPSegOutput, config_class=CLIPSegConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPSegOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use the CLIPSeg model's config for these fields (if specified) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clipseg_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return CLIPSegOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


class CLIPSegDecoderLayer(nn.Module):
    """
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )

        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm1(hidden_states)

        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm2(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class CLIPSegDecoder(CLIPSegPreTrainedModel):
    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.conditional_layer = config.conditional_layer

        # FiLM conditioning: scale and shift computed from the conditional (prompt) embedding
        self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim)
        self.film_add = nn.Linear(config.projection_dim, config.reduce_dim)

        if config.use_complex_transposed_convolution:
            transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4)

            self.transposed_convolution = nn.Sequential(
                nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim,
                    config.reduce_dim // 2,
                    kernel_size=transposed_kernels[0],
                    stride=transposed_kernels[0],
                ),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1]
                ),
            )
        else:
            self.transposed_convolution = nn.ConvTranspose2d(
                config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size
            )

        depth = len(config.extract_layers)
        self.reduces = nn.ModuleList(
            [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)]
        )

        decoder_config = copy.deepcopy(config.vision_config)
        decoder_config.hidden_size = config.reduce_dim
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        decoder_config.hidden_act = "relu"
        self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))])

    def forward(
        self,
        hidden_states: Tuple[torch.Tensor],
        conditional_embeddings: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # the extracted CLIP activations are consumed from the deepest layer to the shallowest one
        activations = hidden_states[::-1]

        output = None
        for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)):
            if output is not None:
                output = reduce(activation) + output
            else:
                output = reduce(activation)

            if i == self.conditional_layer:
                output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add(
                    conditional_embeddings
                )
                output = output.permute(1, 0, 2)

            layer_outputs = layer(
                output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions
            )

            output = layer_outputs[0]

            if output_hidden_states:
                all_hidden_states += (output,)

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # drop the CLS token and reshape the patch tokens back to a 2D feature map
        output = output[:, 1:, :].permute(0, 2, 1)

        size = int(math.sqrt(output.shape[2]))

        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)

        logits = self.transposed_convolution(output).squeeze(1)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)

        return CLIPSegDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )


@add_start_docstrings(
    """
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    """,
    CLIPSEG_START_DOCSTRING,
)
class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.config = config

        self.clip = CLIPSegModel(config)
        self.extract_layers = config.extract_layers

        self.decoder = CLIPSegDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_conditional_embeddings(
        self,
        batch_size: int = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        conditional_pixel_values: Optional[torch.Tensor] = None,
    ):
        if input_ids is not None:
            # compute conditional embeddings from text prompts
            if len(input_ids) != batch_size:
                raise ValueError("Make sure to pass as many prompt texts as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_text_features(
                    input_ids, attention_mask=attention_mask, position_ids=position_ids
                )
        elif conditional_pixel_values is not None:
            # compute conditional embeddings from prompt images
            if len(conditional_pixel_values) != batch_size:
                raise ValueError("Make sure to pass as many prompt images as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_image_features(conditional_pixel_values)
        else:
            raise ValueError(
                "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`"
            )

        return conditional_embeddings

    @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CLIPSegImageSegmentationOutput, config_class=CLIPSegTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        conditional_pixel_values: Optional[torch.FloatTensor] = None,
        conditional_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPSegImageSegmentationOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the query images through the frozen CLIP vision encoder
        with torch.no_grad():
            vision_outputs = self.clip.vision_model(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=True,  # the decoder needs the intermediate hidden states
                return_dict=return_dict,
            )
            pooled_output = self.clip.visual_projection(vision_outputs[1])

            hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2]
            # add +1 because the hidden states also include the initial embeddings
            activations = [hidden_states[i + 1] for i in self.extract_layers]

            # update vision_outputs so that hidden states are only returned if the caller asked for them
            if return_dict:
                vision_outputs = BaseModelOutputWithPooling(
                    last_hidden_state=vision_outputs.last_hidden_state,
                    pooler_output=vision_outputs.pooler_output,
                    hidden_states=vision_outputs.hidden_states if output_hidden_states else None,
                    attentions=vision_outputs.attentions,
                )
            else:
                vision_outputs = (
                    vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs
                )

        # step 2: compute conditional embeddings, either from texts, prompt images or a user-provided embedding
        if conditional_embeddings is None:
            conditional_embeddings = self.get_conditional_embeddings(
                batch_size=pixel_values.shape[0],
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                conditional_pixel_values=conditional_pixel_values,
            )
        else:
            if conditional_embeddings.shape[0] != pixel_values.shape[0]:
                raise ValueError(
                    "Make sure to pass as many conditional embeddings as there are query images in the batch"
                )
            if conditional_embeddings.shape[1] != self.config.projection_dim:
                raise ValueError(
                    "Make sure that the feature dimension of the conditional embeddings matches"
                    " `config.projection_dim`."
                )

        # step 3: forward the extracted activations through the lightweight decoder to predict masks
        decoder_outputs = self.decoder(
            activations,
            conditional_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = decoder_outputs.logits if return_dict else decoder_outputs[0]

        loss = None
        if labels is not None:
            # move labels to the same device as the logits to enable model parallelism
            labels = labels.to(logits.device)
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels)

        if not return_dict:
            output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs)
            return ((loss,) + output) if loss is not None else output

        return CLIPSegImageSegmentationOutput(
            loss=loss,
            logits=logits,
            conditional_embeddings=conditional_embeddings,
            pooled_output=pooled_output,
            vision_model_output=vision_outputs,
            decoder_output=decoder_outputs,
        )