U
    4Afy                    @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlZddlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& e'e(Z)dZ*dZ+dZ,dZ-dZ.dddgZ/dZ0dZ1ee&e$e%f Z2eG dd deZ3eG dd deZ4eG dd deZ5G dd dej6Z7G dd dej6Z8G d d! d!ej6Z9G d"d# d#ej6Z:G d$d% d%ej6Z;G d&d' d'ej6Z<G d(d) d)ej6Z=G d*d+ d+ej6Z>G d,d- d-ej6Z?G d.d/ d/ej6Z@G d0d1 d1ej6ZAd2ZBd3ZCd4ZDeDeC ZEd5ZFeFeC ZGd6eC ZHd7ZIeDeF eC eI ZJd8eF eD d9 eC ZKd:ZLG d;d< d<eZMed=eBjNdd>G d?d@ d@eMZOedAeBjNdd>G dBdC dCeMZPedDeBjNdd>G dEdF dFeMZQedGeBjNdHd>G dIdJ dJeMZRG dKdL dLej6ZSG dMdN dNej6ZTG dOdP dPej6ZUedQeBjNdRd>G dSdT dTeMZVG dUdV dVej6ZWG dWdX dXej6ZXG dYdZ dZej6ZYG d[d\ d\ej6ZZed]eBjNdHd>eL G d^d_ d_eMZ[dS )`zPyTorch FLAVA model.    N)OrderedDict)	dataclass)AnyDictListOptionalSetTupleUnion)nn   )ACT2FN)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )FlavaConfigFlavaImageCodebookConfigFlavaImageConfigFlavaMultimodalConfigFlavaTextConfigzfacebook/flava-fullzfacebook/flava-image-codebookr   r   r         g$(~k@c                   @   s   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeej ed< dZee
 ed< dZeej ed< dZee
 ed< ee d	d
dZdS )FlavaModelOutputa  
    Output from FlavaModel containing embeddings and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
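
    A minimal sketch of such a retrieval-style use (the checkpoint name, the choice of pooling the first token, and
    the cosine-similarity scoring are illustrative assumptions, not the only valid setup):

    ```python
    >>> import torch
    >>> import requests
    >>> from PIL import Image
    >>> from transformers import AutoProcessor, FlavaModel

    >>> model = FlavaModel.from_pretrained("facebook/flava-full")
    >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> text_inputs = processor(text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True)
    >>> image_inputs = processor(images=image, return_tensors="pt")

    >>> # `get_*_features` already apply `text_projection`/`image_projection`; keep only the first ([CLS]) token
    >>> text_features = model.get_text_features(**text_inputs)[:, 0]
    >>> image_features = model.get_image_features(**image_inputs)[:, 0]

    >>> text_features = torch.nn.functional.normalize(text_features, dim=-1)
    >>> image_features = torch.nn.functional.normalize(image_features, dim=-1)
    >>> similarity = image_features @ text_features.t()  # (num_images, num_texts) cosine similarities
    ```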

    Args:
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].
    Nimage_embeddingsimage_outputtext_embeddingstext_outputmultimodal_embeddingsmultimodal_outputreturnc                    s   t  fdd  D S )Nc                 3   s,   | ]$}|d kr | nt  | V  qdS ))r%   r#   r'   Ngetattrto_tuple.0kself L/tmp/pip-unpacked-wheel-zw5xktn0/transformers/models/flava/modeling_flava.py	<genexpr>a   s   z,FlavaModelOutput.to_tuple.<locals>.<genexpr>tuplekeysr0   r2   r0   r3   r,   `   s    zFlavaModelOutput.to_tuple)__name__
__module____qualname____doc__r"   r   torchFloatTensor__annotations__r#   r   r$   r%   r&   r'   r	   r   r,   r2   r2   r2   r3   r!   A   s   
r!   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeej ed< ed	d
dZdS )FlavaLossesa"  Class representing pretraining losses from FLAVA model

    Args:
        mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.:
            Masked Image Modeling loss as used in BEiT, calculated only for unimodal image data.
        mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.:
            Masked Language Modeling loss as used in BERT, calculated only for unimodal text data.
        itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.:
            Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
            masked pairs in FLAVA.
        global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.:
            Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
            data. This is calculated on unmasked images and texts.
        mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.:
            Masked Multimodal Modeling loss's image component calculated on paired image-text data.
        mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.:
            Masked Multimodal Modeling loss's text component calculated on paired image-text data.
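
    As a small illustrative sketch (an assumption on usage, not part of the class itself: `output` is taken from a
    [`FlavaForPreTraining`] forward pass with `return_loss=True`, so only the losses enabled by the provided inputs
    and non-zero weights are populated):

    ```python
    >>> losses = output.loss_info
    >>> if not losses.all_none():
    ...     for name, value in losses.items():  # only the losses that were actually computed are present
    ...         print(name, value.item())
    ```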
    Nmimmlmitmglobal_contrastive	mmm_imagemmm_textr(   c                 C   s&   d}|   D ]}|d k	rd} q"q|S )NTF)values)r1   all_nonevr2   r2   r3   rG      s    zFlavaLosses.all_none)r8   r9   r:   r;   r@   r   r<   r=   r>   rA   rB   rC   rD   rE   boolrG   r2   r2   r2   r3   r?   g   s   
r?   c                   @   s  e Zd ZU dZdZeej ed< dZ	e
ed< dZeej ed< dZee ed< dZeej ed< dZee ed< dZeej ed	< dZee ed
< dZeej ed< dZee ed< dZeej ed< dZee ed< dZeej ed< dZee ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed< ee  dddZ!dS )FlavaForPreTrainingOutputa  
    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
            Total loss calculated for this model.
        loss_info (`FlavaLosses`):
            Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
            the keys.
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].

        image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
            to create masked images.
        image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
        text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
            The output of the [`FlavaTextModel`].
        multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The output of the [`FlavaMultimodalModel`].

        mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
                The logits for MIM unimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened output is
                returned when `bool_masked_pos` has some of the patches masked.
        mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
                The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
                the tokens masked.
        itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
                The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
        mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
                The logits for MMM image multimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened
                output is returned when `bool_masked_pos` has some of the patches masked.
        mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
                The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
                some of the tokens masked.
        contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
            `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
            scores. This is calculated on unmasked images and texts.
        contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
            `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
            texts.
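
    For example, these logits can be turned into image-text matching probabilities (a sketch; it assumes `output`
    comes from a [`FlavaForPreTraining`] forward pass over paired images and texts, so the contrastive logits are
    not `None`):

    ```python
    >>> probs_per_image = output.contrastive_logits_per_image.softmax(dim=-1)  # image-to-text probabilities
    >>> probs_per_text = output.contrastive_logits_per_text.softmax(dim=-1)  # text-to-image probabilities
    ```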
    Nloss	loss_infor"   r#   r$   r%   r&   r'   image_masked_embeddingsimage_masked_outputtext_masked_embeddingstext_masked_outputmultimodal_masked_embeddingsmultimodal_masked_output
mim_logits
mlm_logits
itm_logitscontrastive_logits_per_imagecontrastive_logits_per_textmmm_image_logitsmmm_text_logitsr(   c                    s,   ddddddgt  fdd  D S )	Nr%   r#   r'   rP   rN   rR   c                 3   s,   | ]$}|kr | nt  | V  qd S Nr*   r-   r1   Ztransformer_outputsr2   r3   r4      s     z5FlavaForPreTrainingOutput.to_tuple.<locals>.<genexpr>r5   r0   r2   r[   r3   r,      s    z"FlavaForPreTrainingOutput.to_tuple)"r8   r9   r:   r;   rK   r   r<   r=   r>   rL   r?   r"   r#   r   r$   r%   r&   r'   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   r	   r   r,   r2   r2   r2   r3   rJ      s.   
@rJ   c                       sd   e Zd ZdZdeedd fddZeje	e	ejddd	Z
dejeej eejd
ddZ  ZS )FlavaImageEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    FN)configuse_mask_tokenr)   c                    s   t    |p|j}ttdd|j| _|rFttdd|jnd | _t	|j
|j|j|jd| _| jj}ttd|d |j| _t|j| _|| _d S )Nr   
image_size
patch_sizenum_channels	embed_dim)super__init__
mask_tokenr   	Parameterr<   zeroshidden_size	cls_tokenPatchEmbeddingsr`   ra   rb   patch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropoutr]   )r1   r]   r^   rm   	__class__r2   r3   re      s    

 zFlavaImageEmbeddings.__init__)
embeddingsheightwidthr)   c              	   C   st  |j d d }| jj d d }||kr4||kr4| jS | jdddf }| jddddf }|j d }|| jj }	|| jj }
|	d |
d  }	}
tjj|dtt	
|tt	
||dddd|	t	
| |
t	
| fdd	d
}t|	|j d kst|
|j d krBtdt|	t|
f d|j d |j d f d|dddddd|}tj|d|fddS )a"  
        This method interpolates the pre-trained position encodings so that the model can be used on
        higher-resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/image_transformer.py#L174
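
        A short usage sketch (the 480x480 input size and the image processor `size` argument are illustrative
        assumptions; any resolution larger than the pre-training one works the same way):

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageModel

        >>> model = FlavaImageModel.from_pretrained("facebook/flava-full")
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/flava-full")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # resize to a resolution larger than the one used during pre-training
        >>> inputs = image_processor(images=image, size={"height": 480, "width": 480}, return_tensors="pt")
        >>> outputs = model(**inputs, interpolate_pos_encoding=True)
        ```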
        r   Nr   g?r      ZbicubicF)Zscale_factormodeZalign_cornerszNumber of patches for images (z/) don't match the shape of position embedding ()dim)shapern   r]   ra   r   
functionalZinterpolateZreshapeintmathsqrtpermute
ValueErrorviewr<   cat	unsqueeze)r1   rt   ru   rv   Znpatchnum_posZclass_pos_embedZpatch_pos_embedr}   Znum_h_patchesZnum_w_patchesr2   r2   r3   interpolate_pos_encoding  s,    	
.(,z-FlavaImageEmbeddings.interpolate_pos_encoding)pixel_valuesbool_masked_posr   r)   c                 C   s   |j \}}}}| j||d}| \}}	}
|d k	r| j||	d}| dkr`||dd}|d|}|d|  ||  }| j	|dd}t
j||fdd}|r|| ||| }n
|| j }| |}|S )N)r   rw   r   r         ?r   r|   )r~   rl   sizerf   expandr}   r   r   Ztype_asrj   r<   r   r   rn   rq   )r1   r   r   r   
batch_sizerb   ru   rv   rt   seq_len_Zmask_tokensmask
cls_tokensr2   r2   r3   forward+  s     

zFlavaImageEmbeddings.forward)F)NF)r8   r9   r:   r;   r   rI   re   r<   Tensorr   r   r   
BoolTensorr   __classcell__r2   r2   rr   r3   r\      s   &  r\   c                       sV   e Zd ZdZdeeeeeef f eed fddZdej	e
ej	d
ddZ  ZS )rk   z#
    Image to Patch Embedding.
          r   r    r_   c                    s   t    t|tjjs ||f}t|tjjs6||f}|d |d  |d |d   }|| _|| _|| _t	j
||||d| _d S )Nr   r   )kernel_sizeZstride)rd   re   
isinstancecollectionsabcIterabler`   ra   rm   r   Conv2d
projection)r1   r`   ra   rb   rc   rm   rr   r2   r3   re   T  s    
 zPatchEmbeddings.__init__F)r   r   r)   c              
   C   sx   |j \}}}}|s\|| jd ks.|| jd kr\td| d| d| jd  d| jd  d	| |ddd}|S )Nr   r   zInput image size (*z) doesn't match model (z).rx   )r~   r`   r   r   flatten	transpose)r1   r   r   r   rb   ru   rv   xr2   r2   r3   r   g  s    (zPatchEmbeddings.forward)r   r   r   r    )F)r8   r9   r:   r;   r   r
   r	   re   r<   r   rI   r   r   r2   r2   rr   r3   rk   O  s       rk   c                       sF   e Zd ZdZ fddZdeej eej eej dddZ  Z	S )	FlavaTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _t|dd| _| jdt|jddd | jd	tj| j tjd
dd d S )N)padding_idxZepsposition_embedding_typeabsoluteposition_ids)r   rw   F)
persistenttoken_type_ids)dtype)rd   re   r   	Embedding
vocab_sizeri   Zpad_token_idword_embeddingsZmax_position_embeddingsrn   Ztype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsro   rp   rq   r+   r   Zregister_bufferr<   aranger   rh   r   r   longr1   r]   rr   r2   r3   re   v  s"    
    zFlavaTextEmbeddings.__init__N	input_idsr   r   c                 C   s   |  }|d }|d kr.| jd d d |f }|d krt| drl| jd d d |f }||d |}|}ntj|tj| jjd}| 	|}| 
|}	||	 }
| jdkr| |}|
|7 }
| |
}
| |
}
|
S )Nr   r   r   )r   devicer   )r   r   hasattrr   r   r<   rh   r   r   r   r   r   rn   r   rq   )r1   r   r   r   input_shape
seq_lengthZbuffered_token_type_idsZ buffered_token_type_ids_expandedZinputs_embedsr   rt   rn   r2   r2   r3   r     s&    






zFlavaTextEmbeddings.forward)NNN)
r8   r9   r:   r;   re   r   r<   r   r   r   r2   r2   rr   r3   r   s  s      r   c                	       sx   e Zd Zedd fddZejejdddZdejeej eej e	e
eejejf eej f d	d
dZ  ZS )FlavaSelfAttentionNr]   r)   c                    s   t    |j|j dkr@t|ds@td|jf d|j d|j| _t|j|j | _| j| j | _t	j
|j| j|jd| _t	j
|j| j|jd| _t	j
|j| j|jd| _t	|j| _d S )Nr   Zembedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .bias)rd   re   ri   num_attention_headsr   r   r   attention_head_sizeall_head_sizer   LinearZqkv_biasquerykeyvaluero   Zattention_probs_dropout_probrq   r   rr   r2   r3   re     s    
zFlavaSelfAttention.__init__r   r)   c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nrw   r   rx   r   r   )r   r   r   r   r   )r1   r   Znew_x_shaper2   r2   r3   transpose_for_scores  s    
z'FlavaSelfAttention.transpose_for_scoresFhidden_statesattention_mask	head_maskoutput_attentionsr)   c                 C   s   |  |}| | |}| | |}| |}t||dd}	|	t| j	 }	|d k	rh|	| }	t
jj|	dd}
| |
}
|d k	r|
| }
t|
|}|dddd }| d d | jf }|j| }|r||
fn|f}|S )Nrw   rz   r|   r   rx   r   r   )r   r   r   r   r<   matmulr   r   r   r   r   r   Zsoftmaxrq   r   
contiguousr   r   r   )r1   r   r   r   r   Zmixed_query_layerZ	key_layerZvalue_layerZquery_layerZattention_scoresZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr2   r2   r3   r     s$    



zFlavaSelfAttention.forward)NNF)r8   r9   r:   FlavaPossibleConfigsre   r<   r   r   r   rI   r
   r	   r   r   r2   r2   rr   r3   r     s      r   c                       s@   e Zd ZdZedd fddZejejejdddZ  Z	S )	FlavaSelfOutputz
    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
    models), due to the layernorm applied before each block.
    Nr   c                    s.   t    t|j|j| _t|j| _d S rZ   )	rd   re   r   r   ri   densero   rp   rq   r   rr   r2   r3   re     s    
zFlavaSelfOutput.__init__r   input_tensorr)   c                 C   s   |  |}| |}|S rZ   r   rq   r1   r   r   r2   r2   r3   r     s    

zFlavaSelfOutput.forward)
r8   r9   r:   r;   r   re   r<   r   r   r   r2   r2   rr   r3   r     s   r   c                	       sx   e Zd Zedd fddZee ddddZdej	e
ej	 e
ej	 eeeej	ej	f eej	 f d	d
dZ  ZS )FlavaAttentionNr   c                    s*   t    t|| _t|| _t | _d S rZ   )rd   re   r   	attentionr   outputsetpruned_headsr   rr   r2   r3   re     s    


zFlavaAttention.__init__)headsr)   c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r|   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r1   r   indexr2   r2   r3   prune_heads  s       zFlavaAttention.prune_headsFr   c                 C   s8   | j ||||d}| |d |}|f|dd   }|S N)r   r   r   r   r   )r   r   )r1   r   r   r   r   Zself_outputsattention_outputr   r2   r2   r3   r     s       zFlavaAttention.forward)NNF)r8   r9   r:   r   re   r   r   r   r<   r   r   rI   r
   r	   r   r   r2   r2   rr   r3   r     s      r   c                       s8   e Zd Zedd fddZejejdddZ  ZS )FlavaIntermediateNr   c                    sB   t    t|j|j| _t|jt	r6t
|j | _n|j| _d S rZ   )rd   re   r   r   ri   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnr   rr   r2   r3   re   ,  s
    
zFlavaIntermediate.__init__)r   r)   c                 C   s   |  |}| |}|S rZ   )r   r   r1   r   r2   r2   r3   r   5  s    

zFlavaIntermediate.forward	r8   r9   r:   r   re   r<   r   r   r   r2   r2   rr   r3   r   +  s   	r   c                       s<   e Zd Zedd fddZejejejdddZ  ZS )FlavaOutputNr   c                    s.   t    t|j|j| _t|j| _	d S rZ   )
rd   re   r   r   r   ri   r   ro   rp   rq   r   rr   r2   r3   re   =  s    
zFlavaOutput.__init__r   c                 C   s    |  |}| |}|| }|S rZ   r   r   r2   r2   r3   r   C  s    

zFlavaOutput.forwardr   r2   r2   rr   r3   r   <  s   r   c                	       sh   e Zd ZdZedd fddZd
ejeej eej e	e
eejejf eej f ddd	Z  ZS )
FlavaLayerz?This corresponds to the Block class in the timm implementation.Nr   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S Nr   r   )rd   re   Zchunk_size_feed_forwardZseq_len_dimr   r   r   intermediater   r   r   r   ri   r   layernorm_beforelayernorm_afterr   rr   r2   r3   re   O  s    



zFlavaLayer.__init__Fr   c           	      C   sb   | j | ||||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S r   )r   r   r   r   r   )	r1   r   r   r   r   Zself_attention_outputsr   r   Zlayer_outputr2   r2   r3   r   [  s    


zFlavaLayer.forward)NNF)r8   r9   r:   r;   r   re   r<   r   r   rI   r
   r	   r   r   r2   r2   rr   r3   r   L  s      r   c                
       sV   e Zd Zedd fddZd
ejeej eej eeee	e
ef ddd	Z  ZS )FlavaEncoderNr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r2   )r   r.   r   r]   r2   r3   
<listcomp>~  s     z)FlavaEncoder.__init__.<locals>.<listcomp>F)	rd   re   r]   r   Z
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   rr   r   r3   re   {  s    
 zFlavaEncoder.__init__FTr   r   r   r   output_hidden_statesreturn_dictr)   c                 C   s   |rdnd }|rdnd }t | jD ]t\}	}
|r8||f }|d k	rH||	 nd }| jrn| jrn| |
j||||}n|
||||}|d }|r"||d f }q"|r||f }|stdd |||fD S t|||dS )Nr2   r   r   c                 s   s   | ]}|d k	r|V  qd S rZ   r2   )r.   rH   r2   r2   r3   r4     s      z'FlavaEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	enumerater   r   ZtrainingZ_gradient_checkpointing_func__call__r6   r   )r1   r   r   r   r   r   r   Zall_hidden_statesZall_self_attentionsiZlayer_moduleZlayer_head_maskZlayer_outputsr2   r2   r3   r     s6    	

  zFlavaEncoder.forward)NNFFT)r8   r9   r:   r   re   r<   r   r   rI   r
   r6   r   r   r   r2   r2   rr   r3   r   z  s   	     
r   c                       s2   e Zd Zed fddZejdddZ  ZS )FlavaPoolerr   c                    s*   t    t|j|j| _t | _d S rZ   )rd   re   r   r   ri   r   ZTanh
activationr   rr   r2   r3   re     s    
zFlavaPooler.__init__)r   c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r  )r1   r   Zfirst_token_tensorpooled_outputr2   r2   r3   r     s    

zFlavaPooler.forwardr   r2   r2   rr   r3   r    s   r  aD  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config ([`{config}`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a;  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`FlavaImageProcessor.__call__`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        interpolate_pos_encoding (`bool`, *optional*):
            Whether to interpolate the pre-trained position encodings.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
z
    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
            The concatenated hidden states of unimodal encoders.
z
    Args:
        skip_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
a  
    Args:
        input_ids_masked (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. These are the masked version of the original sequence
            (`input_ids`), to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
            [`DataCollatorForLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)

a  
        image_attention_mask (`torch.FloatTensor` of shape `({1})`, *optional*):
            Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
            in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        skip_unmasked_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
            multimodal embeddings or outputs as of now.

        mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Labels for computing the masked language modeling and multimodal masked modeling loss (masked token prediction).
            Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
            indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
            ..., text_config.vocab_size - 1]`.

        mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
            image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
            generated automatically using the image codebook assigned to the model. By default, it uses
            [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels; a short
            sketch also follows this argument list.

        itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.

        return_loss (`bool`, *optional*, defaults to `None`):
            Whether to return calculated loss or not.
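
    A sketch of precomputing `mim_labels` with the image codebook (the checkpoint names and processor flags follow
    the other examples in this file; by default the model would generate these labels itself from
    `codebook_pixel_values`):

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, FlavaForPreTraining, FlavaImageCodebook

    >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
    >>> codebook = FlavaImageCodebook.from_pretrained("facebook/flava-image-codebook")
    >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)
    >>> inputs = processor(
    ...     images=[image],
    ...     text=["a photo of a cat"],
    ...     return_masks=True,
    ...     return_codebook_pixels=True,
    ...     padding=True,
    ...     max_length=77,
    ...     return_tensors="pt",
    ... )

    >>> mim_labels = codebook.get_codebook_indices(inputs.codebook_pixel_values)
    >>> output = model(**inputs, mim_labels=mim_labels)
    ```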
z
    Parameters:
        image_codebook ([`nn.Module`]): If passed, the image codebook will be set to this. Otherwise, it will
            be initialized using the `image_codebook_config` defined in the config.
c                   @   s<   e Zd ZdZeZdZdZee	j
e	je	jf ddddZdS )FlavaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    flavaTN)moduler)   c                 C   s   t |tjtjfr@|jjjd| jjd |j	dk	r|j	j
  nft |tjr|jjjd| jjd |jdk	r|jj|j 
  n&t |tjr|j	j
  |jjd dS )zInitialize the weightsg        )ZmeanZstdNr   )r   r   r   r   weightdataZnormal_r]   Zinitializer_ranger   Zzero_r   r   r   Zfill_)r1   r	  r2   r2   r3   _init_weightsV  s    

z"FlavaPreTrainedModel._init_weights)r8   r9   r:   r;   r   config_classbase_model_prefixsupports_gradient_checkpointingr
   r   r   r   r   r  r2   r2   r2   r3   r  L  s
   r  zeThe bare FLAVA Image Model transformer outputting raw hidden-states without any specific head on top.r   c                       s   e Zd ZeZdZdZdeed fddZe	j
ddd	Ze	j
d
ddZeeee f ddddZeedeeeededdeej eej ee eej eej ee ee ee eeef d	ddZ  Z S )FlavaImageModelzflava.image_modelr   Tr]   add_pooling_layerc                    sX   t  | || _t|| _t|| _tj|j	|j
d| _|rFt|nd | _|   d S Nr   )rd   re   r]   r\   rt   r   encoderr   r   ri   r   	layernormr  pooler	post_initr1   r]   r  rr   r2   r3   re   q  s    

zFlavaImageModel.__init__r(   c                 C   s   | j jS rZ   rt   rl   r0   r2   r2   r3   get_input_embeddings~  s    z$FlavaImageModel.get_input_embeddingsr   c                 C   s   || j _d S rZ   r  r1   r   r2   r2   r3   set_input_embeddings  s    z$FlavaImageModel.set_input_embeddingsNheads_to_pruner)   c                 C   s*   |  D ]\}}| jj| j| qdS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        Nitemsr  r   r   r   r1   r  r   r   r2   r2   r3   _prune_heads  s    zFlavaImageModel._prune_headsbatch_size, image_num_patchesZvision)
checkpointoutput_typer  ZmodalityZexpected_output	r   r   r   r   r   r   r   r   r)   c	                 C   s   |d k	r|n| j j}|d k	r |n| j j}|d k	r4|n| j j}|d krLtd| || j j}| j|||d}	| j|	|||||d}
|
d }| 	|}| j
d k	r| 
|nd }|s||f|
dd   S t|||
j|
jdS )Nz You have to specify pixel_values)r   r   r   r   r   r   r   r   r   r   Zpooler_outputr   r   )r]   r   r   use_return_dictr   get_head_maskr   rt   r  r  r  r   r   r   )r1   r   r   r   r   r   r   r   r   embedding_outputencoder_outputssequence_outputr  r2   r2   r3   r     s>      
zFlavaImageModel.forward)T)NNNNNNNN)!r8   r9   r:   r   r  r  main_input_namerI   re   r   Moduler  r  r   r   r   r$  r   FLAVA_IMAGE_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   !_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC_EXPECTED_IMAGE_OUTPUT_SHAPEr   r<   r   r   r
   r6   r   r   r2   r2   rr   r3   r  g  sD   	        
r  zdThe bare FLAVA Text Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd ZeZdZdeed fddZedddZ	e
jd	d
dZeeee f ddddZeedeeeeddeej eej eej eej eej ee ee ee eeef d	ddZ  ZS )FlavaTextModelzflava.text_modelTr  c                    sX   t  | || _t|| _t|| _tj|j	|j
d| _|rFt|nd | _|   d S r  )rd   re   r]   r   rt   r   r  r   r   ri   r   r  r  r  r  r  rr   r2   r3   re     s    

zFlavaTextModel.__init__r(   c                 C   s   | j jS rZ   rt   r   r0   r2   r2   r3   r    s    z#FlavaTextModel.get_input_embeddingsr  c                 C   s   || j _d S rZ   r8  r  r2   r2   r3   r    s    z#FlavaTextModel.set_input_embeddingsNr  c                 C   s*   |  D ]\}}| jj| j| qdS r   r!  r#  r2   r2   r3   r$    s    zFlavaTextModel._prune_headsbatch_size, text_seq_lengthr&  r'  r  )	r   r   r   r   r   r   r   r   r)   c	                 C   s  |d k	r|n| j j}|d k	r |n| j j}|d k	r4|n| j j}|d krLtd| }	|d krltj|	|jd}| 	|| j j
}| ||	|j}
| j|||d}| j||
||||d}|d }| |}| jd k	r| |nd }|s||f|dd   S t|||j|jdS )NzYou have to specify input_idsr   r   r)  r   r   r*  )r]   r   r   r+  r   r   r<   onesr   r,  r   get_extended_attention_maskrt   r  r  r  r   r   r   )r1   r   r   r   r   r   r   r   r   r   extended_attention_maskr-  r.  r/  r  r2   r2   r3   r     sN      
zFlavaTextModel.forward)T)NNNNNNNN)r8   r9   r:   r   r  r  rI   re   rk   r  r   r1  r  r   r   r   r$  r   FLAVA_TEXT_INPUTS_DOCSTRINGr3  r   r4  r    _CONFIG_CLASS_FOR_TEXT_MODEL_DOCr   r<   r   r
   r6   r   r   r2   r2   rr   r3   r7    s>           
r7  zjThe bare FLAVA Multimodal Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd ZeZdZdZded fddZee	e
e	 f ddd	d
Zeedeeeeddejeej eej ee ee ee eeef dddZ  ZS )FlavaMultimodalModelzflava.multimodal_modelr   Tr   c                    sv   t  | || _| jj| _| jr:ttdd|j| _	t
|| _tj|j|jd| _|rdt|nd | _|   d S r   )rd   re   r]   use_cls_tokenr   rg   r<   rh   ri   rj   r   r  r   r   r  r  r  r  r  rr   r2   r3   re   =  s    

zFlavaMultimodalModel.__init__Nr  c                 C   s*   |  D ]\}}| jj| j| qdS r   r!  r#  r2   r2   r3   r$  K  s    z!FlavaMultimodalModel._prune_heads,batch_size, image_num_patches + text_seq_lenr:  r   c                 C   s(  |d k	r|n| j j}|d k	r |n| j j}|d k	r4|n| j j}| \}}}	| jrz| j|dd}
tj	|
|fdd}|d7 }|d krtj
||f|jd}| || j j}| |||f|j}| j||||||d}|d }| |}| jd k	r| |nd }|s||f|dd   S t|||j|jdS )Nrw   r   r|   r;  r)  r   r*  )r]   r   r   r+  r   rB  rj   r   r<   r   r<  r   r,  r   r=  r  r  r  r   r   r   )r1   r   r   r   r   r   r   r   r   r   r   r>  r.  r/  r  r2   r2   r3   r   S  sH      
zFlavaMultimodalModel.forward)T)NNNNN)r8   r9   r:   r   r  r  r0  re   r   r   r   r$  r   !FLAVA_MULTIMODAL_INPUTS_DOCSTRINGr3  r   r4  r   &_CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOCr<   r   r   rI   r
   r6   r   r   r2   r2   rr   r3   rA  3  s6        
rA  z_The bare FLAVA Model transformer outputting raw hidden-states without any specific head on top.r   c                       sN  e Zd ZeZed fddZeedde	e
j e	e
j e	e
j e	e
j e	e e	e e	e e
jdddZeed	de	e
j e	e
j e	e e	e
j e	e
j e	e e	e e	e e
jd
	ddZeedeeedde	e
j e	e
j e	e
j e	e
j e	e
j e	e
j e	e
j e	e e	e ee	e eeef dddZ  ZS )
FlavaModelr   c                    s0  t  | t|jts.tdt|j dt|jtsPtdt|j dt|j	t
svtddt|j	 d |j}|j}|j	}|j| _|j| _|j| _|j| _t|| _t|| _t|| _t| j| j| _t| j| j| _tt| jj| _t| j| j| _ t| j| j| _!| "  d S )NzLconfig.text_config is expected to be of type FlavaTextConfig but is of type r   zNconfig.image_config is expected to be of type FlavaImageConfig but is of type zMconfig.multimodal_config is expected to be of type FlavaMultimodalConfig but zis of type )#rd   re   r   text_configr   	TypeErrortypeimage_configr   multimodal_configr   Zprojection_dimri   Ztext_hidden_sizeZimage_hidden_sizeZmm_hidden_sizer7  
text_modelr  image_modelrA  multimodal_modelr   r   image_projectiontext_projectionrg   r<   Ztensorr]   Zlogit_scale_init_valuelogit_scaleimage_to_mm_projectiontext_to_mm_projectionr  )r1   r]   rG  rJ  rK  rr   r2   r3   re     s>    


zFlavaModel.__init__r9  N)r   r   r   r   r   r   r   r)   c              	   C   s8   d t | j|||||||d}|d }	| |	}
|
S )Na  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`FlavaTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```)r   r   r   r   r   r   r   r   )r3  r4  rL  rP  )r1   r   r   r   r   r   r   r   Ztext_outputsr  Ztext_featuresr2   r2   r3   get_text_features  s    

zFlavaModel.get_text_featuresr%  r(  c	              
   C   s:   d t | j||||||||d}	|	d }
| |
}|S )Na  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`FlavaImageModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```)r   r   r   r   r   r   r   r   r   )r3  r4  rM  rO  )r1   r   r   r   r   r   r   r   r   Zimage_outputsr  Zimage_featuresr2   r2   r3   get_image_features  s     
zFlavaModel.get_image_featuresrC  r'  r  T)r   r   r   r   r   r   image_attention_maskskip_multimodal_encoderr   r   r   r)   c              	   C   s  |dk	r|n| j j}|
s tdd}d}d}d}|dk	rn| j||||	|
|d}|d |d  }}| |d }d}d}d}d}|dk	r| j|||||	|
|d}|d |d  }}| |d }d}d}|dk	r\|dk	r\|s\|dk	r.|j\}}}| jj	r|d7 }t
j|||jd	}t
j||gdd
}nd}t
j||gdd
}| j|||d}|d }|sr||||||fS t||||||dS )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)

        >>> image_embeddings = outputs.image_embeddings
        >>> text_embeddings = outputs.text_embeddings
        >>> multimodal_embeddings = outputs.multimodal_embeddings

        >>> outputs.image_embeddings.shape
        torch.Size([1, 197, 768])

        >>> text_embeddings.shape
        torch.Size([1, 7, 768])

        >>> multimodal_embeddings.shape
        torch.Size([1, 205, 768])
        ```
        NzRFLAVA model requires hidden states to work. Please set `output_hidden_states=True`)r   r   r   r   r   r   r   rx   rw   )r   r   r   r   r   r   r   r   r;  r|   )r   r   )r"   r#   r$   r%   r&   r'   )r]   r   r   rM  rR  rL  rS  r~   rN  rB  r<   r<  r   r   r!   )r1   r   r   r   r   r   r   rW  rX  r   r   r   r"   Zimage_statesZimage_mm_projectionr#   r$   Ztext_statesZtext_mm_projectionr%   r&   r'   r   r   r   Zattention_mask_imageZattention_multimodalZmultimodal_inputr2   r2   r3   r   %  s    5


  	zFlavaModel.forward)NNNNNNN)NNNNNNNN)NNNNNNNNNTN)r8   r9   r:   r   r  re   r   r?  r3  r   r<   r   rI   r=   rT  r2  r   rU  FLAVA_MODEL_INPUTS_DOCSTRINGr   r!   
LongTensorr
   r	   r   r   r   r2   r2   rr   r3   rF    s   +       +        1
           
rF  c                       s8   e Zd Zeed fddZejejdddZ  ZS )FlavaImageCodebookResPath)in_sizeout_sizec                    s   t    |d }t }t |d< tj||ddd|d< t |d< tj||ddd|d< t |d	< tj||ddd|d
< t |d< tj||ddd|d< t|| _d S )N   Zrelu_1r   r   r   paddingZconv_1Zrelu_2Zconv_2Zrelu_3Zconv_3Zrelu_4r   Zconv_4)rd   re   r   r   ReLUr   
Sequentialpath)r1   r\  r]  kwargsZhid_sizerc  rr   r2   r3   re     s    
z"FlavaImageCodebookResPath.__init__r   c                 C   s
   |  |S rZ   )rc  r1   r   r2   r2   r3   r     s    z!FlavaImageCodebookResPath.forward	r8   r9   r:   r   re   r<   r   r   r   r2   r2   rr   r3   r[    s   r[  c                       s:   e Zd Zeeed fddZejejdddZ  ZS )FlavaImageCodebookBlock)r\  r]  
num_layersc                    sP   t    d|d  | _||kr6tj||ddd| _n
t | _t||| _d S )Nr   rx   r   r_  )	rd   re   	post_gainr   r   id_pathZIdentityr[  res_path)r1   r\  r]  rh  rd  rr   r2   r3   re     s    

z FlavaImageCodebookBlock.__init__r   c                 C   s   |  || j| |  S rZ   )rj  ri  rk  re  r2   r2   r3   r     s    zFlavaImageCodebookBlock.forwardrf  r2   r2   rr   r3   rg    s   rg  c                       s@   e Zd Zdeeeeed fddZejejdddZ  Z	S )	FlavaImageCodebookLayerGroupT)
num_blocksrh  r\  r]  use_poolc                    s   t    t }t|D ]B}|dkr@t||||d|d  < qt||||d|d  < q|rptjdd|d< t|| _d S )Nr   Zblock_r   rx   )r   pool)	rd   re   r   r   rg  r   Z	MaxPool2drb  group)r1   rm  rh  r\  r]  rn  blocksr  rr   r2   r3   re     s    
z%FlavaImageCodebookLayerGroup.__init__r   c                 C   s
   |  |S rZ   )rp  re  r2   r2   r3   r     s    z$FlavaImageCodebookLayerGroup.forward)T)
r8   r9   r:   r   rI   re   r<   r   r   r   r2   r2   rr   r3   rl    s   rl  a"  
    FLAVA's image codebook model, inspired by DALL-E's original encoder. Outputs raw hidden states and can be used
    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
    `get_codebook_indices` to get image tokens for an image.
    r   c                       sp   e Zd ZdZeZdZdZeed fddZ	e
je
jddd	Ze
je
jdd
dZe
je
jdddZ  ZS )FlavaImageCodebook r   F)r]   rd  c                    sd  t  | || _|j| _|j| _|j| _|j| _|j| _| j| j }t }t	
 |d< t	jd| j | jddd|d< t }t	j| jd| j ddd|d	< t| j|d| j d| j |d
< t| j|d| j d| j |d< t| j|d| j d| j |d< t| j|d| j d| j dd|d< t	||d< t	|| _|   | jjr`|  D ]}d|_qRd S )NZrelu   r   r   r_  conv   r   inputZgroup_1rx   Zgroup_2r^  Zgroup_3F)rn  Zgroup_4r   )rd   re   r]   Z
num_groupsinput_channelsZnum_blocks_per_groupri   r   r   r   ra  r   rl  rb  rq  r  freeze
parametersZrequires_grad)r1   r]   rd  rh  Zoutput_blocksrq  paramrr   r2   r3   re     sX                 

zFlavaImageCodebook.__init__)r   r)   c                 C   s"   d t | |}tj|ddS )Na  
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("{0}")
        >>> image_processor = AutoImageProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model.get_codebook_indices(**inputs)
        ```
        r   )Zaxis)r3  _CHECKPOINT_FOR_CODEBOOK_DOCrq  r<   Zargmaxr1   r   Zz_logitsr2   r2   r3   get_codebook_indices  s
    
z'FlavaImageCodebook.get_codebook_indicesc                 C   s   |  |}tjdd|S )Nr   r|   )rq  r   ZSoftmaxr}  r2   r2   r3   get_codebook_probs6  s    
z%FlavaImageCodebook.get_codebook_probsc                 C   s`   d t t|jdkr*td|j d|jd | jkrVtd|jd  d| j | |S )Na  
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("{0}")
        >>> image_processor = AutoImageProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model(**inputs)
        >>> print(outputs.shape)
        (1, 196)
        ```
        r^  zinput shape z
 is not 4dr   z
input has z channels but model built for )r3  r|  r   r~   r   rx  rq  )r1   r   r2   r2   r3   r   :  s    zFlavaImageCodebook.forward)r8   r9   r:   r  r   r  r0  r  r   re   r<   r   r~  r  r=   r   r   r2   r2   rr   r3   rr    s   	,rr  c                       s$   e Zd Z fddZdd Z  ZS )FlavaPredictionHeadTransformc                    sV   t    t|j|j| _t|jtr6t	|j | _
n|j| _
tj|j|jd| _d S r  )rd   re   r   r   ri   r   r   r   r   r   transform_act_fnr   r   r   rr   r2   r3   re   ^  s    
z%FlavaPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S rZ   )r   r  r   r   r2   r2   r3   r   g  s    


z$FlavaPredictionHeadTransform.forwardr8   r9   r:   re   r   r   r2   r2   rr   r3   r  ]  s   	r  c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	FlavaMaskedPredictionHeadNc                    sb   t    || _t|| _tj|j|jdd| _	t
t|j| _|d k	rT|| j	_| j| j	_d S )NFr   )rd   re   r]   r  	transformr   r   ri   r   decoderrg   r<   rh   r   r
  )r1   r]   r
  rr   r2   r3   re   o  s    

z"FlavaMaskedPredictionHead.__init__c                 C   s   | j | j_ d S rZ   )r   r  r0   r2   r2   r3   _tie_weights{  s    z&FlavaMaskedPredictionHead._tie_weightsc                 C   s   |  |}| |}|S rZ   )r  r  re  r2   r2   r3   r   ~  s    

z!FlavaMaskedPredictionHead.forward)N)r8   r9   r:   re   r  r   r   r2   r2   rr   r3   r  n  s   r  c                       s$   e Zd Z fddZdd Z  ZS )FlavaITMHeadc                    s.   t    || _t|| _t|jd| _d S )Nrx   )	rd   re   r]   r  r  r   r   ri   seq_relationshipr   rr   r2   r3   re     s    

zFlavaITMHead.__init__c                 C   s   |  |}| |}|S rZ   )r  r  re  r2   r2   r3   r     s    

zFlavaITMHead.forwardr  r2   r2   rr   r3   r    s   r  c                       s$   e Zd Z fddZdd Z  ZS )FlavaGlobalContrastiveHeadc                    s   t    || _|j| _d S rZ   )rd   re   r]   global_backprop_contrastiver   rr   r2   r3   re     s    
z#FlavaGlobalContrastiveHead.__init__c                    s2  t |}t j rt j sBt j d jd} g}g}n d}t j }	| j	r~t jj
j }t jj
j}nHfddt|	D } fddt|	D }t j|  t j| |t j  t j| jd }t |}t |}t  |dd| }
t |dd| }|
||fS )Nr   r;  c                    s   g | ]}t  qS r2   r<   Z
zeros_liker   )r$   r2   r3   r     s     z6FlavaGlobalContrastiveHead.forward.<locals>.<listcomp>c                    s   g | ]}t  qS r2   r  r   )r"   r2   r3   r     s     r   )r<   expZdistributedZis_availableZis_initializedr   r   r   Zget_world_sizer  r   r   Z
all_gatherr   Zget_rankr   r   r   )r1   r"   r$   rQ  ZtemperaturelabelsZimage_embeddings_allZtext_embeddings_allZlocal_batch_sizeZ
world_sizelogits_per_imagelogits_per_textr2   )r"   r$   r3   r     s.    


 

z"FlavaGlobalContrastiveHead.forwardr  r2   r2   rr   r3   r    s   r  zk
@add_start_docstrings(
    """
    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
    """,
    FLAVA_START_DOCSTRING.format(config="FlavaConfig") + FLAVA_PRETRAINING_START_DOCSTRING_EXTRA,
)
class FlavaForPreTraining(FlavaPreTrainedModel):
    # These are tied to the corresponding head biases.
    _tied_weights_keys = [
        "mmm_text_head.decoder.bias",
        "mmm_image_head.decoder.bias",
        "mlm_head.decoder.bias",
        "mim_head.decoder.bias",
    ]

    def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None):
        super().__init__(config)
        self.flava = FlavaModel(config)

        self.image_codebook = image_codebook
        if self.image_codebook is None and config.init_codebook:
            self.image_codebook = FlavaImageCodebook(config.image_codebook_config)

        # Use the image and text encoder configs for the masked heads so that each head gets the
        # right vocabulary size.
        self.mim_head = FlavaMaskedPredictionHead(config.image_config)
        self.mlm_head = FlavaMaskedPredictionHead(config.text_config)
        self.itm_head = FlavaITMHead(config)
        self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config)
        self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config)
        self.global_contrastive_head = FlavaGlobalContrastiveHead(config)

        self.image_vocab_size = config.image_config.vocab_size
        self.text_vocab_size = config.text_config.vocab_size
        self.mlm_weight = config.mlm_weight
        self.mim_weight = config.mim_weight
        self.global_contrastive_weight = config.global_contrastive_weight
        self.ce_ignore_index = config.ce_ignore_index
        self.itm_weight = config.itm_weight
        self.mmm_image_weight = config.mmm_image_weight
        self.mmm_text_weight = config.mmm_text_weight
        self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder

        self.post_init()

    def _resize_to_2d(self, x: torch.Tensor):
        if x.dim() > 2:
            x = x.view(x.size(0), -1)
        return x

    @add_start_docstrings_to_model_forward(FLAVA_PRETRAINING_INPUTS_DOCSTRING.format("batch_size, text_seq_len"))
    @replace_return_docstrings(output_type=FlavaForPreTrainingOutput, config_class=FlavaConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_ids_masked: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        codebook_pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        skip_unmasked_multimodal_encoder: Optional[bool] = None,
        mlm_labels: Optional[torch.Tensor] = None,
        mim_labels: Optional[torch.Tensor] = None,
        itm_labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: bool = True,
        return_dict: Optional[bool] = None,
        return_loss: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], FlavaForPreTrainingOutput]:
        r"""
        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import FlavaForPreTraining, AutoProcessor

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> text = ["a photo of a cat"]

        >>> inputs = processor(
        ...     images=[image],
        ...     text=text,
        ...     return_masks=True,
        ...     return_codebook_pixels=True,
        ...     padding=True,
        ...     max_length=77,
        ...     return_tensors="pt",
        ... )


        >>> output = model(**inputs)
        ```

        Return:

        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        return_loss = return_loss if return_loss is not None else self.config.return_loss
        skip_unmasked_multimodal_encoder = (
            skip_unmasked_multimodal_encoder
            if skip_unmasked_multimodal_encoder is not None
            else self.skip_unmasked_multimodal_encoder
        )

        if input_ids_masked is None and input_ids is not None:
            logger.warning(
                "`input_ids_masked` isn't passed which means MLM loss won't be calculated correctly. Setting it to"
                " `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if"
                " you are doing inference on unmasked text..."
            )
            input_ids_masked = input_ids

        # The unmasked inputs provide the embeddings used for the global contrastive objective (the
        # unmasked multimodal encoder can be skipped since its output is not needed), while a second
        # pass over the masked inputs provides the embeddings consumed by the MIM, MLM, ITM and MMM heads.
        flava_output = self.flava(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            image_attention_mask=image_attention_mask,
            skip_multimodal_encoder=skip_unmasked_multimodal_encoder,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        flava_masked_output = self.flava(
            input_ids=input_ids_masked,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            image_attention_mask=image_attention_mask,
            bool_masked_pos=bool_masked_pos,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        image_embeddings = flava_output.image_embeddings
        text_embeddings = flava_output.text_embeddings
        image_masked_embeddings = flava_masked_output.image_embeddings
        text_masked_embeddings = flava_masked_output.text_embeddings
        multimodal_masked_embeddings = flava_masked_output.multimodal_embeddings

        total_loss = mim_loss = mlm_loss = mmm_text_loss = mmm_image_loss = gc_loss = itm_loss = None
        mim_logits = mlm_logits = mmm_text_logits = mmm_image_logits = None
        itm_logits = logits_per_image = logits_per_text = None

        # When a loss is requested and no `mim_labels` are given, they are generated with the image
        # codebook from `codebook_pixel_values`.
        if (image_masked_embeddings is not None or multimodal_masked_embeddings is not None) and (
            mim_labels is None and return_loss
        ):
            if self.image_codebook is None:
                raise RuntimeError(
                    "`return_loss` is set to True but the image codebook is not initialized and no `mim_labels` "
                    "have been passed. Reinstantiate the model with `init_codebook` set to True or "
                    "pass in your custom `mim_labels`"
                )
            if codebook_pixel_values is None:
                raise ValueError(
                    "`codebook_pixel_values` are required to generate `mim_labels` if loss is expected. "
                    "Call `AutoProcessor` with `return_codebook_pixels` set to True"
                )
            mim_labels = self.image_codebook.get_codebook_indices(codebook_pixel_values)

        # Each objective is only evaluated when its weight is positive and its inputs are available.
        # Labels are reshaped with `_resize_to_2d`, positions that were not masked are set to
        # `self.ce_ignore_index`, and the remaining positions are scored with the matching head and a
        # cross-entropy loss scaled by the objective's weight:
        #   - MIM: `self.mim_head` on the masked image embeddings against `mim_labels`
        #     (cross-entropy over `self.image_vocab_size` classes), weighted by `self.mim_weight`.
        #   - MLM: `self.mlm_head` on the masked text embeddings against `mlm_labels`
        #     (cross-entropy over `self.text_vocab_size` classes), weighted by `self.mlm_weight`.
        #   - ITM: `self.itm_head` on the masked multimodal embeddings against `itm_labels`; the
        #     resulting positive-pair mask also filters the MMM and contrastive targets.
        #   - MMM image / MMM text: `self.mmm_image_head` and `self.mmm_text_head` on the image and
        #     text slices of the masked multimodal embeddings, weighted by `self.mmm_image_weight`
        #     and `self.mmm_text_weight`.
        #   - Global contrastive: the unmasked image and text embeddings are passed through
        #     `self.flava.image_projection` / `self.flava.text_projection`, normalized, the logit
        #     scale is clamped to [LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX], and
        #     `self.global_contrastive_head` returns `logits_per_image`, `logits_per_text` and the
        #     matching labels, whose symmetric cross-entropy is weighted by
        #     `self.global_contrastive_weight`.

        flava_losses = FlavaLosses(
            mim=mim_loss,
            mlm=mlm_loss,
            itm=itm_loss,
            global_contrastive=gc_loss,
            mmm_image=mmm_image_loss,
            mmm_text=mmm_text_loss,
        )
        if return_loss and not flava_losses.all_none():
            total_loss = sum(value if value is not None else 0 for value in flava_losses.values())

        if not return_dict:
            output = (
                image_embeddings,
                flava_output.image_output.to_tuple() if flava_output.image_output is not None else None,
                text_embeddings,
                flava_output.text_output.to_tuple() if flava_output.text_output is not None else None,
                flava_output.multimodal_embeddings,
                flava_output.multimodal_output.to_tuple() if flava_output.multimodal_output is not None else None,
                image_masked_embeddings,
                flava_masked_output.image_output.to_tuple() if flava_masked_output.image_output is not None else None,
                text_masked_embeddings,
                flava_masked_output.text_output.to_tuple() if flava_masked_output.text_output is not None else None,
                multimodal_masked_embeddings,
                flava_masked_output.multimodal_output.to_tuple()
                if flava_masked_output.multimodal_output is not None
                else None,
                mim_logits,
                mlm_logits,
                itm_logits,
                logits_per_image,
                logits_per_text,
                mmm_image_logits,
                mmm_text_logits,
            )
            if return_loss and not flava_losses.all_none():
                output = (total_loss, flava_losses) + output
            # Drop `None` entries so the tuple only carries values that were actually computed.
            return tuple(x for x in output if x is not None)

        return FlavaForPreTrainingOutput(
            loss=total_loss,
            loss_info=flava_losses,
            image_embeddings=image_embeddings,
            image_output=flava_output.image_output,
            text_embeddings=text_embeddings,
            text_output=flava_output.text_output,
            multimodal_embeddings=flava_output.multimodal_embeddings,
            multimodal_output=flava_output.multimodal_output,
            image_masked_embeddings=image_masked_embeddings,
            image_masked_output=flava_masked_output.image_output,
            text_masked_embeddings=text_masked_embeddings,
            text_masked_output=flava_masked_output.text_output,
            multimodal_masked_embeddings=multimodal_masked_embeddings,
            multimodal_masked_output=flava_masked_output.multimodal_output,
            mim_logits=mim_logits,
            mlm_logits=mlm_logits,
            itm_logits=itm_logits,
            contrastive_logits_per_image=logits_per_image,
            contrastive_logits_per_text=logits_per_text,
            mmm_image_logits=mmm_image_logits,
            mmm_text_logits=mmm_text_logits,
        )