"""

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

"""

from typing import Optional, Tuple

import tensorflow as tf

from ...modeling_tf_utils import shape_list
from .configuration_idefics import IdeficsConfig


class TFIdeficsPerceiverResampler(tf.keras.layers.Layer):
    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int, **kwargs
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape `[bsz, n_latents, embed_dim]`.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`):
                The size of each embedding vector (also the dimensionality of the latent embeddings *returned* by
                the Perceiver Resampler). Could be e.g. ViT embed_dim, ResNet pool dim, and so on.
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        """
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # Create the stack of (cross-attention, MLP) blocks; the MLP width follows the vision encoder when available
        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
        self.blocks = []
        for i in range(depth):
            self.blocks.append(
                [
                    TFIdeficsPerceiverAttention(
                        self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms, name=f"blocks.{i}.0"
                    ),
                    TFIdeficsMLP(self.intermediate_dim, config, name=f"blocks.{i}.1"),
                ]
            )

        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    def build(self, input_shape):
        # Learned latent queries, broadcast across the batch in `call`
        self.latents = self.add_weight(
            shape=(self.n_latents, self.embed_dim), initializer="random_normal", trainable=True, name="latents"
        )
        super().build(input_shape)

    def call(self, context: tf.Tensor) -> tf.Tensor:
        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
        # Expand the latents to [bsz, n_latents, embed_dim] so they match the batched context
        latents = tf.expand_dims(self.latents, axis=0)
        latents = tf.tile(latents, [tf.shape(context)[0], 1, 1])
        # Feed through the Perceiver Attention blocks, with residual connections around both sub-layers
        for attn, ff in self.blocks:
            latents = attn(context, latents) + latents
            latents = ff(latents) + latents
        return self.layer_norm(latents)
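
# Usage sketch (illustrative only -- the shapes and hyper-parameters below are assumptions
# for the example, not Idefics defaults; it also assumes config.vision_config.embed_dim
# matches embed_dim so the MLP residual adds line up):
#
#     resampler = TFIdeficsPerceiverResampler(
#         config, embed_dim=768, depth=2, n_heads=12, head_dim=64, n_latents=64
#     )
#     context = tf.random.normal((2, 196, 768))  # e.g. a batch of ViT patch embeddings
#     latents = resampler(context)               # -> [2, 64, 768], fixed length regardless of seq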


class TFIdeficsPerceiverAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool, **kwargs) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms
        # Normalization & scaling
        self.context_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="context_layer_norm")
        self.latents_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="latents_layer_norm")
        if self.qk_layer_norms:
            self.q_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="q_layer_norm")
            self.k_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="k_layer_norm")

        self.qk_scale = self.head_dim**-0.5

        # Q, K, V projections (no bias -- a detail from the Perceiver/Flamingo papers)
        self.q_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="q_proj")
        self.k_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="k_proj")
        self.v_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="v_proj")

        self.output_proj = tf.keras.layers.Dense(embed_dim, use_bias=False, name="output_proj")

    def call(self, context: tf.Tensor, latents: tf.Tensor) -> tf.Tensor:
        """
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`tf.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`tf.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `tf.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = shape_list(context)

        # Query, Key, Value projections --> note that in Flamingo, latents are *concatenated* with context prior to attn!
        #   Queries come from the latents only (`seq = n_latents`); keys/values span `seq = seq_length + n_latents`
        q = self.q_proj(latents)
        k = self.k_proj(tf.concat([context, latents], axis=-2))
        v = self.v_proj(tf.concat([context, latents], axis=-2))

        # Split out the heads: [bsz, seq, n_heads * head_dim] -> [bsz, n_heads, seq, head_dim]
        q, k, v = [
            tf.transpose(tf.reshape(x, (batch_size, x.shape[1], self.n_heads, self.head_dim)), perm=[0, 2, 1, 3])
            for x in (q, k, v)
        ]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # Multiheaded attention w/ stable softmax (subtract the per-row max before the softmax call)
        scores = tf.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - tf.reduce_max(scores, axis=-1, keepdims=True)
        attn = tf.nn.softmax(stabilized_scores, axis=-1)

        # Attend to the values, merge the heads back together, and project to `embed_dim`
        resampled = tf.einsum("... i j, ... j d -> ... i d", attn, v)
        return self.output_proj(
            tf.reshape(tf.transpose(resampled, perm=[0, 2, 1, 3]), (batch_size, -1, self.n_heads * self.head_dim))
        )
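
# Worked shape example for `call` (illustrative numbers, assuming embed_dim=768, n_heads=12,
# head_dim=64, so n_heads * head_dim == embed_dim):
#   context: [2, 196, 768], latents: [2, 64, 768]
#   q: [2, 12, 64, 64]; k, v: [2, 12, 260, 64]      (260 = 196 context + 64 latents)
#   scores / attn: [2, 12, 64, 260]; resampled: [2, 12, 64, 64] -> output: [2, 64, 768]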


class TFIdeficsMLP(tf.keras.layers.Layer):
    def __init__(self, intermediate_size, config: IdeficsConfig, **kwargs):
        """Simple MLP block with intermediate_size and embedding size"""
        super().__init__(**kwargs)
        self.embed_dim = config.vision_config.embed_dim
        self.ln = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="ln")
        self.fc = tf.keras.layers.Dense(intermediate_size, use_bias=False, name="fc")
        self.act = tf.keras.layers.ReLU(name="act")
        self.c_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=False, name="c_proj")

    def call(self, hidden_states: Optional[Tuple[tf.Tensor]]) -> tf.Tensor:
        hidden_states = self.ln(hidden_states)
        hidden_states = self.fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)

        return hidden_states
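
# TFIdeficsMLP shape sketch (illustrative numbers): with config.vision_config.embed_dim = 768
# and intermediate_size = 3072, an input [bsz, n_latents, 768] is layer-normed, expanded to
# [..., 3072] by `fc`, passed through ReLU, and projected back to [..., 768] by `c_proj`.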