U
    4Af$                     @   sn   d Z ddlmZmZ ddlZddlmZ ddlmZ G dd dej	Z
G dd	 d	ej	ZG d
d dej	ZdS )a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

    )OptionalTupleN   )IdeficsConfigc                       sB   e Zd Zeeeeeedd fddZejejdddZ  Z	S )IdeficsPerceiverResamplerN)config	embed_dimdepthn_headshead_dim	n_latentsreturnc                    s   t    ||||f\____ jj_t	j
tjjdd_t jdsbjd n
 jjd _t	 fddt|D _t	j_dS )ao  
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
        to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
        Could be e.g., VIT embed_dim, ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        T)Zrequires_gradr      c              	      s4   g | ],}t tjjjjtj gqS  )	nn
ModuleListIdeficsPerceiverAttentionr   r
   r   qk_layer_norms
IdeficsMLPintermediate_dim).0_r   selfr   I/tmp/pip-unpacked-wheel-zw5xktn0/transformers/models/idefics/perceiver.py
<listcomp>S   s   
z6IdeficsPerceiverResampler.__init__.<locals>.<listcomp>N)super__init__r   r
   r   r   Zperceiver_configZqk_layer_norms_perceiverr   r   	ParametertorchZrandnlatentshasattrvision_configr   r   rangeblocks	LayerNorm
layer_norm)r   r   r   r	   r
   r   r   	__class__r   r   r   1   s    



z"IdeficsPerceiverResampler.__init__)contextr   c                 C   sJ   | j |jd dd}| jD ]"\}}|||| }||| }q| |S )zWResample arbitrary length context & *compress* down to self.n_latents latent embeddingsr   r   )r    repeatshaper$   r&   )r   r)   r    attnffr   r   r   forward_   s
    z!IdeficsPerceiverResampler.forward)
__name__
__module____qualname__r   intr   r   Tensorr.   __classcell__r   r   r'   r   r   0   s        .r   c                       sB   e Zd Zeeeedd fddZejejejdddZ  Z	S )r   N)r   r
   r   r   r   c                    s   t    |||  | _| _| _|| _t| j| _t| j| _	| jrdt| j| _
t| j| _| jd | _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j |dd| _dS )ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`g      FZbiasN)r   r   r   r
   r   r   r   r%   context_layer_normlatents_layer_normq_layer_normk_layer_normqk_scaleLinearq_projk_projv_projoutput_proj)r   r   r
   r   r   r'   r   r   r   m   s    
z"IdeficsPerceiverAttention.__init__)r)   r    r   c                    s    |}|}|jdd \ }}|}tj||gdd}tj||gdd} fdd|||fD \}}}jr	|}
|}td|j |}||jdd	d
  }	|	jdd}
td|
|}|dddS )aF  
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        N   )dimc                    s.   g | ]&}|  |jd  jjd dqS )r      )Zreshaper+   r
   r   	transpose)r   xZ
batch_sizer   r   r   r      s     z5IdeficsPerceiverAttention.forward.<locals>.<listcomp>z... i d, ... j d -> ... i jT)rB   Zkeepdimz... i j, ... j d -> ... i dr   rC   )r6   r7   r+   r<   r=   r   catr>   r   r8   r9   Zeinsumr:   ZamaxdetachZsoftmaxr?   rD   flatten)r   r)   r    Z
seq_lengthr   qkvZscoresZstabilized_scoresr,   Z	resampledr   rF   r   r.      s    


 

z!IdeficsPerceiverAttention.forward)
r/   r0   r1   r2   boolr   r   r3   r.   r4   r   r   r'   r   r   l   s   r   c                       s>   e Zd Zed fddZeeej  ejdddZ	  Z
S )r   )r   c                    sX   t    |jj| _t| j| _tj| j|dd| _t	 | _
tj|| jdd| _dS )z:Simple MLP block with intermediate_size and embedding sizeFr5   N)r   r   r"   r   r   r%   lnr;   fcZReLUactc_proj)r   Zintermediate_sizer   r'   r   r   r      s    


zIdeficsMLP.__init__)hidden_statesr   c                 C   s,   |  |}| |}| |}| |}|S )N)rO   rP   rQ   rR   )r   rS   r   r   r   r.      s
    



zIdeficsMLP.forward)r/   r0   r1   r   r   r   r   r   ZFloatTensorr.   r4   r   r   r'   r   r      s   	r   )__doc__typingr   r   r   Ztorch.nnr   Zconfiguration_ideficsr   Moduler   r   r   r   r   r   r   <module>   s   <A