from typing import Callable, Optional, Tuple

import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen import partitioning as nn_partitioning
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax

from ...modeling_flax_outputs import (
    FlaxBaseModelOutputWithPastAndCrossAttentions,
    FlaxBaseModelOutputWithPooling,
    FlaxBaseModelOutputWithPoolingAndCrossAttentions,
    FlaxCausalLMOutputWithCrossAttentions,
    FlaxMaskedLMOutput,
    FlaxMultipleChoiceModelOutput,
    FlaxQuestionAnsweringModelOutput,
    FlaxSequenceClassifierOutput,
    FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_call_sample_docstring,
    overwrite_call_docstring,
)
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_roberta import RobertaConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
_CONFIG_FOR_DOC = "RobertaConfig"

remat = nn_partitioning.remat


def create_position_ids_from_input_ids(input_ids, padding_idx):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: jnp.ndarray
        padding_idx: int

    Returns: jnp.ndarray
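
    Example (illustrative, not part of the original docstring): with `padding_idx = 1`,
    `create_position_ids_from_input_ids(jnp.array([[5, 6, 7, 1, 1]]), 1)` yields `[[2, 3, 4, 1, 1]]`:
    real tokens are numbered from `padding_idx + 1`, while pad positions keep `padding_idx`.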
    """
    mask = (input_ids != padding_idx).astype("i4")

    if mask.ndim > 2:
        mask = mask.reshape((-1, mask.shape[-1]))
        incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
        incremental_indices = incremental_indices.reshape(input_ids.shape)
    else:
        incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask

    return incremental_indices.astype("i4") + padding_idx


ROBERTA_START_DOCSTRING = r"""

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matters related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`RobertaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
"""

ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class FlaxRobertaEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.word_embeddings = nn.Embed(
            self.config.vocab_size,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.position_embeddings = nn.Embed(
            self.config.max_position_embeddings,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.token_type_embeddings = nn.Embed(
            self.config.type_vocab_size,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
        # Embed the three input streams and sum them
        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
        position_embeds = self.position_embeddings(position_ids.astype("i4"))
        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))

        hidden_states = inputs_embeds + token_type_embeddings + position_embeds

        # Layer Norm + dropout
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        return hidden_states


class FlaxRobertaSelfAttention(nn.Module):
    config: RobertaConfig
    causal: bool = False
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
        if self.config.hidden_size % self.config.num_attention_heads != 0:
            raise ValueError(
                f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of "
                f"`config.num_attention_heads`: {self.config.num_attention_heads}"
            )

        self.query = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.key = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.value = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )

        if self.causal:
            self.causal_mask = make_causal_mask(
                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
            )

    def _split_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))

    def _merge_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))

    @nn.compact
    def _concatenate_to_cache(self, key, value, query, attention_mask):
        """
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
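
        Note (added; inferred from the cache handling below): `cached_key` / `cached_value` are stored with shape
        `(*batch_dims, max_length, num_heads, depth_per_head)`, and `cache_index` tracks how many positions have
        been written so far.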
        """
        # detect if we're initializing by absence of existing cache data
        is_initialized = self.has_variable("cache", "cached_key")
        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))

        if is_initialized:
            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
            # update key, value caches with our new 1d spatial slices
            cur_index = cache_index.value
            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
            key = lax.dynamic_update_slice(cached_key.value, key, indices)
            value = lax.dynamic_update_slice(cached_value.value, value, indices)
            cached_key.value = key
            cached_value.value = value
            num_updated_cache_vectors = query.shape[1]
            cache_index.value = cache_index.value + num_updated_cache_vectors
            # causal mask for cached decoder self-attention: our single query position should only attend to those
            # key positions that have already been generated and cached, not the remaining zero elements.
            pad_mask = jnp.broadcast_to(
                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
            )
            attention_mask = combine_masks(pad_mask, attention_mask)
        return key, value, attention_mask

    def __call__(
        self,
        hidden_states,
        attention_mask,
        layer_head_mask,
        key_value_states: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic=True,
        output_attentions: bool = False,
    ):
        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None
        batch_size = hidden_states.shape[0]

        # get query proj
        query_states = self.query(hidden_states)
        # get key, value proj
        if is_cross_attention:
            # cross-attention projects the encoder's key_value_states
            key_states = self.key(key_value_states)
            value_states = self.value(key_value_states)
        else:
            # self-attention
            key_states = self.key(hidden_states)
            value_states = self.value(hidden_states)

        query_states = self._split_heads(query_states)
        key_states = self._split_heads(key_states)
        value_states = self._split_heads(value_states)

        # handle cache / prepare causal attention mask
        if self.causal:
            query_length, key_length = query_states.shape[1], key_states.shape[1]
            if self.has_variable("cache", "cached_key"):
                mask_shift = self.variables["cache"]["cache_index"]
                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
                causal_mask = lax.dynamic_slice(
                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
                )
            else:
                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])

        # combine masks if needed
        if attention_mask is not None and self.causal:
            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
            attention_mask = combine_masks(attention_mask, causal_mask)
        elif self.causal:
            attention_mask = causal_mask
        elif attention_mask is not None:
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))

        # During fast autoregressive decoding, we feed one position at a time,
        # and cache the keys and values step by step.
        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
            key_states, value_states, attention_mask = self._concatenate_to_cache(
                key_states, value_states, query_states, attention_mask
            )

        # Convert the boolean attention mask to an attention bias.
        if attention_mask is not None:
            attention_bias = lax.select(
                attention_mask > 0,
                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
            )
        else:
            attention_bias = None

        dropout_rng = None
        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
            dropout_rng = self.make_rng("dropout")

        attn_weights = dot_product_attention_weights(
            query_states,
            key_states,
            bias=attention_bias,
            dropout_rng=dropout_rng,
            dropout_rate=self.config.attention_probs_dropout_prob,
            broadcast_dropout=True,
            deterministic=deterministic,
            dtype=self.dtype,
            precision=None,
        )

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)

        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))

        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
        return outputs


class FlaxRobertaSelfOutput(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class FlaxRobertaAttention(nn.Module):
    config: RobertaConfig
    causal: bool = False
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.self = FlaxRobertaSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
        self.output = FlaxRobertaSelfOutput(self.config, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        layer_head_mask,
        key_value_states=None,
        init_cache=False,
        deterministic=True,
        output_attentions: bool = False,
    ):
        attn_outputs = self.self(
            hidden_states,
            attention_mask,
            layer_head_mask=layer_head_mask,
            key_value_states=key_value_states,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]
        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_outputs[1],)

        return outputs


class FlaxRobertaIntermediate(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.dense = nn.Dense(
            self.config.intermediate_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.activation = ACT2FN[self.config.hidden_act]

    def __call__(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class FlaxRobertaOutput(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(self, hidden_states, attention_output, deterministic: bool = True):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.LayerNorm(hidden_states + attention_output)
        return hidden_states


class FlaxRobertaLayer(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.attention = FlaxRobertaAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
        self.intermediate = FlaxRobertaIntermediate(self.config, dtype=self.dtype)
        self.output = FlaxRobertaOutput(self.config, dtype=self.dtype)
        if self.config.add_cross_attention:
            self.crossattention = FlaxRobertaAttention(self.config, causal=False, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        layer_head_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        # Self Attention
        attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            layer_head_mask=layer_head_mask,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        attention_output = attention_outputs[0]

        # Cross-Attention Block
        if encoder_hidden_states is not None:
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                layer_head_mask=layer_head_mask,
                key_value_states=encoder_hidden_states,
                deterministic=deterministic,
                output_attentions=output_attentions,
            )
            attention_output = cross_attention_outputs[0]

        hidden_states = self.intermediate(attention_output)
        hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attention_outputs[1],)
            if encoder_hidden_states is not None:
                outputs += (cross_attention_outputs[1],)
        return outputs


class FlaxRobertaLayerCollection(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        if self.gradient_checkpointing:
            FlaxRobertaCheckpointLayer = remat(FlaxRobertaLayer, static_argnums=(5, 6, 7))
            self.layers = [
                FlaxRobertaCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
                for i in range(self.config.num_hidden_layers)
            ]
        else:
            self.layers = [
                FlaxRobertaLayer(self.config, name=str(i), dtype=self.dtype)
                for i in range(self.config.num_hidden_layers)
            ]

    def __call__(
        self,
        hidden_states,
        attention_mask,
        head_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # Check that head_mask was specified for each layer if it is not None
        if head_mask is not None and head_mask.shape[0] != len(self.layers):
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for "
                f"{head_mask.shape[0]}."
            )

        for i, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = layer(
                hidden_states,
                attention_mask,
                head_mask[i] if head_mask is not None else None,
                encoder_hidden_states,
                encoder_attention_mask,
                init_cache,
                deterministic,
                output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)

        if not return_dict:
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )


class FlaxRobertaEncoder(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.layer = FlaxRobertaLayerCollection(
            self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )

    def __call__(
        self,
        hidden_states,
        attention_mask,
        head_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        return self.layer(
            hidden_states,
            attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class FlaxRobertaPooler(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )

    def __call__(self, hidden_states):
        cls_hidden_state = hidden_states[:, 0]
        cls_hidden_state = self.dense(cls_hidden_state)
        return nn.tanh(cls_hidden_state)


class FlaxRobertaLMHead(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.decoder = nn.Dense(
            self.config.vocab_size,
            dtype=self.dtype,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))

    def __call__(self, hidden_states, shared_embedding=None):
        hidden_states = self.dense(hidden_states)
        hidden_states = ACT2FN["gelu"](hidden_states)
        hidden_states = self.layer_norm(hidden_states)

        if shared_embedding is not None:
            hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
        else:
            hidden_states = self.decoder(hidden_states)

        bias = jnp.asarray(self.bias, self.dtype)
        hidden_states += bias
        return hidden_states


class FlaxRobertaClassificationHead(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        classifier_dropout = (
            self.config.classifier_dropout
            if self.config.classifier_dropout is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.out_proj = nn.Dense(
            self.config.num_labels,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = hidden_states[:, 0, :]  # take <s> token (equivalent to [CLS])
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.dense(hidden_states)
        hidden_states = nn.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class FlaxRobertaPreTrainedModel(FlaxPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    module_class: nn.Module = None

    def __init__(
        self,
        config: RobertaConfig,
        input_shape: Tuple = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        gradient_checkpointing: bool = False,
        **kwargs,
    ):
        module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def enable_gradient_checkpointing(self):
        self._module = self.module_class(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=True,
        )

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        # init input tensors
        input_ids = jnp.zeros(input_shape, dtype="i4")
        token_type_ids = jnp.ones_like(input_ids)
        position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
        attention_mask = jnp.ones_like(input_ids)
        head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        if self.config.add_cross_attention:
            encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
            encoder_attention_mask = attention_mask
            module_init_outputs = self.module.init(
                rngs,
                input_ids,
                attention_mask,
                token_type_ids,
                position_ids,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                return_dict=False,
            )
        else:
            module_init_outputs = self.module.init(
                rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
            )

        random_params = module_init_outputs["params"]

        if params is not None:
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            return random_params

    def init_cache(self, batch_size, max_length):
        """
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
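
        Illustrative usage (not from the original docstring; assumes a decoder-style RoBERTa config):
        `past_key_values = model.init_cache(batch_size=1, max_length=32)` returns an empty key/value
        cache that can then be passed back in through the `past_key_values` argument of `__call__`.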
        """
        # init input variables to retrieve cache
        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
        attention_mask = jnp.ones_like(input_ids, dtype="i4")
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        init_variables = self.module.init(
            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
        )
        return unfreeze(init_variables["cache"])

    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def __call__(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        past_key_values: dict = None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # init input tensors if not passed
        if token_type_ids is None:
            token_type_ids = jnp.zeros_like(input_ids)

        if position_ids is None:
            position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)

        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        if head_mask is None:
            head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        inputs = {"params": params or self.params}

        if self.config.add_cross_attention:
            # If past_key_values are passed then the cache is already initialized; a private flag `init_cache` has to
            # be passed down to ensure the cache is used. The cache must also be marked as mutable so that it can be
            # changed by the FlaxRobertaAttention module.
            if past_key_values:
                inputs["cache"] = past_key_values
                mutable = ["cache"]
            else:
                mutable = False

            outputs = self.module.apply(
                inputs,
                jnp.array(input_ids, dtype="i4"),
                jnp.array(attention_mask, dtype="i4"),
                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
                position_ids=jnp.array(position_ids, dtype="i4"),
                head_mask=jnp.array(head_mask, dtype="i4"),
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                deterministic=not train,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                rngs=rngs,
                mutable=mutable,
            )

            # add updated cache to model output
            if past_key_values is not None and return_dict:
                outputs, past_key_values = outputs
                outputs["past_key_values"] = unfreeze(past_key_values["cache"])
                return outputs
            elif past_key_values is not None and not return_dict:
                outputs, past_key_values = outputs
                outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]

        else:
            outputs = self.module.apply(
                inputs,
                jnp.array(input_ids, dtype="i4"),
                jnp.array(attention_mask, dtype="i4"),
                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
                position_ids=jnp.array(position_ids, dtype="i4"),
                head_mask=jnp.array(head_mask, dtype="i4"),
                deterministic=not train,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                rngs=rngs,
            )

        return outputs


class FlaxRobertaModule(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    add_pooling_layer: bool = True
    gradient_checkpointing: bool = False

    def setup(self):
        self.embeddings = FlaxRobertaEmbeddings(self.config, dtype=self.dtype)
        self.encoder = FlaxRobertaEncoder(
            self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.pooler = FlaxRobertaPooler(self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        head_mask: Optional[jnp.ndarray] = None,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # make sure `token_type_ids` is correctly initialized when not passed
        if token_type_ids is None:
            token_type_ids = jnp.zeros_like(input_ids)

        # make sure `position_ids` is correctly initialized when not passed
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        hidden_states = self.embeddings(
            input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
        )
        outputs = self.encoder(
            hidden_states,
            attention_mask,
            head_mask=head_mask,
            deterministic=deterministic,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        pooled = self.pooler(hidden_states) if self.add_pooling_layer else None

        if not return_dict:
            # if pooled is None, don't return it
            if pooled is None:
                return (hidden_states,) + outputs[1:]
            return (hidden_states, pooled) + outputs[1:]

        return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=hidden_states,
            pooler_output=pooled,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@add_start_docstrings(
    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    ROBERTA_START_DOCSTRING,
)
class FlaxRobertaModel(FlaxRobertaPreTrainedModel):
    module_class = FlaxRobertaModule


append_call_sample_docstring(FlaxRobertaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)


class FlaxRobertaForMaskedLMModule(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.roberta = FlaxRobertaModule(
            config=self.config,
            add_pooling_layer=False,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.tie_word_embeddings:
            shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        # Compute the prediction scores
        logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxMaskedLMOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class FlaxRobertaForMaskedLM(FlaxRobertaPreTrainedModel):
    module_class = FlaxRobertaForMaskedLMModule


append_call_sample_docstring(
    FlaxRobertaForMaskedLM,
    _CHECKPOINT_FOR_DOC,
    FlaxBaseModelOutputWithPooling,
    _CONFIG_FOR_DOC,
    mask="<mask>",
)


class FlaxRobertaForSequenceClassificationModule(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.roberta = FlaxRobertaModule(
            config=self.config,
            dtype=self.dtype,
            add_pooling_layer=False,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.classifier = FlaxRobertaClassificationHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, deterministic=deterministic)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxSequenceClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForSequenceClassification(FlaxRobertaPreTrainedModel):
    module_class = FlaxRobertaForSequenceClassificationModule


append_call_sample_docstring(
    FlaxRobertaForSequenceClassification,
    _CHECKPOINT_FOR_DOC,
    FlaxSequenceClassifierOutput,
    _CONFIG_FOR_DOC,
)


class FlaxRobertaForMultipleChoiceModule(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.roberta = FlaxRobertaModule(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
        self.classifier = nn.Dense(1, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        num_choices = input_ids.shape[1]
        # flatten the choice dimension into the batch dimension
        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
        token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
        position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None

        # Model
        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        logits = self.classifier(pooled_output)

        reshaped_logits = logits.reshape(-1, num_choices)

        if not return_dict:
            return (reshaped_logits,) + outputs[2:]

        return FlaxMultipleChoiceModelOutput(
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForMultipleChoice(FlaxRobertaPreTrainedModel):
    module_class = FlaxRobertaForMultipleChoiceModule


overwrite_call_docstring(
    FlaxRobertaForMultipleChoice, ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
    FlaxRobertaForMultipleChoice,
    _CHECKPOINT_FOR_DOC,
    FlaxMultipleChoiceModelOutput,
    _CONFIG_FOR_DOC,
)


class FlaxRobertaForTokenClassificationModule(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.roberta = FlaxRobertaModule(
            config=self.config,
            dtype=self.dtype,
            add_pooling_layer=False,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        classifier_dropout = (
            self.config.classifier_dropout
            if self.config.classifier_dropout is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        logits = self.classifier(hidden_states)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxTokenClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForTokenClassification(FlaxRobertaPreTrainedModel):
    module_class = FlaxRobertaForTokenClassificationModule


append_call_sample_docstring(
    FlaxRobertaForTokenClassification,
    _CHECKPOINT_FOR_DOC,
    FlaxTokenClassifierOutput,
    _CONFIG_FOR_DOC,
)


class FlaxRobertaForQuestionAnsweringModule(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.roberta = FlaxRobertaModule(
            config=self.config,
            dtype=self.dtype,
            add_pooling_layer=False,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]

        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if not return_dict:
            return (start_logits, end_logits) + outputs[1:]

        return FlaxQuestionAnsweringModelOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForQuestionAnswering(FlaxRobertaPreTrainedModel):
    module_class = FlaxRobertaForQuestionAnsweringModule


append_call_sample_docstring(
    FlaxRobertaForQuestionAnswering,
    _CHECKPOINT_FOR_DOC,
    FlaxQuestionAnsweringModelOutput,
    _CONFIG_FOR_DOC,
)


class FlaxRobertaForCausalLMModule(nn.Module):
    config: RobertaConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.roberta = FlaxRobertaModule(
            config=self.config,
            add_pooling_layer=False,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        token_type_ids: Optional[jnp.ndarray] = None,
        head_mask: Optional[jnp.ndarray] = None,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.tie_word_embeddings:
            shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        # Compute the prediction scores
        logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxCausalLMOutputWithCrossAttentions(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@add_start_docstrings(
    """
    Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g. for
    autoregressive tasks.
    """,
    ROBERTA_START_DOCSTRING,
)
class FlaxRobertaForCausalLM(FlaxRobertaPreTrainedModel):
    module_class = FlaxRobertaForCausalLMModule

    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
        # initializing the cache
        batch_size, seq_length = input_ids.shape

        past_key_values = self.init_cache(batch_size, max_length)
        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and
        # x < cache_index. But since the decoder uses a causal mask, those positions are masked anyway.
        # Thus, we can create a single static attention_mask here, which is more efficient for compilation.
        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
        if attention_mask is not None:
            position_ids = attention_mask.cumsum(axis=-1) - 1
            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
        else:
            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))

        return {
            "past_key_values": past_key_values,
            "attention_mask": extended_attention_mask,
            "position_ids": position_ids,
        }

    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        model_kwargs["past_key_values"] = model_outputs.past_key_values
        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
        return model_kwargs


append_call_sample_docstring(
    FlaxRobertaForCausalLM,
    _CHECKPOINT_FOR_DOC,
    FlaxCausalLMOutputWithCrossAttentions,
    _CONFIG_FOR_DOC,
)