from typing import Callable, Optional, Tuple

import flax
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen import partitioning as nn_partitioning
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax

from ...modeling_flax_outputs import (
    FlaxBaseModelOutputWithPastAndCrossAttentions,
    FlaxBaseModelOutputWithPooling,
    FlaxBaseModelOutputWithPoolingAndCrossAttentions,
    FlaxCausalLMOutputWithCrossAttentions,
    FlaxMaskedLMOutput,
    FlaxMultipleChoiceModelOutput,
    FlaxNextSentencePredictorOutput,
    FlaxQuestionAnsweringModelOutput,
    FlaxSequenceClassifierOutput,
    FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_call_sample_docstring,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_bert import BertConfig
logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
_CONFIG_FOR_DOC = "BertConfig"

remat = nn_partitioning.remat


@flax.struct.dataclass
class FlaxBertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`BertForPreTraining`].

    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    prediction_logits: jnp.ndarray = None
    seq_relationship_logits: jnp.ndarray = None
    hidden_states: Optional[Tuple[jnp.ndarray]] = None
    attentions: Optional[Tuple[jnp.ndarray]] = None

BERT_START_DOCSTRING = r"""

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matters related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`BertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified, all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""

BERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

class FlaxBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.word_embeddings = nn.Embed(
            self.config.vocab_size,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.position_embeddings = nn.Embed(
            self.config.max_position_embeddings,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.token_type_embeddings = nn.Embed(
            self.config.type_vocab_size,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
        # Embed
        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
        position_embeds = self.position_embeddings(position_ids.astype("i4"))
        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))

        # Sum all embeddings
        hidden_states = inputs_embeds + token_type_embeddings + position_embeds

        # Layer Norm
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        return hidden_states
r3   c                   @   sr   e Zd ZU eed< dZeed< ejZ	ej	ed< dd Z
dd Zd	d
 Zejdd Zdeej eedddZdS )FlaxBertSelfAttentionr4   Fcausalr5   c                 C   s   | j j| j j | _| j j| j j dkr.tdtj| j j| jtjj	
| j jd| _tj| j j| jtjj	
| j jd| _tj| j j| jtjj	
| j jd| _| jrttjd| j jfdddd| _d S )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r5   kernel_initr#   rW   r5   )r4   r<   num_attention_headshead_dim
ValueErrorr:   Denser5   r=   r>   r?   r@   querykeyvaluerY   r	   r.   onesrB   causal_maskrJ   r1   r1   r2   rL      s2     zFlaxBertSelfAttention.setupc                 C   s"   | |jd d | jj| jf S N   )reshapeshaper4   r\   r]   rK   r(   r1   r1   r2   _split_heads   s    z"FlaxBertSelfAttention._split_headsc                 C   s   | |jd d | jjf S re   )rg   rh   r4   r<   ri   r1   r1   r2   _merge_heads  s    z"FlaxBertSelfAttention._merge_headsc                 C   s   |  dd}| ddtj|j|j}| ddtj|j|j}| dddd }|r|jj^ }	}
}}|j}dt|	 |ddf }t	|j||}t	|j||}||_||_|jd	 }|j| |_t
t|
|| k t|	d	||
f }t||}|||fS )
a[  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slighly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                   S   s   t jdt jdS )Nr   r[   )r.   arrayZint32r1   r1   r1   r2   <lambda>      z=FlaxBertSelfAttention._concatenate_to_cache.<locals>.<lambda>)r   r   r#   )has_variablevariabler.   zerosrh   r5   rb   lenr   dynamic_update_slicebroadcast_toarangetupler   )rK   ra   rb   r`   rT   Zis_initializedrm   rn   ro   Z
batch_dims
max_lengthZ	num_headsZdepth_per_headZ	cur_indexindicesZnum_updated_cache_vectorsZpad_maskr1   r1   r2   _concatenate_to_cache  s(    	

z+FlaxBertSelfAttention._concatenate_to_cacheNT)key_value_states
init_cacheoutput_attentionsc                 C   s  |d k	}|j d }	| |}
|r6| |}| |}n| |}| |}| |
}
| |}| |}| jr|
j d |j d  }}| ddr| jd d }| jd d j d }t	| j
dd|dfdd||f}n"| j
d d d d d |d |f }t||	f|j dd   }|d k	rJ| jrJttj|dd|j }t||}n&| jrX|}n|d k	rptj|dd}| jr| dds|r| |||
|\}}}|d k	rt|dkt|j d| jt|j t| jj| j}nd }d }|s| jjdkr| d	}t|
|||| jjd
|| jd d	}|d k	rLtd||}td||}||j d d d }|r||fn|f}|S )Nr   r#   rl   rm   ro   )Zaxisg        rI   T)biasdropout_rngZdropout_rateZbroadcast_dropoutrN   r5   Z	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdrf   ))rh   r`   ra   rb   rj   rY   rs   	variablesr   Zdynamic_slicerd   r.   rx   Zexpand_dimsr   r}   selectfullrP   r5   Zfinfominr4   Zattention_probs_dropout_probZmake_rngr   Zeinsumrg   )rK   r(   rT   layer_head_maskr~   r   rN   r   Zis_cross_attention
batch_sizeZquery_statesZ
key_statesZvalue_statesZquery_lengthZ
key_lengthZ
mask_shiftZmax_decoder_lengthrd   Zattention_biasr   Zattn_weightsattn_outputoutputsr1   r1   r2   rU   &  s    







 
 
"
   



zFlaxBertSelfAttention.__call__)NFTF)r*   r+   r,   r$   r0   rY   rW   r.   rV   r5   rL   rj   rk   r:   compactr}   r   r/   rU   r1   r1   r1   r2   rX      s"   

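
# A minimal standalone sketch (hypothetical example, not part of the original
# module) of the cache write performed by `_concatenate_to_cache` above: a
# preallocated key buffer of shape (batch, max_length, num_heads, head_dim)
# receives the newly projected key at the current decoding index via
# `lax.dynamic_update_slice`.
#
#     import jax.numpy as jnp
#     from jax import lax
#
#     cached_key = jnp.zeros((1, 8, 2, 4))  # (batch, max_length, heads, head_dim)
#     new_key = jnp.ones((1, 1, 2, 4))      # key projection for one decoding step
#     cur_index = 3                         # next free slot in the cache
#     cached_key = lax.dynamic_update_slice(cached_key, new_key, (0, cur_index, 0, 0))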

class FlaxBertSelfOutput(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class FlaxBertAttention(nn.Module):
    config: BertConfig
    causal: bool = False
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.self = FlaxBertSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
        self.output = FlaxBertSelfOutput(self.config, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        layer_head_mask,
        key_value_states=None,
        init_cache=False,
        deterministic=True,
        output_attentions: bool = False,
    ):
        attn_outputs = self.self(
            hidden_states,
            attention_mask,
            layer_head_mask=layer_head_mask,
            key_value_states=key_value_states,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]
        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_outputs[1],)
        return outputs


class FlaxBertIntermediate(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.intermediate_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.activation = ACT2FN[self.config.hidden_act]

    def __call__(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class FlaxBertOutput(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(self, hidden_states, attention_output, deterministic: bool = True):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.LayerNorm(hidden_states + attention_output)
        return hidden_states
	r   c                   @   sR   e Zd ZU eed< ejZejed< dd Zde	ej
 e	ej
 eeedd	d
ZdS )FlaxBertLayerr4   r5   c                 C   s\   t | j| jj| jd| _t| j| jd| _t| j| jd| _| jj	rXt | jd| jd| _
d S )Nr   r[   F)r   r4   Z
is_decoderr5   	attentionr   intermediater   r   add_cross_attentioncrossattentionrJ   r1   r1   r2   rL     s
    zFlaxBertLayer.setupNFT)encoder_hidden_statesencoder_attention_maskr   rN   r   c	                 C   s   | j ||||||d}	|	d }
|d k	rD| j|
|||||d}|d }
| |
}| j||
|d}|f}|r||	d f7 }|d k	r||d f7 }|S )N)r   r   rN   r   r   )rT   r   r~   rN   r   rM   r#   )r   r   r   r   )rK   r(   rT   r   r   r   r   rN   r   Zattention_outputsr   Zcross_attention_outputsr   r1   r1   r2   rU     s6    
zFlaxBertLayer.__call__)NNFTF)r*   r+   r,   r$   r0   r.   rV   r5   rL   r   r/   rW   rU   r1   r1   r1   r2   r     s   
     r   c                	   @   sb   e Zd ZU eed< ejZejed< dZe	ed< dd Z
deej eej e	e	e	e	e	d	d
dZdS )FlaxBertLayerCollectionr4   r5   Fgradient_checkpointingc                    sR   j r2ttdd  fddtjjD _nfddtjjD _d S )N)         )Zstatic_argnumsc                    s"   g | ]} j t|jd qS )namer5   )r4   strr5   .0iZFlaxBertCheckpointLayerrK   r1   r2   
<listcomp>-  s   z1FlaxBertLayerCollection.setup.<locals>.<listcomp>c                    s"   g | ]}t  jt| jd qS r   )r   r4   r   r5   r   rJ   r1   r2   r   2  s    )r   rematr   ranger4   num_hidden_layerslayersrJ   r1   r   r2   rL   *  s    



zFlaxBertLayerCollection.setupNTr   r   r   rN   r   output_hidden_statesreturn_dictc              
   C   s$  |rdnd }|	rdnd }|r(|d k	r(dnd }|d k	rj|j d t| jkrjtdt| j d|j d  dt| jD ]l\}}|	r||f7 }||||d k	r|| nd |||||}|d }|rt||d f7 }|d k	rt||d f7 }qt|	r||f7 }||||f}|
stdd	 |D S t||||d
S )Nr1   r   z&The head_mask should be specified for z/ layers, but it is for                         .r#   rf   c                 s   s   | ]}|d k	r|V  qd S r   r1   )r   vr1   r1   r2   	<genexpr>l  s      z3FlaxBertLayerCollection.__call__.<locals>.<genexpr>)last_hidden_stater(   r)   cross_attentions)rh   rv   r   r^   	enumeraterz   r   )rK   r(   rT   	head_maskr   r   r   rN   r   r   r   Zall_attentionsZall_hidden_statesZall_cross_attentionsr   layerZlayer_outputsr   r1   r1   r2   rU   6  sJ    

z FlaxBertLayerCollection.__call__)NNFTFFTr*   r+   r,   r$   r0   r.   rV   r5   r   rW   rL   r   r/   rU   r1   r1   r1   r2   r   %  s(   
       r   c                	   @   sb   e Zd ZU eed< ejZejed< dZe	ed< dd Z
deej eej e	e	e	e	e	d	d
dZdS )FlaxBertEncoderr4   r5   Fr   c                 C   s   t | j| j| jd| _d S )Nr5   r   )r   r4   r5   r   r   rJ   r1   r1   r2   rL   {  s
    zFlaxBertEncoder.setupNTr   c                 C   s   | j |||||||||	|
d
S )N)r   r   r   r   rN   r   r   r   )r   )rK   r(   rT   r   r   r   r   rN   r   r   r   r1   r1   r2   rU     s    zFlaxBertEncoder.__call__)NNFTFFTr   r1   r1   r1   r2   r   v  s(   
       r   c                   @   s6   e Zd ZU eed< ejZejed< dd Zdd Z	dS )FlaxBertPoolerr4   r5   c                 C   s*   t j| jjtj j| jj| jd| _	d S r   )
r:   r_   r4   r<   r=   r>   r?   r@   r5   r   rJ   r1   r1   r2   rL     s
    zFlaxBertPooler.setupc                 C   s$   |d d df }|  |}t|S )Nr   )r   r:   tanh)rK   r(   Zcls_hidden_stater1   r1   r2   rU     s    
zFlaxBertPooler.__call__Nr   r1   r1   r1   r2   r     s   
r   c                   @   s6   e Zd ZU eed< ejZejed< dd Zdd Z	dS )FlaxBertPredictionHeadTransformr4   r5   c                 C   s>   t j| jj| jd| _t| jj | _t j	| jj
| jd| _	d S )Nr[   r7   )r:   r_   r4   r<   r5   r   r   r   r   rE   rF   rJ   r1   r1   r2   rL     s    z%FlaxBertPredictionHeadTransform.setupc                 C   s   |  |}| |}| |S r   )r   r   rE   ri   r1   r1   r2   rU     s    

z(FlaxBertPredictionHeadTransform.__call__Nr   r1   r1   r1   r2   r     s   
r   c                   @   sT   e Zd ZU eed< ejZejed< ej	j
jZedejf ed< dd Zd
dd	ZdS )FlaxBertLMPredictionHeadr4   r5   .	bias_initc                 C   sF   t | j| jd| _tj| jj| jdd| _| d| j	| jjf| _
d S )Nr[   F)r5   Zuse_biasr   )r   r4   r5   	transformr:   r_   r;   decoderparamr   r   rJ   r1   r1   r2   rL     s    zFlaxBertLMPredictionHead.setupNc                 C   sR   |  |}|d k	r,| jdd|jii|}n
| |}t| j| j}||7 }|S )Nparamskernel)r   r   applyTr.   Zasarrayr   r5   )rK   r(   shared_embeddingr   r1   r1   r2   rU     s    

z!FlaxBertLMPredictionHead.__call__)N)r*   r+   r,   r$   r0   r.   rV   r5   r=   r:   r>   ru   r   r   npr/   rL   rU   r1   r1   r1   r2   r     s
   
r   c                   @   s8   e Zd ZU eed< ejZejed< dd ZdddZ	dS )	FlaxBertOnlyMLMHeadr4   r5   c                 C   s   t | j| jd| _d S )Nr[   )r   r4   r5   predictionsrJ   r1   r1   r2   rL     s    zFlaxBertOnlyMLMHead.setupNc                 C   s   | j ||d}|S Nr   )r   )rK   r(   r   r1   r1   r2   rU     s    zFlaxBertOnlyMLMHead.__call__)Nr   r1   r1   r1   r2   r     s   
r   c                   @   s.   e Zd ZU ejZejed< dd Zdd ZdS )FlaxBertOnlyNSPHeadr5   c                 C   s   t jd| jd| _d S )Nrf   r[   )r:   r_   r5   seq_relationshiprJ   r1   r1   r2   rL     s    zFlaxBertOnlyNSPHead.setupc                 C   s
   |  |S r   )r   )rK   pooled_outputr1   r1   r2   rU     s    zFlaxBertOnlyNSPHead.__call__N)	r*   r+   r,   r.   rV   r5   r0   rL   rU   r1   r1   r1   r2   r     s   
r   c                   @   s8   e Zd ZU eed< ejZejed< dd ZdddZ	dS )	FlaxBertPreTrainingHeadsr4   r5   c                 C   s(   t | j| jd| _tjd| jd| _d S )Nr[   rf   )r   r4   r5   r   r:   r_   r   rJ   r1   r1   r2   rL     s    zFlaxBertPreTrainingHeads.setupNc                 C   s    | j ||d}| |}||fS r   )r   r   )rK   r(   r   r   prediction_scoresseq_relationship_scorer1   r1   r2   rU     s    
z!FlaxBertPreTrainingHeads.__call__)Nr   r1   r1   r1   r2   r     s   
r   c                       s   e Zd ZU dZeZdZdZej	e
d< ddejddfeeeejeed	 fd
dZdd ZdejjeeedddZdd Zeeddeejjeee ee ee edddZ  ZS )FlaxBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    bertNmodule_class)r#   r#   r   TF)r4   input_shapeseedr5   _do_initr   c           	         s4   | j f |||d|}t j||||||d d S )Nr4   r5   r   )r   r   r5   r   )r   super__init__)	rK   r4   r   r   r5   r   r   kwargsmodule	__class__r1   r2   r     s    
z FlaxBertPreTrainedModel.__init__c                 C   s   | j | j| jdd| _d S )NTr   )r   r4   r5   _modulerJ   r1   r1   r2   enable_gradient_checkpointing  s
    z5FlaxBertPreTrainedModel.enable_gradient_checkpointing)rngr   r   returnc                 C   s(  t j|dd}t |}t t t |jd |}t |}t | j	j
| j	jf}tj|\}	}
|	|
d}| j	jrt || j	jf }|}| jj||||||||dd	}n| jj||||||dd}|d }|d k	r tt|}tt|}| jD ]}|| ||< qt | _tt|S |S d S )NrO   r[   r   )r   rI   F)r   r   )r.   ru   
zeros_likerx   ry   
atleast_2drh   	ones_likerc   r4   r   r\   r=   randomsplitr   r<   r   initr   r   Z_missing_keyssetr   r   )rK   r   r   r   rQ   rR   rS   rT   r   Z
params_rngr   rngsr   r   Zmodule_init_outputsZrandom_paramsZmissing_keyr1   r1   r2   init_weights  sN    


      

z$FlaxBertPreTrainedModel.init_weightsc                 C   sl   t j||fdd}t j|dd}t t t |jd |j}| jjt	j
d|||ddd}t|d S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        rO   r[   r   r   FT)r   r   rl   )r.   rc   r   rx   ry   r   rh   r   r   r=   r   PRNGKeyr   )rK   r   r{   rQ   rT   rS   Zinit_variablesr1   r1   r2   r   F  s    
 
     z"FlaxBertPreTrainedModel.init_cachebatch_size, sequence_length)r   r   trainr   r   r   past_key_valuesc                 C   s  |d k	r|n| j j}|d k	r |n| j j}|d k	r4|n| j j}|d krNt|}|d krvttt|j	d |j	}|d krt
|}|d krt| j j| j jf}i }|	d k	r|	|d< d|p| ji}| j jr|r||d< dg}nd}| jj|tj|ddtj|ddtj|ddtj|ddtj|dd|||
 |||||d}|d k	rr|rr|\}}t|d |d	< |S |d k	r|s|\}}|d d
 t|d f |d
d   }nV| jj|tj|ddtj|ddtj|ddtj|ddtj|dd|
 ||||d}|S )Nr   rI   r   rl   FrO   r[   )rR   rS   r   r   r   rN   r   r   r   r   mutabler   r#   )rR   rS   r   rN   r   r   r   r   )r4   r   r   r   r.   r   rx   ry   r   rh   r   rc   r   r\   r   r   r   r   rp   r   )rK   rQ   rT   rR   rS   r   r   r   r   r   r   r   r   r   r   r   inputsr   r   r1   r1   r2   rU   Y  st    
 

(z FlaxBertPreTrainedModel.__call__)N)NNNNNNNNFNNNN) r*   r+   r,   r-   r$   config_classZbase_model_prefixr   r:   Moduler0   r.   rV   r   intr5   rW   r   r   r=   r   r   r   r   r   r!   BERT_INPUTS_DOCSTRINGformatdictr   rU   __classcell__r1   r1   r   r2   r     sV   
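
# Hedged usage sketch for `init_cache` (hypothetical example, not part of the
# original module). The cache only exists for causal self-attention, so this
# assumes a decoder-style configuration (`is_decoder=True`):
#
#     model = FlaxBertForCausalLM.from_pretrained("google-bert/bert-base-uncased", is_decoder=True)
#     past_key_values = model.init_cache(batch_size=1, max_length=32)
#     # `past_key_values` can then be fed back into `model(..., past_key_values=...)`
#     # for fast auto-regressive decoding.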
+             	r   c                   @   s   e Zd ZU eed< ejZejed< dZe	ed< dZ
e	ed< dd Zdeej eej eej eej eej e	e	e	e	e	d

ddZd	S )FlaxBertModuler4   r5   Tadd_pooling_layerFr   c                 C   s>   t | j| jd| _t| j| j| jd| _t| j| jd| _d S )Nr[   r   )	r3   r4   r5   
embeddingsr   r   encoderr   poolerrJ   r1   r1   r2   rL     s    zFlaxBertModule.setupN)
rR   rS   r   r   r   r   rN   r   r   r   c                 C   s   |d krt |}|d kr:t t t |jd |j}| j|||||	d}| j||||	||||
||d
}|d }| jr| 	|nd }|s|d kr|f|dd   S ||f|dd   S t
|||j|j|jdS )Nr   rM   )r   rN   r   r   r   r   r   r   r   r#   )r   Zpooler_outputr(   r)   r   )r.   r   rx   ry   r   rh   r  r	  r  r
  r   r(   r)   r   )rK   rQ   rT   rR   rS   r   r   r   r   rN   r   r   r   r(   r   Zpooledr1   r1   r2   rU     sH    
     zFlaxBertModule.__call__)
NNNNNFTFFT)r*   r+   r,   r$   r0   r.   rV   r5   r  rW   r   rL   r   r/   rU   r1   r1   r1   r2   r    s6   
          r  z^The bare Bert Model transformer outputting raw hidden-states without any specific head on top.c                   @   s   e Zd ZeZdS )FlaxBertModelN)r*   r+   r,   r  r   r1   r1   r1   r2   r    s   r  c                   @   sP   e Zd ZU eed< ejZejed< dZe	ed< dd Z
de	e	e	e	dd	d
ZdS )FlaxBertForPreTrainingModuler4   r5   Fr   c                 C   s,   t | j| j| jd| _t| j| jd| _d S )Nr   r4   r5   )r  r4   r5   r   r   r   clsrJ   r1   r1   r2   rL     s    z"FlaxBertForPreTrainingModule.setupTrN   r   r   r   c
                 C   s   | j |||||||||	d	}
| jjr>| j jd d d d }nd }|
d }|
d }| j|||d\}}|	s~||f|
d	d   S t|||
j|
jd
S )Nr  r   r  rA   	embeddingr   r#   r   rf   )r&   r'   r(   r)   )r   r4   tie_word_embeddingsr   r  r%   r(   r)   )rK   rQ   rT   rR   rS   r   rN   r   r   r   r   r   r(   r   r   r   r1   r1   r2   rU     s:      
z%FlaxBertForPreTrainingModule.__call__N)TFFTr*   r+   r,   r$   r0   r.   rV   r5   r   rW   rL   rU   r1   r1   r1   r2   r    s   
    r  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                   @   s   e Zd ZeZdS )FlaxBertForPreTrainingN)r*   r+   r,   r  r   r1   r1   r1   r2   r  J  s   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.seq_relationship_logits
    ```
r   )output_typer   c                   @   sP   e Zd ZU eed< ejZejed< dZe	ed< dd Z
de	e	e	e	dd	d
ZdS )FlaxBertForMaskedLMModuler4   r5   Fr   c                 C   s.   t | jd| j| jd| _t| j| jd| _d S NF)r4   r  r5   r   r  r  r4   r5   r   r   r   r  rJ   r1   r1   r2   rL   v  s    zFlaxBertForMaskedLMModule.setupTr  c
                 C   s   | j |||||||||	d	}
|
d }| jjrF| j jd d d d }nd }| j||d}|	sn|f|
dd   S t||
j|
jd	S )
Nr  r   r   r  rA   r  r   r#   logitsr(   r)   )r   r4   r  r   r  r   r(   r)   )rK   rQ   rT   rR   rS   r   rN   r   r   r   r   r(   r   r  r1   r1   r2   rU     s.    z"FlaxBertForMaskedLMModule.__call__N)TFFTr  r1   r1   r1   r2   r  q  s   
    r  z2Bert Model with a `language modeling` head on top.c                   @   s   e Zd ZeZdS )FlaxBertForMaskedLMN)r*   r+   r,   r  r   r1   r1   r1   r2   r    s   r  c                   @   sP   e Zd ZU eed< ejZejed< dZe	ed< dd Z
de	e	e	e	dd	d
ZdS )'FlaxBertForNextSentencePredictionModuler4   r5   Fr   c                 C   s(   t | j| j| jd| _t| jd| _d S )Nr   r[   )r  r4   r5   r   r   r   r  rJ   r1   r1   r2   rL     s    z-FlaxBertForNextSentencePredictionModule.setupTr  c
                 C   sj   |	d k	r|	n| j j}	| j|||||||||	d	}
|
d }| |}|	sX|f|
dd   S t||
j|
jdS )Nr  r#   rf   r  )r4   r   r   r  r   r(   r)   )rK   rQ   rT   rR   rS   r   rN   r   r   r   r   r   Zseq_relationship_scoresr1   r1   r2   rU     s*    
z0FlaxBertForNextSentencePredictionModule.__call__N)TFFTr  r1   r1   r1   r2   r    s   
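
# Usage sketch for the masked-LM head (hypothetical example, not part of the
# original module; assumes the standard bert-base-uncased `[MASK]` token):
#
#     import jax.numpy as jnp
#     from transformers import AutoTokenizer, FlaxBertForMaskedLM
#
#     tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
#     model = FlaxBertForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
#     inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="np")
#     logits = model(**inputs).logits
#     mask_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
#     predicted_id = int(jnp.argmax(logits[0, mask_index]))
#     print(tokenizer.decode(predicted_id))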
    r  zJBert Model with a `next sentence prediction (classification)` head on top.c                   @   s   e Zd ZeZdS )!FlaxBertForNextSentencePredictionN)r*   r+   r,   r  r   r1   r1   r1   r2   r    s   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax")

    >>> outputs = model(**encoding)
    >>> logits = outputs.logits
    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
    ```

class FlaxBertForSequenceClassificationModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        classifier_dropout = (
            self.config.classifier_dropout
            if self.config.classifier_dropout is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.classifier = nn.Dense(
            self.config.num_labels,
            dtype=self.dtype,
        )

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        logits = self.classifier(pooled_output)

        if not return_dict:
            return (logits,) + outputs[2:]

        return FlaxSequenceClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    r  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   @   s   e Zd ZeZdS )!FlaxBertForSequenceClassificationN)r*   r+   r,   r  r   r1   r1   r1   r2   r#  L  s   r#  c                   @   sP   e Zd ZU eed< ejZejed< dZe	ed< dd Z
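
# Usage sketch for sequence classification (hypothetical example; `num_labels=2`
# is an illustrative choice, and the classification head is randomly initialized
# until fine-tuned):
#
#     from transformers import AutoTokenizer, FlaxBertForSequenceClassification
#
#     tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
#     model = FlaxBertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)
#     inputs = tokenizer("A perfectly pleasant movie.", return_tensors="np")
#     logits = model(**inputs).logits  # (batch_size, num_labels)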
de	e	e	e	dd	d
ZdS )FlaxBertForMultipleChoiceModuler4   r5   Fr   c                 C   s>   t | j| j| jd| _tj| jjd| _tj	d| jd| _
d S )Nr   r9   r#   r[   )r  r4   r5   r   r   r:   rG   rH   rI   r_   r!  rJ   r1   r1   r2   rL   d  s    z%FlaxBertForMultipleChoiceModule.setupTr  c
                 C   s   |j d }
|d k	r$|d|j d nd }|d k	rB|d|j d nd }|d k	r`|d|j d nd }|d k	r~|d|j d nd }| j|||||||||	d	}|d }| j||d}| |}|d|
}|	s|f|dd   S t||j|jdS )Nr#   r   r  rM   rf   r  )rh   rg   r   rI   r!  r   r(   r)   )rK   rQ   rT   rR   rS   r   rN   r   r   r   Znum_choicesr   r   r  Zreshaped_logitsr1   r1   r2   rU   m  s6    

z(FlaxBertForMultipleChoiceModule.__call__N)TFFTr  r1   r1   r1   r2   r$  _  s   
    r$  z
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                   @   s   e Zd ZeZdS )FlaxBertForMultipleChoiceN)r*   r+   r,   r$  r   r1   r1   r1   r2   r%    s   r%  z(batch_size, num_choices, sequence_lengthc                   @   sP   e Zd ZU eed< ejZejed< dZe	ed< dd Z
de	e	e	e	dd	d
ZdS )$FlaxBertForTokenClassificationModuler4   r5   Fr   c                 C   s\   t | j| jd| jd| _| jjd k	r,| jjn| jj}tj|d| _	tj
| jj| jd| _d S )NFr4   r5   r  r   r9   r[   r  r"  r1   r1   r2   rL     s    

z*FlaxBertForTokenClassificationModule.setupTr  c
                 C   sd   | j |||||||||	d	}
|
d }| j||d}| |}|	sR|f|
dd   S t||
j|
jdS )Nr  r   rM   r#   r  )r   rI   r!  r   r(   r)   )rK   rQ   rT   rR   rS   r   rN   r   r   r   r   r(   r  r1   r1   r2   rU     s*    
z-FlaxBertForTokenClassificationModule.__call__N)TFFTr  r1   r1   r1   r2   r&    s   
    r&  z
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                   @   s   e Zd ZeZdS )FlaxBertForTokenClassificationN)r*   r+   r,   r&  r   r1   r1   r1   r2   r(    s   r(  c                   @   sP   e Zd ZU eed< ejZejed< dZe	ed< dd Z
de	e	e	e	dd	d
ZdS )"FlaxBertForQuestionAnsweringModuler4   r5   Fr   c                 C   s2   t | j| jd| jd| _tj| jj| jd| _d S )NFr'  r[   )	r  r4   r5   r   r   r:   r_   r   
qa_outputsrJ   r1   r1   r2   rL     s    z(FlaxBertForQuestionAnsweringModule.setupTr  c
                 C   s   | j |||||||||	d	}
|
d }| |}tj|| jjdd\}}|d}|d}|	sr||f|
dd   S t|||
j|
j	dS )Nr  r   r   r   r#   )start_logits
end_logitsr(   r)   )
r   r*  r.   r   r4   r   Zsqueezer   r(   r)   )rK   rQ   rT   rR   rS   r   rN   r   r   r   r   r(   r  r+  r,  r1   r1   r2   rU     s0    


z+FlaxBertForQuestionAnsweringModule.__call__N)TFFTr  r1   r1   r1   r2   r)    s   
    r)  z
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   @   s   e Zd ZeZdS )FlaxBertForQuestionAnsweringN)r*   r+   r,   r)  r   r1   r1   r1   r2   r-  3  s   r-  c                   @   sr   e Zd ZU eed< ejZejed< dZe	ed< dd Z
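
# Usage sketch for extractive QA (hypothetical example; the span head of the
# base checkpoint is untrained, so the decoded answer is only illustrative):
#
#     import jax.numpy as jnp
#     from transformers import AutoTokenizer, FlaxBertForQuestionAnswering
#
#     tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
#     model = FlaxBertForQuestionAnswering.from_pretrained("google-bert/bert-base-uncased")
#     inputs = tokenizer("Who wrote Hamlet?", "Hamlet was written by Shakespeare.", return_tensors="np")
#     outputs = model(**inputs)
#     start = int(jnp.argmax(outputs.start_logits))
#     end = int(jnp.argmax(outputs.end_logits))
#     answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])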
deej eej eej eej e	e	e	e	e	d		d
dZdS )FlaxBertForCausalLMModuler4   r5   Fr   c                 C   s.   t | jd| j| jd| _t| j| jd| _d S r  r  rJ   r1   r1   r2   rL   K  s    zFlaxBertForCausalLMModule.setupNT)	rR   r   r   r   r   rN   r   r   r   c                 C   s   | j |||||||||	|
||d}|d }| jjrL| j jd d d d }nd }| j||d}|st|f|dd   S t||j|j|jd	S )
Nr   r   r   r  rA   r  r   r#   )r  r(   r)   r   )	r   r4   r  r   r  r   r(   r)   r   )rK   rQ   rT   rS   rR   r   r   r   r   rN   r   r   r   r   r(   r   r  r1   r1   r2   rU   T  s6    z"FlaxBertForCausalLMModule.__call__)	NNNNFTFFTr   r1   r1   r1   r2   r.  F  s0   
         r.  z
    Bert Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   @   s.   e Zd ZeZdeej dddZdd Z	dS )FlaxBertForCausalLMN)rT   c           	      C   s   |j \}}| ||}tj||fdd}|d k	rP|jddd }t||d}n&ttj|ddd d d f ||f}|||dS )NrO   r[   r   r   r#   )r   r   )r   rT   rS   )	rh   r   r.   rc   Zcumsumr   rw   rx   ry   )	rK   rQ   r{   rT   r   Z
seq_lengthr   Zextended_attention_maskrS   r1   r1   r2   prepare_inputs_for_generation  s    
&z1FlaxBertForCausalLM.prepare_inputs_for_generationc                 C   s.   |j |d< |d d d dd f d |d< |S )Nr   rS   r   r#   )r   )rK   Zmodel_outputsZmodel_kwargsr1   r1   r2   update_inputs_for_generation  s    
 z0FlaxBertForCausalLM.update_inputs_for_generation)N)
r*   r+   r,   r.  r   r   r=   Arrayr0  r1  r1   r1   r1   r2   r/    s   r/  )dtypingr   r   r   ZflaxZ
flax.linenZlinenr:   r=   Z	jax.numpyZnumpyr.   r   Zflax.core.frozen_dictr   r   r   r   r	   r
   Znn_partitioningZflax.linen.attentionr   Zflax.traverse_utilr   r   r   Zmodeling_flax_outputsr   r   r   r   r   r   r   r   r   r   Zmodeling_flax_utilsr   r   r   r   r   utilsr   r    r!   r"   Zconfiguration_bertr$   Z
get_loggerr*   loggerZ_CHECKPOINT_FOR_DOCZ_CONFIG_FOR_DOCr   structZ	dataclassr%   ZBERT_START_DOCSTRINGr  r   r3   rX   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  Z#FLAX_BERT_FOR_PRETRAINING_DOCSTRINGr  r  r  r  r  Z&FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRINGr  r#  r$  r%  r&  r(  r)  r-  r.  r/  r1   r1   r1   r2   <module>   s(  0
0'+ ,*9Q'
 DG=  :5  ==    ;   9A