"""PyTorch CodeGen model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_codegen import CodeGenConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Salesforce/codegen-2B-mono"
_CONFIG_FOR_DOC = "CodeGenConfig"


def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')


def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)
ej  eej	 eej eej	 ee ee ee
eje
ej f ee
eje
ej e
ejdf f  f dddZ  ZS )CodeGenAttentionc                    s"  t    |j}| jdttj||ftjddd||dd t	
|j| _t	
|j| _|j| _|j| _| j| j | _| j| j | jkrtd| j d| j dttj| jtjdt | _t	j| j| jd	 dd
| _t	j| j| jdd
| _|j| _| jp| j}t||| _d S )Ncausal_maskr   r   F)
persistentzEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r   )bias) super__init__Zmax_position_embeddingsZregister_bufferr   ZtrilZonesboolviewr   DropoutZ
attn_pdropattn_dropoutresid_pdropresid_dropoutZhidden_size	embed_dimnum_attention_headshead_dim
ValueErrorsqrtr*   float32toZget_default_dtype
scale_attnLinearqkv_projout_proj
rotary_dimr"   embed_positions)selfconfigZmax_positionsZpos_embd_dim	__class__r    r!   r1   <   s6    
   $zCodeGenAttention.__init__c                 C   sJ   | |jd d || |f }| |jd d d |jdd   }|S )Nr$   r%   )r$   )reshapeshape)rE   r#   Zn_headZdim_headmp_numZreshapedr    r    r!   _split_heads[   s     &zCodeGenAttention._split_headsc                 C   s   t |jdkr&|ddddd }n8t |jdkrJ|dddd }ntdt |j | dd	 || f }||S )
zM
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
           r   r   r   r      z3Input tensor rank should be one of [4, 5], but is: Nr%   )lenrJ   permute
contiguousr;   sizer3   )rE   r*   r9   Zattn_head_sizeZ	new_shaper    r    r!   _merge_heads`   s    zCodeGenAttention._merge_headsNc                 C   s   | d| d }}| jd d d d || |d |f }|tj}|tj}t||dd}	|	| j }	t|	j	j
}
tj|
|	j	d|	j}
t||	|
}	|d k	r|	| }	tjdd|	}	|	|j	}	| |	}	|d k	r|	| }	t|	|}||	fS )Nr%   r$   r   r   )rR   r-   r>   r   r=   matmulZ	transposer?   finfor   minr*   devicewherer   ZSoftmaxr5   )rE   querykeyvalueattention_mask	head_maskZquery_lengthZ
key_lengthr-   attn_weightsZ
mask_valueattn_outputr    r    r!   _attnm   s$    	&

zCodeGenAttention._attnF.hidden_states
layer_pastr\   position_idsr]   	use_cacheoutput_attentionsr   c                 C   s  |  |}d}	||jd d |	df }
| j| j |	 }tj|
|dd\}}}| j|| j| j|	d}| j|| j| j|	d}| j|| j| j|	d}|dddd}| j	}|j
|j
kr||j
}|| _	|| }tj||jd d dd\}}| jd k	r|d d d d d d d | jf }|d d d d d d | jd f }|d d d d d d d | jf }|d d d d d d | jd f }t|||}t|||}tj||gdd}tj||gdd}nt|||}t|||}|dddd}|dddd}|d k	r4|d }|d }tj||fd	d}tj||fd	d}|d
krP||j|f}nd }| |||||\}}| || j| j}| |}| |}||f}|r||f7 }|S )NrN   r$   r   )rK   r   r   r   r   r%   T)rA   rI   rJ   r:   r9   r   splitrL   rP   rD   rW   r>   rC   r+   r   r   r`   rS   rB   r7   )rE   rb   rc   r\   rd   r]   re   rf   ZqkvrK   Z	qkv_splitZ	local_dimrY   r[   rZ   rD   Zsincosr   r   Zk_rotZk_passZq_rotZq_passZpast_keyZ
past_valueZpresentr_   r^   outputsr    r    r!   forward   sX    
""""




zCodeGenAttention.forward)NN)NNNNFF)__name__
__module____qualname__r1   rL   rS   r`   r   r   FloatTensorr   Tensor
LongTensorr2   r   ri   __classcell__r    r    rG   r!   r,   ;   s4     
,      "r,   c                       s4   e Zd Z fddZeej ejdddZ  ZS )
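
# Note on the packed qkv layout referenced in `forward` above -- a sketch under this
# file's own conventions (`attn` and `hidden` below are hypothetical): `qkv_proj`
# computes query/key/value for all heads in a single matmul, interleaved over
# `mp_num = 4` shards (a remnant of the original TPU-v4 training layout), so the
# forward pass first exposes the shard axis and then splits per shard:
#
#     hidden = torch.randn(1, 7, attn.embed_dim)
#     qkv = attn.qkv_proj(hidden)                   # (1, 7, 3 * embed_dim)
#     qkv = qkv.reshape(qkv.shape[:-1] + (4, -1))   # (1, 7, 4, 3 * embed_dim // 4)
#     local_dim = attn.head_dim * attn.num_attention_heads // 4
#     query, value, key = torch.split(qkv, local_dim, dim=-1)
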
class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
class CodeGenBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CodeGenAttention(config)
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        feed_forward_hidden_states = self.mlp(hidden_states)
        # parallel residual: both branches read the same normalized input (see the note below)
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)
class CodeGenPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CodeGenConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CodeGenBlock"]
    _skip_keys_device_placement = "past_key_values"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


CODEGEN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CodeGenConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CODEGEN_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare CodeGen Model transformer outputting raw hidden-states without any specific head on top.",
    CODEGEN_START_DOCSTRING,
)
class CodeGenModel(CodeGenPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)

        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # Broadcast the 2D mask to [batch_size, 1, 1, to_seq_length] and turn masked
            # positions into a large negative additive bias.
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # Prepare head mask if needed: 1.0 in head_mask means we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                    "`use_cache=False`..."
                )
                use_cache = False

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    None,
                    attention_mask,
                    position_ids,
                    head_mask[i],
                    use_cache,
                    output_attentions,
                )
            else:
                outputs = block(
                    hidden_states=hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
    The CodeGen Model transformer with a language modeling head on top.
    c                       s   e Zd ZdgZ fddZdd Zdd Zdd	d
Zee	
deeeeddeej eeeej   eej eej eej eej eej eej ee ee ee ee eeef dddZeeeej  ejeeej  dddZ  ZS )CodeGenForCausalLMzlm_head.weightc                    s4   t  | t|| _t|j|j| _| 	  d S rr   )
r0   r1   r   r   r   r@   rs   r   lm_headr   r   rG   r    r!   r1   D  s    
zCodeGenForCausalLM.__init__c                 C   s   | j S rr   r   r   r    r    r!   get_output_embeddingsL  s    z(CodeGenForCausalLM.get_output_embeddingsc                 C   s
   || _ d S rr   r   r   r    r    r!   set_output_embeddingsO  s    z(CodeGenForCausalLM.set_output_embeddingsNc                 K   s0  | dd }|r||d d jd }|jd |kr6|}n|jd d }|d d |d f }|d k	r||d d |jd  d f }| dd }| dd }	|d k	r|	d kr| dd }	|	|dkd |r|	d d |jd  d f }	|d k	r|d krd|i}
nd	| i}
|
|| d
|	||d |
S )Nr   r   r   r   r\   rd   r$   r   r   re   )r   re   rd   r\   r   )getrJ   r   ZcumsumZmasked_fill_rQ   update)rE   r   r   r   r   r   r   Zremove_prefix_lengthr\   rd   Zmodel_inputsr    r    r!   prepare_inputs_for_generationR  s8    
	z0CodeGenForCausalLM.prepare_inputs_for_generationr   r   )r   r   r\   r   rd   r]   r   labelsre   rf   r   r   r   c                 C   s  |dk	r|n| j j}| j||||||||	|
||d}|d }| |tj}d}|dk	r||j}|dddddf  }|dddf  }t	 }||
d|d|
d}||j}|s|f|dd  }|dk	r|f| S |S t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r   r\   r   rd   r]   r   re   rf   r   r   r   .r$   r   )lossZlogitsr   rb   r   )rF   r   r   r   r>   r   r=   rW   rQ   r   r3   rR   r   r
   r   rb   r   )rE   r   r   r\   r   rd   r]   r   r   re   rf   r   r   Ztransformer_outputsrb   Z	lm_logitsr   Zshift_logitsZshift_labelsZloss_fctoutputr    r    r!   ri   ~  sD    zCodeGenForCausalLM.forward)r   beam_idxr   c                    s   t  fdd| D S )a  
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        c                 3   s$   | ]}t  fd d|D V  qdS )c                 3   s"   | ]}| d  |jV  qdS )r   N)Zindex_selectr>   rW   )r   Z
past_stater   r    r!   r     s     z>CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>.<genexpr>Nr   )r   rc   r   r    r!   r     s   z4CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>r   )r   r   r    r   r!   _reorder_cache  s    	z!CodeGenForCausalLM._reorder_cache)NN)NNNNNNNNNNNN)rj   rk   rl   Z_tied_weights_keysr1   r   r   r   r   r   r   r   r   r
   r   r   r   ro   r   rn   rm   r2   r   ri   staticmethodr   rp   r    r    rG   r!   r   ;  sV   
,            
D r   )+r   typingr   r   r   r   Ztorch.utils.checkpointr   Ztorch.nnr   Zactivationsr   Zmodeling_outputsr	   r
   Zmodeling_utilsr   utilsr   r   r   r   Zconfiguration_codegenr   Z
get_loggerrj   r   r   r   intrn   r"   r)   r+   Moduler,   rq   rx   r   ZCODEGEN_START_DOCSTRINGr   r   r   r    r    r    r!   <module>   sD   
 -, 2 4