U
    5Af%Y                     @   s   d Z ddlmZmZmZmZmZ ddlZddl	m
Z
mZmZmZ ddlmZ ddlmZ ddlmZmZmZ eeZG d	d
 d
eZdS )z)Feature extractor class for UnivNetModel.    )AnyDictListOptionalUnionN   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                       sR  e Zd ZdZdddgZd+eeeeeeeee	e eee	e eeeeeeeed fddZ
dd Zdd ZejejdddZd,ee	ejj ejd d!d"Zd-eej d#d$d%Zd.eejee eej eee  f e	e eeeef e	e ee	e ee	ejj ee	e e	e e	e e	eeef  ed&d'd(Zeeef d# fd)d*Z  ZS )/UnivNetFeatureExtractora  
    Constructs a UnivNet feature extractor.

    This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT). The
    STFT implementation follows that of TacoTron 2 and Hifi-GAN.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value to pad with when applying the padding strategy defined by the `padding` argument to
            [`UnivNetFeatureExtractor.__call__`]. Should correspond to audio silence. The `pad_end` argument to
            `__call__` will also use this padding value.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve the
            performance for some models.
        num_mel_bins (`int`, *optional*, defaults to 100):
            The number of mel-frequency bins in the extracted spectrogram features. This should match
            `UnivNetModel.config.num_mel_bins`.
        hop_length (`int`, *optional*, defaults to 256):
            The direct number of samples between sliding windows. Otherwise referred to as "shift" in many papers. Note
            that this is different from other audio feature extractors such as [`SpeechT5FeatureExtractor`] which take
            the `hop_length` in ms.
        win_length (`int`, *optional*, defaults to 1024):
            The direct number of samples for each sliding window. Note that this is different from other audio feature
            extractors such as [`SpeechT5FeatureExtractor`] which take the `win_length` in ms.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
        filter_length (`int`, *optional*, defaults to 1024):
            The number of FFT components to use. If `None`, this is determined using
            `transformers.audio_utils.optimal_fft_length`.
        max_length_s (`int`, *optional*, defaults to 10):
            The maximum input lenght of the model in seconds. This is used to pad the audio.
        fmin (`float`, *optional*, defaults to 0.0):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*):
            Maximum mel frequency in Hz. If not set, defaults to `sampling_rate / 2`.
        mel_floor (`float`, *optional*, defaults to 1e-09):
            Minimum value of mel frequency banks. Note that the way [`UnivNetFeatureExtractor`] uses `mel_floor` is
            different than in [`transformers.audio_utils.spectrogram`].
        center (`bool`, *optional*, defaults to `False`):
            Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
            `t` will start at time `t * hop_length`.
        compression_factor (`float`, *optional*, defaults to 1.0):
            The multiplicative compression factor for dynamic range compression during spectral normalization.
        compression_clip_val (`float`, *optional*, defaults to 1e-05):
            The clip value applied to the waveform before applying dynamic range compression during spectral
            normalization.
        normalize_min (`float`, *optional*, defaults to -11.512925148010254):
            The min value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
            The max value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        model_in_channels (`int`, *optional*, defaults to 64):
            The number of input channels to the [`UnivNetModel`] model. This should match
            `UnivNetModel.config.model_in_channels`.
        pad_end_length (`int`, *optional*, defaults to 10):
            If padding the end of each waveform, the number of spectrogram frames worth of samples to append. The
            number of appended samples will be `pad_end_length * hop_length`.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether or not [`~UnivNetFeatureExtractor.__call__`] should return `attention_mask`.
    input_featuresnoise_sequencepadding_mask   ]          Fd         hann_window
   N&.>      ?h㈵>    '    ă@@   T)feature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionfilter_lengthmax_length_sfminfmax	mel_floorcentercompression_factorcompression_clip_valnormalize_minnormalize_maxmodel_in_channelspad_end_lengthc              	      s
  t  jf ||||d| || _|| _|| _|| _|| _|	| _|| _|d krZt	|d }|| _
|| _|
| _|
| | _| jd krt| j| _n| j| _| jd d | _t| j| jdd| _t| j| j| j| j
| jddd| _|| _|| _|| _|| _|| _|| _|| _d S )N)r#   r$   r%   return_attention_mask   r   T)Zwindow_lengthnameZperiodicZslaney)Znum_frequency_binsZnum_mel_filtersZmin_frequencyZmax_frequencyr$   ZnormZ	mel_scale)super__init__r&   r'   r(   r)   r*   r+   r-   floatr.   r/   r,   num_max_samplesr	   n_fftn_freqsr   windowr   r$   mel_filtersr0   r1   r2   r3   r4   r5   r6   )selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   kwargs	__class__ Z/tmp/pip-unpacked-wheel-zw5xktn0/transformers/models/univnet/feature_extraction_univnet.pyr;   e   sT    


z UnivNetFeatureExtractor.__init__c                 C   s   d|| j  | j| j    d S )Nr8   r   r3   r4   rB   r
   rF   rF   rG   	normalize   s    z!UnivNetFeatureExtractor.normalizec                 C   s   | j | j| j  |d d   S )Nr   r8   rH   rI   rF   rF   rG   denormalize   s    z#UnivNetFeatureExtractor.denormalize)waveformreturnc                 C   s   t j|t| j| j d t| j| j d fdd}t|| j| j| j| jd| jddd	}t t 	|d t 
|d  | j }t | jj|}t t j|| jdd| j }|jS )a  
        Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
        `int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.

        Args:
            waveform (`np.ndarray` of shape `(length,)`):
                The input waveform. This must be a single real-valued, mono waveform.

        Returns:
            `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
        r8   Zreflect)modeN)r@   Zframe_lengthr(   Z
fft_lengthpowerr0   rA   r/   )Za_minZa_max)nppadintr>   r(   r
   r@   r0   sqrtrealimagr/   matmulrA   TlogZclipr2   r1   )rB   rL   Zcomplex_spectrogramZamplitude_spectrogrammel_spectrogramZlog_mel_spectrogramrF   rF   rG   rY      s0    & z'UnivNetFeatureExtractor.mel_spectrogram)noise_length	generatorrM   c                 C   s0   |dkrt j }|| jf}|j|t jd}|S )a  
        Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
        [`UnivNetModel.forward`].

        Args:
            spectrogram_length (`int`):
                The length (dim 0) of the generated noise.
            model_in_channels (`int`, *optional*, defaults to `None`):
                The number of features (dim 1) of the generated noise. This should correspond to the
                `model_in_channels` of the [`UnivNetGan`] model. If not set, this will default to
                `self.config.model_in_channels`.
            generator (`numpy.random.Generator`, *optional*, defaults to `None`)
                An optional `numpy.random.Generator` random number generator to control noise generation. If not set, a
                new generator with fresh entropy will be created.

        Returns:
            `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
            model_in_channels)`.
        Ndtype)rP   randomZdefault_rngr5   Zstandard_normalfloat32)rB   rZ   r[   Znoise_shapenoiserF   rF   rG   generate_noise   s
    

z&UnivNetFeatureExtractor.generate_noise)rM   c                    s0   dd |D } dk	r, fddt |D }|S )a  
        Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
        audio waveform arrays and not a single tensor/array because in general the waveforms will have different
        lengths after removing padding.

        Args:
            waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                The batched output waveforms from the [`UnivNetModel`].
            waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
                The batched lengths of each waveform before padding.

        Returns:
            `List[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
        c                 S   s    g | ]}|     qS rF   )detachclonecpunumpy.0rL   rF   rF   rG   
<listcomp>  s     z8UnivNetFeatureExtractor.batch_decode.<locals>.<listcomp>Nc                    s    g | ]\}}|d  |  qS )NrF   )rg   irL   waveform_lengthsrF   rG   rh     s     )	enumerate)rB   Z	waveformsrk   rF   rj   rG   batch_decode  s    z$UnivNetFeatureExtractor.batch_decode)
raw_speechr$   padding
max_length
truncationpad_to_multiple_ofreturn_noiser[   pad_end
pad_lengthr&   r7   return_tensorsrM   c              
      sn  |dk	r|nj }|dk	rP|jkrZtdjj dj dj d| d	n
td t|tj	ort
|jdk}|rt
|jd	krtd
 |pt|ttfot|d tj	ttf}|rdd |D }nP|st|tj	stj|tjd}n.t|tj	r |jttjkr |tj}|s8tj|tjdg}|	rfdk	rLnjfdd|D }td|i}j|||dk	r|nj|||d}|d}fdd|D }t|d trdd |D |d< ndd |D |d< |d}|dk	rdd |D |d< |r6 fdd|d D }||d< |rVfdd|d D |d< |dk	rj||}|S )a  
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the input `raw_speech` waveforms (according to the model's padding side and
                padding index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).

                If `pad_end = True`, that padding will occur before the `padding` strategy is applied.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_noise (`bool`, *optional*, defaults to `True`):
                Whether to generate and return a noise waveform for use in [`UnivNetModel.forward`].
            generator (`numpy.random.Generator`, *optional*, defaults to `None`):
                An optional `numpy.random.Generator` random number generator to use when generating noise.
            pad_end (`bool`, *optional*, defaults to `False`):
                Whether to pad the end of each waveform with silence. This can help reduce artifacts at the end of the
                generated audio sample; see https://github.com/seungwonpark/melgan/issues/8 for more details. This
                padding will be done before the padding strategy specified in `padding` is performed.
            pad_length (`int`, *optional*, defaults to `None`):
                If padding the end of each waveform, the length of the padding in spectrogram frames. If not set, this
                will default to `self.config.pad_end_length`.
            do_normalize (`bool`, *optional*):
                Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve
                the performance for some models. If not set, this will default to `self.config.do_normalize`.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.np.array` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zIt is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.r   r8   z2Only mono-channel audio is supported for input to r   c                 S   s   g | ]}t j|t jd qS r\   rP   asarrayr_   )rg   ZspeechrF   rF   rG   rh     s     z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>r\   c                    s(   g | ] }t j|d  j fjdqS )r   )Zconstant_values)rP   rQ   r(   r%   rf   )ru   rB   rF   rG   rh     s   r   )ro   rp   rq   rr   r7   c                    s   g | ]}  |qS rF   )rY   rf   rB   rF   rG   rh     s     c                 S   s   g | ]}t j|t jd qS rx   ry   rg   ZmelrF   rF   rG   rh     s     c                 S   s   g | ]}| tjqS rF   )astyperP   r_   r|   rF   rF   rG   rh     s     attention_maskc                 S   s   g | ]}t j|t jd qS rx   )rP   rz   Zint32)rg   arrayrF   rF   rG   rh     s     r   c                    s   g | ]} |jd   qS )r   )ra   shaperg   r
   )r[   rB   rF   rG   rh     s   r   c                    s   g | ]}  |qS rF   )rJ   r   r{   rF   rG   rh     s    )r&   r$   
ValueErrorrE   __name__loggerwarning
isinstancerP   ndarraylenr   listtuplerz   r_   r]   Zfloat64r}   r6   r   rQ   r=   getr   Zconvert_to_tensors)rB   rn   r$   ro   rp   rq   rr   rs   r[   rt   ru   r&   r7   rv   Zis_batched_numpyZ
is_batchedZbatched_speechZpadded_inputsr   Zmel_spectrogramsr~   r`   rF   )r[   ru   rB   rG   __call__  sr    L
$""






z UnivNetFeatureExtractor.__call__c                    s4   t   }dddddg}|D ]}||kr||= q|S )Nr@   rA   r>   r?   r=   )r:   to_dict)rB   outputnamesr9   rD   rF   rG   r     s    
zUnivNetFeatureExtractor.to_dict)r   r   r   Fr   r   r   r   r   r   r   Nr   Fr   r   r    r!   r"   r   T)N)N)NTNTNTNFNNNN)r   
__module____qualname____doc__Zmodel_input_namesrR   r<   boolstrr   r;   rJ   rK   rP   r   rY   r^   	Generatorra   r   rm   r   r   r   r   r   r   r   r   __classcell__rF   rF   rD   rG   r      s   E
                     L3 
             "
 "r   )r   typingr   r   r   r   r   re   rP   Zaudio_utilsr   r	   r
   r   Z!feature_extraction_sequence_utilsr   Zfeature_extraction_utilsr   utilsr   r   r   Z
get_loggerr   r   r   rF   rF   rF   rG   <module>   s   
