from collections import UserDict
from typing import Union

import numpy as np
import requests

from ..utils import add_end_docstrings, logging
from .audio_classification import ffmpeg_read
from .base import Pipeline, build_pipeline_init_args


logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True))
class ZeroShotAudioClassificationPipeline(Pipeline):
    """
    Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
    provide an audio and a set of `candidate_labels`.

    <Tip warning={true}>

    The default `hypothesis_template` is `"This is a sound of {}."`. Make sure you update it for your usage.

    </Tip>

    Example:
    ```python
    >>> from transformers import pipeline
    >>> from datasets import load_dataset

    >>> dataset = load_dataset("ashraq/esc50")
    >>> audio = next(iter(dataset["train"]["audio"]))["array"]
    >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
    [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}]
    ```


    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). This audio
    classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-audio-classification"`. See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        if self.framework != "pt":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

    def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs):
        """
        Assign labels to the audio(s) passed as inputs.

        Args:
            audios (`str`, `List[str]`, `np.array` or `List[np.array]`):
                The pipeline handles three types of inputs:
                - A string containing an HTTP link pointing to an audio
                - A string containing a local path to an audio
                - An audio loaded in numpy
            candidate_labels (`List[str]`):
                The candidate labels for this audio
            hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}."`):
                The sentence used in conjunction with *candidate_labels* to attempt the audio classification by
                replacing the placeholder with the candidate_labels. The likelihood is then estimated by using
                `logits_per_audio`.
        Return:
            A list of dictionaries containing one result per proposed label. The dictionaries contain the
            following keys:
            - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`.
            - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1).
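
        Example (an illustrative sketch, reusing the checkpoint from the class-level example; the audio file
        name below is a hypothetical placeholder, so the call is not doctested):

        ```python
        >>> from transformers import pipeline

        >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
        >>> classifier(
        ...     "dog_bark.wav",  # hypothetical local file; an http(s) URL or a 1-D numpy array also works
        ...     candidate_labels=["Sound of a dog", "Sound of a bird"],
        ...     hypothesis_template="This is a sound of {}.",
        ... )  # doctest: +SKIP
        ```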
        """
        return super().__call__(audios, **kwargs)

    def _sanitize_parameters(self, **kwargs):
        preprocess_params = {}
        if "candidate_labels" in kwargs:
            preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
        if "hypothesis_template" in kwargs:
            preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]

        return preprocess_params, {}, {}

    def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."):
        if isinstance(audio, str):
            if audio.startswith("http://") or audio.startswith("https://"):
                # Download the raw audio bytes from a remote URL before decoding.
                audio = requests.get(audio).content
            else:
                with open(audio, "rb") as f:
                    audio = f.read()

        if isinstance(audio, bytes):
            audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate)

        if not isinstance(audio, np.ndarray):
            raise TypeError("We expect a numpy ndarray as input")
        if len(audio.shape) != 1:
            raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline")

        inputs = self.feature_extractor(
            [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
        )
        if self.framework == "pt":
            inputs = inputs.to(self.torch_dtype)
        inputs["candidate_labels"] = candidate_labels
        sequences = [hypothesis_template.format(x) for x in candidate_labels]
        text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True)
        inputs["text_inputs"] = [text_inputs]
        return inputs

    def _forward(self, model_inputs):
        candidate_labels = model_inputs.pop("candidate_labels")
        text_inputs = model_inputs.pop("text_inputs")
        if isinstance(text_inputs[0], UserDict):
            text_inputs = text_inputs[0]
        else:
            # Batching case.
            text_inputs = text_inputs[0][0]

        outputs = self.model(**text_inputs, **model_inputs)

        model_outputs = {
            "candidate_labels": candidate_labels,
            "logits": outputs.logits_per_audio,
        }
        return model_outputs
    def postprocess(self, model_outputs):
        candidate_labels = model_outputs.pop("candidate_labels")
        logits = model_outputs["logits"][0]

        if self.framework == "pt":
            probs = logits.softmax(dim=0)
            scores = probs.tolist()
        else:
            raise ValueError("`tf` framework not supported.")

        result = [
            {"score": score, "label": candidate_label}
            for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
        ]
        return result
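
# --- Usage sketch (not part of the original module) --------------------------
# A minimal, hedged example of how this pipeline is typically driven through
# `transformers.pipeline`. The checkpoint name matches the one in the class
# docstring; "dog_bark.wav" is a hypothetical local file used purely for
# illustration. Guarded so that importing the module never executes it.
if __name__ == "__main__":
    from transformers import pipeline

    classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")

    # A local path, an http(s) URL, or a 1-D numpy array are all accepted by
    # `preprocess` above; raw bytes are decoded with `ffmpeg_read`.
    predictions = classifier(
        "dog_bark.wav",  # hypothetical file name
        candidate_labels=["Sound of a dog", "Sound of a vacuum cleaner"],
        hypothesis_template="This is a sound of {}.",
    )
    print(predictions)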