from typing import List, Union

from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class ImageToTextPipeline(Pipeline):
    """
    Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).

    This image to text pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"image-to-text"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        # `prompt` and `timeout` are consumed by `preprocess`; `max_new_tokens` and any
        # extra `generate_kwargs` are forwarded to `generate` in `_forward`.
        forward_params = {}
        preprocess_params = {}

        if prompt is not None:
            preprocess_params["prompt"] = prompt
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs`;"
                    " please specify it only once"
                )
            forward_params.update(generate_kwargs)

        return preprocess_params, forward_params, {}
    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
        """
        Generate text (such as a caption) for the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. By default, the `generate` default is used.

            generate_kwargs (`Dict`, *optional*):
                Keyword arguments passed directly to `generate`, allowing full control over the generation call.
            timeout (`float`, *optional*, defaults to `None`):
                The maximum time in seconds to wait for fetching images from the web. If `None`, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
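
        Example of passing generation parameters (an illustrative sketch; the
        `max_new_tokens` value here is arbitrary):

        ```python
        >>> from transformers import pipeline

        >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
        >>> captions = captioner(
        ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
        ...     max_new_tokens=10,
        ... )
        ```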
        """
        return super().__call__(images, **kwargs)

    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        if prompt is not None:
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            model_type = self.model.config.model_type

            if model_type == "git":
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                # GIT expects the CLS token at the beginning of the prompt sequence.
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})

            elif model_type == "pix2struct":
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)

            elif model_type != "vision-encoder-decoder":
                # vision-encoder-decoder does not support conditional generation
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)

            else:
                raise ValueError(f"Model type {model_type} does not support conditional text generation")

        else:
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)
            if self.framework == "pt":
                model_inputs = model_inputs.to(self.torch_dtype)

        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        return model_inputs

    def _forward(self, model_inputs, **generate_kwargs):
        # GIT sets `input_ids` to `None` in `preprocess` when no prompt is given; batching
        # collates these into a list of `None`s, which `generate` cannot handle, so collapse
        # that list back into a single `None` first.
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        # `generate` expects the main input (e.g. `pixel_values`) as its first positional
        # argument, so pop it out and pass the remaining tensors as keyword arguments.
        inputs = model_inputs.pop(self.model.main_input_name)
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        return model_outputs

    def postprocess(self, model_outputs):
        records = []
        for output_ids in model_outputs:
            record = {"generated_text": self.tokenizer.decode(output_ids, skip_special_tokens=True)}
            records.append(record)
        return records