"""
Processor class for IDEFICS.
"""

from typing import Callable, List, Optional, Union
from urllib.parse import urlparse

from ...feature_extraction_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
from ...utils import is_tf_available, is_torch_available


if is_torch_available():
    import torch

if is_tf_available():
    import tensorflow as tf


IMAGE_TOKEN = "<image>"


def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1):
    # Set elements >= num_classes to -1
    if num_classes != -1:
        if return_tensors == "pt":
            incremental_mask[incremental_mask >= num_classes] = -1
        elif return_tensors == "tf":
            incremental_mask = tf.where(incremental_mask >= num_classes, -1, incremental_mask)

    # One-hot the incremental image indices, then zero out the rows that were -1
    # (positions that attend to no image)
    if return_tensors == "pt":
        negatives = incremental_mask == -1
        incremental_mask[negatives] = 0
        attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
        attn_mask[negatives, :] = 0
    elif return_tensors == "tf":
        negatives = tf.equal(incremental_mask, -1)
        incremental_mask = tf.where(negatives, 0, incremental_mask)
        attn_mask = tf.one_hot(incremental_mask, depth=num_classes)
        # Expand `negatives` to [batch_size, seq_length, 1] so it broadcasts over the one-hot axis
        negatives_expanded = tf.expand_dims(negatives, -1)
        attn_mask = tf.where(negatives_expanded, tf.zeros_like(attn_mask), attn_mask)

    return attn_mask
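
# Example of the conversion above: an incremental mask of [-1, 0, 0, 1, 1]
# ("no image yet", image 0, image 0, image 1, image 1) converted with
# num_classes=2 yields the binary mask
# [[0, 0], [1, 0], [1, 0], [0, 1], [0, 1]].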


def image_attention_mask_for_packed_input_ids(input_ids, tokenizer, return_tensors):
    if return_tensors == "pt":
        return image_attention_mask_for_packed_input_ids_pt(input_ids, tokenizer)
    elif return_tensors == "tf":
        return image_attention_mask_for_packed_input_ids_tf(input_ids, tokenizer)


def image_attention_mask_for_packed_input_ids_pt(input_ids, tokenizer):
    image_attention_mask = torch.full_like(input_ids, fill_value=-1)
    next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)
    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
    eod_token_id = tokenizer.eos_token_id
    # Forward pass: each position attends to the most recent image before it,
    # resetting after an end-of-document token
    for batch_idx in range(input_ids.size(0)):
        count = -1
        seen_eod = False
        for idx, token_id in enumerate(input_ids[batch_idx]):
            if token_id == image_token_id:
                count += 1
                image_attention_mask[batch_idx][idx] = count
                seen_eod = False
            else:
                image_attention_mask[batch_idx][idx] = count

            if seen_eod:
                image_attention_mask[batch_idx][idx] = -1

            if token_id == eod_token_id:
                seen_eod = True

    # Backward pass: each position attends to the next image after it
    for batch_idx in range(input_ids.size(0)):
        count = -1
        seen_eod = False
        for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
            token_id = input_ids[batch_idx][idx]
            if token_id == image_token_id:
                count += 1
                next_image_attention_mask[batch_idx][idx] = count
                seen_eod = False
            else:
                next_image_attention_mask[batch_idx][idx] = count

            if token_id == eod_token_id:
                seen_eod = True

            if seen_eod:
                next_image_attention_mask[batch_idx][idx] = -1

        non_negative_indices = next_image_attention_mask[batch_idx] != -1
        next_image_attention_mask[batch_idx][non_negative_indices] -= count
        next_image_attention_mask[batch_idx][non_negative_indices] *= -1

    return image_attention_mask, next_image_attention_mask
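
# For example, for packed input ids corresponding to "<s> A <image> B <eos> C <image> D",
# the forward mask is [-1, -1, 0, 0, 0, -1, 1, 1]: each position points at the most
# recent image index, and positions after an <eos> attend to no image (-1) until the
# next <image> token appears.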


def image_attention_mask_for_packed_input_ids_tf(input_ids, tokenizer):
    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
    eod_token_id = tokenizer.eos_token_id
    batch_size = tf.shape(input_ids)[0]
    image_attention_mask = tf.fill(tf.shape(input_ids), -1)
    next_image_attention_mask = tf.fill(tf.shape(input_ids), -1)

    for batch_idx in range(batch_size):
        count = -1
        seen_eod = False
        seq_length = tf.shape(input_ids)[1]

        # Scan the sequence right to left, scattering image indices into the masks
        for idx in range(seq_length - 1, -1, -1):
            token_id = input_ids[batch_idx, idx].numpy()
            if token_id == image_token_id:
                count += 1
                indices = [[batch_idx, idx]]
                updates = [count]
                image_attention_mask = tf.tensor_scatter_nd_update(image_attention_mask, indices, updates)
                next_image_attention_mask = tf.tensor_scatter_nd_update(next_image_attention_mask, indices, updates)
            elif token_id == eod_token_id and not seen_eod:
                seen_eod = True
                count = 0
                indices = [[batch_idx, idx]]
                updates = [count]
                next_image_attention_mask = tf.tensor_scatter_nd_update(next_image_attention_mask, indices, updates)
            if seen_eod and token_id != eod_token_id:
                indices = [[batch_idx, idx]]
                updates = [-1]
                next_image_attention_mask = tf.tensor_scatter_nd_update(next_image_attention_mask, indices, updates)

    return image_attention_mask, next_image_attention_mask


def is_url(string):
    """Checks if the passed string contains a valid URL and nothing else. For example, if a space is included, the
    URL is immediately invalidated."""
    if " " in string:
        return False
    result = urlparse(string)
    return all([result.scheme, result.netloc])
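
# E.g. is_url("https://example.com/cat.png") is True, while is_url("foo bar")
# (contains a space) and is_url("example.com") (no scheme) are False.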


class IdeficsProcessor(ProcessorMixin):
    r"""
    Constructs an IDEFICS processor which wraps a LLaMA tokenizer and an IDEFICS image processor into a single
    processor.

    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`IdeficsImageProcessor`):
            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
        tokenizer (`LlamaTokenizerFast`):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
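
    Example (a minimal sketch mirroring the example in [`~IdeficsProcessor.__call__`], which also documents the
    checkpoint and image URL used here):

    ```python
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")
    prompts = [
        "User:",
        "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
        "Describe this image.\nAssistant:",
    ]
    inputs = processor(prompts, return_tensors="pt")
    ```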
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["image_size", "add_end_of_utterance_token"]
    image_processor_class = "IdeficsImageProcessor"
    tokenizer_class = "LlamaTokenizerFast"

    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
        self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)

        self.default_image_dims = (
            self.image_processor.image_num_channels,
            self.image_processor.image_size,
            self.image_processor.image_size,
        )

        self.tokenizer_was_trained_with_end_of_utterance_token = (
            "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
        )

    def __call__(
        self,
        prompts: Union[List[TextInput], List[List[TextInput]]],
        padding: Union[bool, str, PaddingStrategy] = "longest",
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        transform: Callable = None,
        add_eos_token=False,
        add_end_of_utterance_token=None,
        debug=False,
        return_tensors="pt",
    ) -> BatchEncoding:
        """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
        the model was trained on and prepares the image pixel values for the model to process.

        Args:
            prompts (`Union[List[TextInput], List[List[TextInput]]]`):
                either a single prompt or a batched list of prompts - see the detailed description immediately after
                the end of the arguments doc section.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different
                  lengths.
                Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"`
                  by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            transform (`Callable`, *optional*):
                A custom transform function that accepts a single image can be passed for training. For example,
                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
                set of transforms will be applied to the images.
            add_eos_token (`bool`, *optional*, defaults to `False`):
                Adds `eos_token` at the end of the final prompt if `True`.
            add_end_of_utterance_token (`bool`, *optional*):
                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
                image). If `None` the tokenizer will be checked instead and if this token is found in
                `additional_special_tokens` then the value will be `True`.
            debug (`bool`, *optional*, defaults to `False`):
                `True` value will help debug prompt generation by dumping useful information
            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
                The type of tensors to return. Can be one of:
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.

        Returns:
            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
            directly passed to `model.generate`

        Detailed explanation:

        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.

        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
        entry into the prompt.

        Example:

        ```python
        checkpoint = "HuggingFaceM4/idefics-9b"
        processor = AutoProcessor.from_pretrained(checkpoint)
        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
        img = processor.image_processor.fetch_images([url])[0]

        prompts = [
            "User:",
            img,
            "Describe this image.
Assistant: An image of two kittens in grass.
",
            "User:",
            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
            "Describe this image.
Assistant:",
        ]

        inputs = processor(prompts, return_tensors="pt")
        generated_ids = model.generate(**inputs, max_length=100)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```

        In this example the `prompts` will be converted into:

        ```
        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant: An image of two kittens in grass.
        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant:
        ```

        and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
        `pixel_values` dict entry of the return value.

        This example also demonstrates that images can be passed as objects or as text URLs: the first image is
        passed as an object and the second one as a URL.

        To prepare inputs for training, do:

        ```python
        image_transform = transforms.Compose(
            [
                transforms.RandomResizedCrop(
                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
                ),
                transforms.ToTensor(),
                transforms.Normalize(mean=self.image_mean, std=self.image_std),
            ]
        )
        inputs = processor(prompts, transform=image_transform, return_tensors="pt")
        ```

        To help debug prompt generation, enable `debug=True`, which will show you what's happening.

        """
        # if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
        if add_end_of_utterance_token is None:
            add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token

        # turn non-batched prompts into batched
        if not any(isinstance(i, list) for i in prompts):
            prompts = [prompts]

        fake_token = "<fake_token_around_image>"
        image_token = "<image>"
        end_of_utterance_token = "<end_of_utterance>"

        def image_tokens(last_was_image):
            if last_was_image:
                return image_token + fake_token
            else:
                return fake_token + image_token + fake_token

        all_prompts = []
        all_images = []
        for sample in prompts:
            # the model was trained on samples starting with <s>
            full_text = f"{self.tokenizer.bos_token}"

            # an image can either be an image object in the item or the url; everything else is a verbatim prompt text
            image_objects = []
            last_was_image = False
            last_was_text = False
            for i, item in enumerate(sample):
                if i > 0:
                    last_was_text = True if not last_was_image else False

                if isinstance(item, str):
                    item = item.strip(" ")
                    if is_url(item):
                        image = self.image_processor.fetch_images(item)
                        full_text += image_tokens(last_was_image)
                        image_objects.append(image)
                        last_was_image = True
                    else:
                        # we add end_of_utterance_token between each subsequent text prompt (but not at the last one!)
                        if add_end_of_utterance_token and last_was_text:
                            full_text += end_of_utterance_token
                        full_text += item
                        last_was_image = False
                else:
                    # must be an image obj
                    full_text += image_tokens(last_was_image)
                    image_objects.append(item)
                    last_was_image = True

            if add_eos_token:
                full_text += self.tokenizer.eos_token

            if debug is True:
                print(f"{full_text=}")

            image_objects = self.image_processor(image_objects, transform=transform, return_tensors=return_tensors)

            all_prompts.append(full_text)
            all_images.append(image_objects)

        text_encoding = self.tokenizer(
            text=all_prompts,
            add_special_tokens=False,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
        )
        all_texts = text_encoding["input_ids"]
        all_attention_masks = text_encoding["attention_mask"]

        # max_num_images has to be at least 1 even when there are no images
        max_num_images = max(len(x) for x in all_images)
        max_num_images = max(1, max_num_images)

        at_least_one_image = sum(len(x) for x in all_images) > 0
        output_input_ids = []
        output_images = []
        output_attention_masks = []
        for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images):
            padded_input_ids = text

            image_count = padded_input_ids.count(self.image_token_id)
            local_max_num_images = min(image_count, max_num_images)

            current_images = images[:local_max_num_images]

            if len(current_images) > 0:
                if return_tensors == "pt":
                    padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
                    padded_image_tensor[: current_images.size(0)] = current_images
                elif return_tensors == "tf":
                    # pad along the first (image) dimension up to max_num_images
                    image_shape = tf.shape(current_images)[1:]
                    padded_shape = tf.concat([[max_num_images], image_shape], axis=0)
                    padded_image_tensor = tf.zeros(padded_shape, dtype=current_images.dtype)
                    num_images = tf.shape(current_images)[0]
                    indices = tf.reshape(tf.range(num_images), (-1, 1))
                    padded_image_tensor = tf.tensor_scatter_nd_update(padded_image_tensor, indices, current_images)
            else:
                if return_tensors == "pt":
                    padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
                elif return_tensors == "tf":
                    padded_image_tensor = tf.zeros((max_num_images, *self.default_image_dims))

            output_images.append(padded_image_tensor)
            if return_tensors == "pt":
                output_input_ids.append(torch.tensor(padded_input_ids))
                output_attention_masks.append(torch.tensor(attention_mask))
            elif return_tensors == "tf":
                output_input_ids.append(tf.convert_to_tensor(padded_input_ids, dtype=tf.int32))
                output_attention_masks.append(attention_mask)

        if return_tensors == "pt":
            output_input_ids = torch.stack(output_input_ids)
            output_images = torch.stack(output_images)
            output_attention_masks = torch.stack(output_attention_masks)
        elif return_tensors == "tf":
            output_input_ids = tf.stack(output_input_ids)
            output_images = tf.stack(output_images)
            output_attention_masks = tf.stack(output_attention_masks)

        if at_least_one_image:
            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(
                output_input_ids, self.tokenizer, return_tensors
            )
            image_attention_mask = incremental_to_binary_attention_mask(
                image_attention_mask, return_tensors, num_classes=max_num_images
            )
        else:
            # in full language mode we set the image mask to all-0s
            if return_tensors == "pt":
                image_attention_mask = torch.zeros(
                    output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
                )
            elif return_tensors == "tf":
                image_attention_mask = tf.zeros(
                    (output_input_ids.shape[0], output_input_ids.shape[1], 1), dtype=tf.bool
                )

        return BatchFeature(
            data={
                "input_ids": output_input_ids,
                "attention_mask": output_attention_masks,
                "pixel_values": output_images,
                "image_attention_mask": image_attention_mask,
            }
        )

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r"   batch_decoderL   argsrM   r   r   r   r{     s    zIdeficsProcessor.batch_decodec                 O   s   | j j||S )z
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r"   decoder|   r   r   r   r~     s    zIdeficsProcessor.decodec                 C   s"   | j j}| jj}tt|| S rW   )r"   model_input_namesrA   rY   dictfromkeys)rL   Ztokenizer_input_namesZimage_processor_input_namesr   r   r   r     s    z"IdeficsProcessor.model_input_names)NrD   N)rP   NNNFNFr   )__name__
__module____qualname____doc__
attributesZvalid_kwargsZimage_processor_classZtokenizer_classrH   r   r   r   rt   ri   r   r   r   intr   r
   rz   r{   r~   propertyr   __classcell__r   r   rN   r   r@      s8             r@   )r   )r   typingr   r   r   r   urllib.parser   Zfeature_extraction_utilsr   Zprocessing_utilsr	   Ztokenization_utils_baser
   r   r   r   utilsr   r   r   Z
tensorflowr   r&   r   r#   r   r    r?   r@   r   r   r   r   <module>   s"   
/!	