"""PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation."""

from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...modeling_outputs import SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...utils.backbone_utils import load_backbone
from .configuration_upernet import UperNetConfig


_CONFIG_FOR_DOC = "UperNetConfig"


class UperNetConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.batch_norm(output)
        output = self.activation(output)

        return output


class UperNetPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        # Adaptive pooling to a fixed `pool_scale` output size, followed by a 1x1 conv block.
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            UperNetConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class UperNetPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (`Tuple[int]`):
            Pooling scales used in Pooling Pyramid Module.
        in_channels (`int`):
            Input channels.
        channels (`int`):
            Channels after modules, before conv_seg.
        align_corners (`bool`):
            align_corners argument of F.interpolate.
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = UperNetPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels)
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        # Pool at each scale, project with a 1x1 conv block, then upsample back to the input resolution.
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs


class UperNetHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).
    """

    def __init__(self, config, in_channels):
        super().__init__()

        self.config = config
        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = in_channels
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = UperNetPyramidPoolingModule(
            self.pool_scales, self.in_channels[-1], self.channels, align_corners=self.align_corners
        )
        self.bottleneck = UperNetConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels, self.channels, kernel_size=3, padding=1
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = UperNetConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = UperNetConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = UperNetConvModule(
            len(self.in_channels) * self.channels, self.channels, kernel_size=3, padding=1
        )

    def init_weights(self):
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        # upsample all levels to the finest resolution, concatenate and fuse
        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class UperNetFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://arxiv.org/abs/1411.4038).

    Args:
        config:
            Configuration.
        in_channels (int):
            Number of input channels.
        kernel_size (int):
            The kernel size for convs in the head. Default: 3.
        dilation (int):
            The dilation rate for convs in the head. Default: 1.
    """

    def __init__(
        self, config, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1
    ) -> None:
        super().__init__()

        self.config = config
        self.in_channels = config.auxiliary_in_channels
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = [
            UperNetConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        ]
        for i in range(self.num_convs - 1):
            convs.append(
                UperNetConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = UperNetConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def init_weights(self):
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


class UperNetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = UperNetConfig
    main_input_name = "pixel_values"
    _no_split_modules = []

    def _init_weights(self, module):
        if isinstance(module, UperNetPreTrainedModel):
            module.backbone.init_weights()
            module.decode_head.init_weights()
            if module.auxiliary_head is not None:
                module.auxiliary_head.init_weights()

    def init_weights(self):
        """Initialize the weights"""
        self.backbone.init_weights()
        self.decode_head.init_weights()
        if self.auxiliary_head is not None:
            self.auxiliary_head.init_weights()


UPERNET_START_DOCSTRING = r"""
    Parameters:
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.
        config ([`UperNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

UPERNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
            `attentions` under returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
            returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    """UperNet framework leveraging any vision backbone e.g. for ADE20k, CityScapes.""",
    UPERNET_START_DOCSTRING,
)
class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.backbone = load_backbone(config)

        # Semantic segmentation head(s)
        self.decode_head = UperNetHead(config, in_channels=self.backbone.channels)
        self.auxiliary_head = UperNetFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(UPERNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny")
        >>> model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-tiny")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/fixtures_ade20k", filename="ADE_val_00000001.jpg", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits  # shape (batch_size, num_labels, height, width)
        >>> list(logits.shape)
        [1, 150, 512, 512]
        ```"""
        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )
        features = outputs.feature_maps

        logits = self.decode_head(features)
        logits = nn.functional.interpolate(logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)
            auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False
            )

        loss = None
        if labels is not None:
            # compute weighted loss
            loss_fct = CrossEntropyLoss(ignore_index=self.config.loss_ignore_index)
            loss = loss_fct(logits, labels)
            if auxiliary_logits is not None:
                auxiliary_loss = loss_fct(auxiliary_logits, labels)
                loss += self.config.auxiliary_loss_weight * auxiliary_loss

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
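

# Illustrative only (not part of the upstream module): a minimal smoke test, assuming the default
# `UperNetConfig` builds a small, randomly initialized ResNet backbone and that disabling the
# auxiliary head keeps channel counts consistent with that backbone. The guard ensures this code
# only runs when the file is executed as a script, never on import.
if __name__ == "__main__":
    config = UperNetConfig(use_auxiliary_head=False)
    model = UperNetForSemanticSegmentation(config).eval()
    pixel_values = torch.randn(1, 3, 224, 224)  # random batch of one "image"
    with torch.no_grad():
        outputs = model(pixel_values)
    # Logits are upsampled to the input resolution: (batch_size, num_labels, height, width).
    print(tuple(outputs.logits.shape))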