U
    DAfa                  	   @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZm Z  e!e"Z#ed
d*Z$d dl%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ W 5 Q R X eG dd dZ,dS )    N)defaultdict)Path)AnyDictListLiteralOptionalUnion)Document	componentdefault_from_dictdefault_to_dictlogging)get_bytestream_from_sourcenormalize_metadata)
ByteStream)
LazyImport)Secretdeserialize_secrets_inplacez4Run 'pip install "azure-ai-formrecognizer>=3.2.0b2"')message)AnalyzeResultDocumentAnalysisClientDocumentLineDocumentParagraph)AzureKeyCredentialc                
   @   s  e Zd ZdZedddddddfeeeeeee	d e
e d	d
dZejee ee dd.eeeeef  e
eeeef   dddZeeef dddZeeeef d dddZde
eeef  ee dddZde
eeef  ee dddZde
eeef  edddZd/de
eeef  eedd d!Zded"d#d$Zeed% ed&d'd(Z d0e!j"ed+d,d-Z#dS )1AzureOCRDocumentConvertera  
    Convert files to documents using Azure's Document Intelligence service.

    Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.

    In order to be able to use this component, you need an active Azure account
    and a Document Intelligence or Cognitive Services resource. Follow the steps described in the [Azure documentation]
    (https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
    to set up your resource.

    Usage example:
    ```python
    from haystack.components.converters import AzureOCRDocumentConverter
    from haystack.utils import Secret

    converter = AzureOCRDocumentConverter(endpoint="<url>", api_key=Secret.from_token("<your-api-key>"))
    results = converter.run(sources=["path/to/doc_with_images.pdf"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the PDF file.'
    ```
    ZAZURE_AI_API_KEYzprebuilt-read   Tnatural皙?)r   single_column)endpointapi_keymodel_idpreceding_context_lenfollowing_context_lenmerge_multiple_column_headerspage_layoutthreshold_yc	           	      C   sp   t   t|t| pdd| _|| _|| _|| _|| _	|| _
|| _|| _|| _| jdkrl| jdkrld| _dS )a  
        Create an AzureOCRDocumentConverter component.

        :param endpoint:
            The endpoint of your Azure resource.
        :param api_key:
            The key of your Azure resource.
        :param model_id:
            The model ID of the model you want to use. Please refer to [Azure documentation]
            (https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
            for a list of available models. Default: `"prebuilt-read"`.
        :param preceding_context_len: Number of lines before a table to extract as preceding context
            (will be returned as part of metadata).
        :param following_context_len: Number of lines after a table to extract as subsequent context (
            will be returned as part of metadata).
        :param merge_multiple_column_headers: Some tables contain more than one row as a column header
            (i.e., column description).
            This parameter lets you choose, whether to merge multiple column header rows to a single row.
        :param page_layout: The type reading order to follow. If "natural" is chosen then the natural reading order
            determined by Azure will be used. If "single_column" is chosen then all lines with the same height on the
            page will be grouped together based on a threshold determined by `threshold_y`.
        :param threshold_y: The threshold to determine if two recognized elements in a PDF should be grouped into a
            single line. This is especially relevant for section headers or numbers which may be spacially separated
            on the horizontal axis from the remaining text. The threshold is specified in units of inches.
            This is only relevant if "single_column" is chosen for `page_layout`.
         )r    Z
credentialr   Nr   )azure_importcheckr   r   Zresolve_valuedocument_analysis_clientr    r"   r!   r#   r$   r%   r&   r'   )	selfr    r!   r"   r#   r$   r%   r&   r'    r-   H/tmp/pip-unpacked-wheel-z752163x/haystack/components/converters/azure.py__init__4   s    % z"AzureOCRDocumentConverter.__init__	documentsZraw_azure_responseN)sourcesmetac                 C   s   g }g }t |t|d}t||D ]\}}zt|d}W n: tk
rr }	 ztjd||	d W Y q"W 5 d}	~	X Y nX | jj| j	|j
d}
|
 }||  |j|}| j||d}|| q"||dS )	az  
        Convert a list of files to Documents using Azure's Document Intelligence service.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be
            zipped. If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: List of created Documents
            - `raw_azure_response`: List of raw Azure responses used to create the Documents
        )r3   Zsources_count)sourcez4Could not read {source}. Skipping it. Error: {error})r4   errorN)r"   documentresultr3   r0   )r   lenzipr   	Exceptionloggerwarningr+   Zbegin_analyze_documentr"   datar8   appendto_dictr3   _convert_tables_and_textextend)r,   r2   r3   r1   Zazure_outputZ	meta_listr4   metadataZ
bytestreameZpollerr8   Zmerged_metadatadocsr-   r-   r.   runi   s&     
zAzureOCRDocumentConverter.run)returnc                 C   s.   t | | j | j| j| j| j| j| j| j	d	S )z{
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        )r!   r    r"   r#   r$   r%   r&   r'   )
r   r!   r@   r    r"   r#   r$   r%   r&   r'   )r,   r-   r-   r.   r@      s    z!AzureOCRDocumentConverter.to_dict)r>   rG   c                 C   s   t |d dgd t| |S )z
        Deserializes the component from a dictionary.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized component.
        Zinit_parametersr!   )keys)r   r   )clsr>   r-   r-   r.   	from_dict   s    
z#AzureOCRDocumentConverter.from_dictr   )r8   r3   rG   c                 C   sX   | j ||d}| jdkr(| j||d}n"t| jts8t| j||| jd}||f}|S )a  
        Converts the tables and text extracted by Azure's Document Intelligence service into Haystack Documents.

        :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
            can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
            Can be any custom keys and values.
        :returns: List of Documents containing the tables and text extracted from the AnalyzeResult object.
        r7   r   )r8   r3   r'   )_convert_tablesr&   _convert_to_natural_text
isinstancer'   floatAssertionError_convert_to_single_column_text)r,   r8   r3   tablestextrE   r-   r-   r.   rA      s    


z2AzureOCRDocumentConverter._convert_tables_and_textc              	      s   g }|j s|S |j D ]  fddt jD }t }d}d}t jD ] \}}	|	jdd|	_|	jdd|	_|dkr|	j j	kr|	j}d}|
d qJ|	jr|	jnd}
t|
D ]}|	jr|	jnd}t|D ]v}| jr*|	jdkr*|	j|kr*|d |	j|   d	|	j 7  < ||	j|  q|	j||	j| |  |	j| < qqqJt|d
dD ]}||= qZ jrt fdd|jD }nd} jd j|r|jrfdd|jD }ng }d	|| j d d	|  }| } jrt jdkr|}n& jr*t fdd|jD }nd} jd j |rb|jrbfdd|jD }ng }d	|d| j }t|}t |t!r||d< ||d< n
||d} jrĈ jd j"|d< t#j$|d |dd d}| %|}| | }t&'|( ) }|*t+|||d q|S )av  
        Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents.

        :param result: The AnalyzeResult Azure object
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.

        :returns: List of Documents containing the tables extracted from the AnalyzeResult object.
        c                    s   g | ]}d g j  qS )r(   )column_count).0_tabler-   r.   
<listcomp>   s     z=AzureOCRDocumentConverter._convert_tables.<locals>.<listcomp>r(   r   z
:selected:z:unselected:   ZcolumnHeader
T)reversec                 3   s$   | ]}|j  jd  j kr|V  qdS )r   Npage_numberbounding_regionsrT   pagerV   r-   r.   	<genexpr>   s     z<AzureOCRDocumentConverter._convert_tables.<locals>.<genexpr>Nc                    s"   g | ]}|j d  j k r|jqS r   spansoffsetcontentrT   line)table_start_offsetr-   r.   rX     s     c                 3   s$   | ]}|j  jd  j kr|V  qdS )Nr\   r_   rV   r-   r.   ra     s     c                    s"   g | ]}|j d  j kr|jqS rb   rc   rg   )table_end_offsetr-   r.   rX     s     preceding_contextfollowing_context)rl   rm   r`   )columnsr>   )idZ	dataframer3   ),rQ   rangeZ	row_countset	enumeratecellsrf   replacecolumn_spanrS   poprow_spanr%   kindZ	row_indexZcolumn_indexaddsortedr^   nextpagesrd   re   linesjoinr#   stripr9   lengthr$   copydeepcopyrM   dictr]   pd	DataFrame_hash_dataframehashlibsha256encode	hexdigestr?   r
   )r,   r8   r3   Zconverted_tablesZ
table_listZadditional_column_header_rowscaptionZrow_idx_startidxcellru   crw   rZrow_idxZtable_beginning_pageZpreceding_linesrl   Ztable_end_pageZfollowing_linesrm   Z
table_metaZtable_dfZ	pd_hashesr>   Zdoc_idr-   )rW   rk   ri   r.   rK      s    	
"$


 





z)AzureOCRDocumentConverter._convert_tablesc                 C   s   | j |d}g }|jrtt}|jD ]p}|jr@dd |jD }n|rTt| d nd}|g}||d  }	| j|	|drzq$||d   |jd 7  < q$t	|}
t
d|
d D ]}||d	}|| qn
td
 d|}t||r|ni dS )a(  
        This converts the `AnalyzeResult` object into a single document.

        We add "" separators between to differentiate between the text on separate pages. This is the expected format
        for the PreProcessor.

        :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
            can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
            Can be any custom keys and values.
        :returns: A single Document containing all the text extracted from the AnalyzeResult object.
        r8   c                 S   s   g | ]
}|j qS r-   )r]   )rT   br-   r-   r.   rX   I  s     zFAzureOCRDocumentConverter._convert_to_natural_text.<locals>.<listcomp>rj   rY   r   line_or_paragraphrZ   r(   z7No text paragraphs were detected by the OCR conversion.rf   r3   )_collect_table_spansZ
paragraphsr   strr^   rz   rH   _check_if_in_tablerf   maxrp   getr?   r<   r=   r~   r
   )r,   r8   r3   table_spans_by_pagetextsZparagraphs_to_pagesZ	paragraphZpage_numbersZcurrent_last_page_numbertables_on_pageZmax_page_numberpage_idx	page_textall_textr-   r-   r.   rL   4  s(    


z2AzureOCRDocumentConverter._convert_to_natural_text)r8   r3   r'   rG   c                    s  j |d}tt}t|jD ]\}}|jr2|jng  tdd  D rtt D ]} | j	\}	}
}
}
|| 
||g t|d t D ]D} | j	\}}
}
}
t|	d |d  |k }|r|| 
||g qqTqtdj|d tt D ]}|| 
||g qqi }|D ]8}t }|||  dd tt|D ||< qi }t|jD ]X\}}g }|jrz|jng  ||g D ]"} fd	d|D }|
| q|||< q`i }t|jD ]<\}}
g }|| D ]}|
t|d
d d q|||< qi }t|jD ]&\}}
t|| dd d}|||< qg }t|jD ]p\}}||j d}|| D ]D}tfdd|D rqh|ddd |D 7 }|d7 }qh|
| qJd|}t||r|ni dS )a  
        This converts the `AnalyzeResult` object into a single Haystack Document.

        We add "" separators between to differentiate between the text on separate pages. This is the expected format
        for the PreProcessor.

        :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
            can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
            Can be any custom keys and values.
        :param threshold_y: height threshold in inches for PDF and pixels for images
        :returns: A single Document containing all the text extracted from the AnalyzeResult object.
        r   c                 s   s   | ]}|j d k	V  qd S Npolygonrg   r-   r-   r.   ra   x  s     zKAzureOCRDocumentConverter._convert_to_single_column_text.<locals>.<genexpr>rY   zPolygon information for lines on page {page_idx} is not available so it is not possible to enforce a single column page layout.)r   c                 S   s   g | ]}t |qS r-   )list)rT   ar-   r-   r.   rX     s     zLAzureOCRDocumentConverter._convert_to_single_column_text.<locals>.<listcomp>c                    s   g | ]} | qS r-   r-   )rT   Zline_idx)r}   r-   r.   rX     s     c                 S   s   | j d d S )Nr   r   xr-   r-   r.   <lambda>      zJAzureOCRDocumentConverter._convert_to_single_column_text.<locals>.<lambda>)keyc                 S   s   | d j d d S )Nr   rY   r   r   r-   r-   r.   r     r   r(   c                 3   s   | ]} j |d V  qdS )r   N)r   rg   )r,   r   r-   r.   ra     s      c                 s   s   | ]}|j V  qd S r   )rf   rg   r-   r-   r.   ra     s     rZ   r   r   )r   r   r   rr   r|   r}   allrp   r9   r   r?   absr<   infoformatnxZGraphZadd_edges_fromZconnected_componentsr   rz   r]   anyr~   r
   )r,   r8   r3   r'   r   Zpairs_by_pager   r`   iZleft_upirU   jZleft_upjZclose_on_y_axisZmerged_pairs_by_pagegraphZmerged_lines_by_pageZrowsZrow_of_linesZlines_in_rowZx_sorted_lines_by_pageZsorted_rowsZy_sorted_lines_by_pager   r   r   r-   )r}   r,   r   r.   rP   `  sn     

z8AzureOCRDocumentConverter._convert_to_single_column_text)r8   rG   c                 C   sJ   t t}|jr|jng }|D ](}|js(q||jd j |jd  q|S )a  
        Collect the spans of all tables by page number.

        :param result: The AnalyzeResult object returned by the `begin_analyze_document` method.
        :returns: A dictionary with the page number as key and a list of table spans as value.
        r   )r   r   rQ   r^   r]   r?   rd   )r,   r8   r   rQ   rW   r-   r-   r.   r     s    z.AzureOCRDocumentConverter._collect_table_spans)r   r   )r   r   rG   c                 C   sD   d}|D ]6}|j |jd j   kr2|j |j krn qd} q@q|S )aF  
        Check if a line or paragraph is part of a table.

        :param tables_on_page: A dictionary with the page number as key and a list of table spans as value.
        :param line_or_paragraph: The line or paragraph to check.
        :returns: True if the line or paragraph is part of a table, False otherwise.
        Fr   T)re   rd   r   )r,   r   r   Zin_tablerW   r-   r-   r.   r     s    
*z,AzureOCRDocumentConverter._check_if_in_table      )dfrG   c                 C   sn   t  }t|}td|| }tjj|dd}|dd| }|D ]$}	t|	d| d}
|	|
 q@|
 S )a_  
        Returns a hash of the DataFrame content.

        The hash is based on the content of the DataFrame.
        :param df: The DataFrame to hash.
        :param desired_samples: The desired number of samples to hash.
        :param hash_length: The length of the hash for each sample.

        :returns: A hash of the DataFrame content.
        rY   T)indexNzutf-8)r   md5r9   r   r   utilZhash_pandas_objectr   r   updater   )r,   r   Zdesired_samplesZhash_lengthZhasherZ
total_rowsZsample_ratehashesZsampled_hashesZ
hash_valueZpartial_hashr-   r-   r.   r     s    z)AzureOCRDocumentConverter._hash_dataframe)N)r   )r   r   )$__name__
__module____qualname____doc__r   Zfrom_env_varr   intboolr   r   rN   r/   r   Zoutput_typesr   r
   r   r	   r   r   r   rF   r@   classmethodrJ   rA   rK   rL   rP   r   r   r   r   r   r   r-   r-   r-   r.   r      sN   52(""n-   Z r   )-r   r   collectionsr   pathlibr   typingr   r   r   r   r   r	   Znetworkxr   Zpandasr   Zhaystackr
   r   r   r   r   Z$haystack.components.converters.utilsr   r   Zhaystack.dataclassesr   Zhaystack.lazy_importsr   Zhaystack.utilsr   r   	getLoggerr   r<   r)   Zazure.ai.formrecognizerr   r   r   r   Zazure.core.credentialsr   r   r-   r-   r-   r.   <module>   s"    
