U
    DAf                  	   @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZmZmZ d dlmZmZ d dlmZ d d	lmZ eeZed
Zd dlZd dlmZ W 5 Q R X eG dd dZeG dd dZdS )    N)	dataclass)datetime)Path)AnyDictListOptionalUnion)Document	componentlogging)get_bytestream_from_sourcenormalize_metadata)
ByteStream)
LazyImportzRun 'pip install python-docx')r
   c                   @   s   e Zd ZU dZeed< eed< eed< eed< ee ed< eed< eed< eed	< eed
< ee ed< ee ed< eed< eed< eed< eed< dS )DOCXMetadataa  
    Describes the metadata of Docx file.

    :param author: The author
    :param category: The category
    :param comments: The comments
    :param content_status: The content status
    :param created: The creation date
    :param identifier: The identifier
    :param keywords: Available keywords
    :param language: The language of the document
    :param last_modified_by: The last modified by user date
    :param last_printed: The last printed date
    :param modified: The last modification date
    :param revision: The revision number
    :param subject: The subject
    :param title: The title
    :param version: The version
    authorcategorycommentscontent_statuscreated
identifierkeywordslanguagelast_modified_bylast_printedmodifiedrevisionsubjecttitleversionN)	__name__
__module____qualname____doc__str__annotations__r   r   int r(   r(   G/tmp/pip-unpacked-wheel-z752163x/haystack/components/converters/docx.pyr      s    
r   c                
   @   sx   e Zd ZdZdd Zejee ddee	e
eef  ee	ee
ef eee
ef  f  dddZd	ed
ddZdS )DOCXToDocumenta#  
    Converts DOCX files to Documents.

    Uses `python-docx` library to convert the DOCX file to a document.
    This component does not preserve page breaks in the original document.

    Usage example:
    ```python
    from haystack.components.converters.docx import DOCXToDocument

    converter = DOCXToDocument()
    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the DOCX file.'
    ```
    c                 C   s   t   dS )z4
        Create a DOCXToDocument component.
        N)docx_importcheck)selfr(   r(   r)   __init__R   s    zDOCXToDocument.__init__)	documentsN)sourcesmetac                 C   s  g }t |t|d}t||D ]\}}zt|}W n: tk
rl } ztjd||d W Y qW 5 d}~X Y nX z0tt	
|j}	dd |	jD }
d|
}W n: tk
r } ztjd||d W Y qW 5 d}~X Y nX | j|	d	}|j|d
|i}t||d}|| qd|iS )a  
        Converts DOCX files to Documents.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        )r1   Zsources_countz4Could not read {source}. Skipping it. Error: {error})sourceerrorNc                 S   s   g | ]
}|j qS r(   )text).0parar(   r(   r)   
<listcomp>z   s     z&DOCXToDocument.run.<locals>.<listcomp>
zSCould not read {source} and convert it to a DOCX Document, skipping. Error: {error})documentdocx)contentr1   r/   )r   lenzipr   	Exceptionloggerwarningr:   r
   ioBytesIOdata
paragraphsjoin_get_docx_metadatar1   append)r-   r0   r1   r/   Z	meta_listr2   metadataZ
bytestreamefilerD   r4   Zdocx_metadataZmerged_metadatar9   r(   r(   r)   runX   s0    zDOCXToDocument.runDocxDocument)r9   returnc                 C   sb   t |jj|jj|jj|jj|jj|jj|jj|jj	|jj
|jj|jj|jj|jj|jj|jjdS )a)  
        Get all relevant data from the 'core_properties' attribute from a DOCX Document.

        :param document:
            The DOCX Document you want to extract metadata from

        :returns:
            A `DOCXMetadata` dataclass all the relevant fields from the 'core_properties'
        )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    )r   Zcore_propertiesr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    )r-   r9   r(   r(   r)   rF      s"    
z!DOCXToDocument._get_docx_metadata)N)r!   r"   r#   r$   r.   r   Zoutput_typesr   r
   r	   r%   r   r   r   r   r   rK   r   rF   r(   r(   r(   r)   r*   >   s    "2r*   ) rA   Zdataclassesr   r   pathlibr   typingr   r   r   r   r	   Zhaystackr
   r   r   Z$haystack.components.converters.utilsr   r   Zhaystack.dataclassesr   Zhaystack.lazy_importsr   	getLoggerr!   r?   r+   r:   Zdocx.documentrL   r   r*   r(   r(   r(   r)   <module>   s    

&