U
    DAf^                  	   @   s   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z
mZmZ d dlmZmZ d dlmZ d dlmZ edZd d	lmZ W 5 Q R X eeZeG d
d dZdS )    N)Path)AnyDictListOptionalUnion)Document	componentlogging)get_bytestream_from_sourcenormalize_metadata)
ByteStream)
LazyImportzRun 'pip install tika')parserc                
   @   sp   e Zd ZdZdedddZejee	 ddee
eeef  ee
eeef eeeef  f  dd	d
ZdS )TikaDocumentConvertera  
    Converts files of different types to Documents.

    This component uses [Apache Tika](https://tika.apache.org/) for parsing the files and, therefore,
    requires a running Tika server.
    For more options on running Tika,
    see the [official documentation](https://github.com/apache/tika-docker/blob/main/README.md#usage).

    Usage example:
    ```python
    from haystack.components.converters.tika import TikaDocumentConverter

    converter = TikaDocumentConverter()
    results = converter.run(
        sources=["sample.docx", "my_document.rtf", "archive.zip"],
        meta={"date_added": datetime.now().isoformat()}
    )
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the docx file.'
    ```
    http://localhost:9998/tika)tika_urlc                 C   s   t   || _dS )zr
        Create a TikaDocumentConverter component.

        :param tika_url:
            Tika server URL.
        N)tika_importcheckr   )selfr    r   G/tmp/pip-unpacked-wheel-z752163x/haystack/components/converters/tika.py__init__-   s    zTikaDocumentConverter.__init__)	documentsN)sourcesmetac                 C   s   g }t |t|d}t||D ]\}}zt|}W n: tk
rl } ztjd||d W Y qW 5 d}~X Y nX z tjt	
|j| jdd }	W n: tk
r }
 ztjd||
d W Y qW 5 d}
~
X Y nX |j|}t|	|d}|| qd	|iS )
a  
        Converts files to Documents.

        :param sources:
            List of HTML file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        )r   Zsources_countz4Could not read {source}. Skipping it. Error: {error})sourceerrorN)ZserverEndpointcontentzAFailed to extract text from {source}. Skipping it. Error: {error})r   r   r   )r   lenzipr   	Exceptionloggerwarningtika_parserfrom_bufferioBytesIOdatar   r   r   append)r   r   r   r   Z	meta_listr   metadataZ
bytestreametextZconversion_eZmerged_metadatadocumentr   r   r   run7   s*     
zTikaDocumentConverter.run)r   )N)__name__
__module____qualname____doc__strr   r	   Zoutput_typesr   r   r   r   r   r   r   r   r.   r   r   r   r   r      s   
 "r   )r&   pathlibr   typingr   r   r   r   r   Zhaystackr   r	   r
   Z$haystack.components.converters.utilsr   r   Zhaystack.dataclassesr   Zhaystack.lazy_importsr   r   Ztikar   r$   	getLoggerr/   r"   r   r   r   r   r   <module>   s   

