U
    DAf                  	   @   s   d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
mZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ ed	Zd d
lmZ W 5 Q R X eeZG dd deZG dd dZ eG dd dZ!dS )    N)Path)AnyDictListOptionalProtocolUnion)Document	componentdefault_from_dictdefault_to_dictlogging)get_bytestream_from_sourcenormalize_metadata)
ByteStream)
LazyImport)deserialize_typezRun 'pip install pypdf')	PdfReaderc                   @   s4   e Zd ZdZdedddZdd Zedd	 Zd
S )PyPDFConverterzt
    A protocol that defines a converter which takes a PdfReader object and converts it into a Document object.
    r   readerreturnc                 C   s   d S N )selfr   r   r   H/tmp/pip-unpacked-wheel-z752163x/haystack/components/converters/pypdf.pyconvert   s    zPyPDFConverter.convertc                 C   s   d S r   r   r   r   r   r   to_dict   s    zPyPDFConverter.to_dictc                 C   s   d S r   r   clsdatar   r   r   	from_dict!   s    zPyPDFConverter.from_dictN	__name__
__module____qualname____doc__r	   r   r   classmethodr"   r   r   r   r   r      s
   r   c                   @   s4   e Zd ZdZdedddZdd Zedd	 Zd
S )DefaultConverterzp
    The default converter class that extracts text from a PdfReader object's pages and returns a Document.
    r   r   c                 C   s    d dd |jD }t|dS )zMExtract text from the PDF and return a Document object with the text content.c                 s   s   | ]}|  V  qd S r   )Zextract_text).0pager   r   r   	<genexpr>-   s     z+DefaultConverter.convert.<locals>.<genexpr>)content)joinZpagesr	   )r   r   textr   r   r   r   +   s    zDefaultConverter.convertc                 C   s   t | S )z(Serialize the converter to a dictionary.)r   r   r   r   r   r   0   s    zDefaultConverter.to_dictc                 C   s
   t | |S )z,Deserialize the converter from a dictionary.)r   r   r   r   r   r"   4   s    zDefaultConverter.from_dictNr#   r   r   r   r   r)   &   s
   r)   c                
   @   s   e Zd ZdZdee dddZdd Zedd	 Z	e
jee d
deeeeef  eeeeef eeeef  f  dddZdS )PyPDFToDocumenta#  
    Converts PDF files to Documents.

    Uses `pypdf` compatible converters to convert PDF files to Documents.
    A default text extraction converter is used if one is not provided.

    Usage example:
    ```python
    from haystack.components.converters.pypdf import PyPDFToDocument

    converter = PyPDFToDocument()
    results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the PDF file.'
    ```
    N	converterc                 C   s   t   |pt | _dS )z
        Create an PyPDFToDocument component.

        :param converter:
            An instance of a PyPDFConverter compatible class.
        N)pypdf_importcheckr)   r3   )r   r3   r   r   r   __init__N   s    zPyPDFToDocument.__init__c                 C   s   t | | j dS )z{
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        r2   )r   r3   r   r   r   r   r   r   Y   s    zPyPDFToDocument.to_dictc                 C   s8   t |d d d }||d d |d d< t| |S )z
        Deserializes the component from a dictionary.

        :param data:
            Dictionary with serialized data.

        :returns:
            Deserialized component.
        Zinit_parametersr3   type)r   r"   r   )r    r!   Zconverter_classr   r   r   r"   b   s    zPyPDFToDocument.from_dict)	documents)sourcesmetac                 C   s   g }t |t|d}t||D ]\}}zt|}W n: tk
rl } ztjd||d W Y qW 5 d}~X Y nX z tt	|j
}	| j|	}
W n: tk
r } ztjd||d W Y qW 5 d}~X Y nX |j|}||
_||
 qd|iS )a  
        Converts PDF files to Documents.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        )Zsources_countz4Could not read {source}. Skipping it. Error: {error})sourceerrorNzECould not read {source} and convert it to Document, skipping. {error}r8   )r   lenzipr   	Exceptionloggerwarningr   ioBytesIOr!   r3   r   r:   append)r   r9   r:   r8   Z	meta_listr;   metadataZ
bytestreameZ
pdf_readerdocumentZmerged_metadatar   r   r   runq   s,      
zPyPDFToDocument.run)N)N)r$   r%   r&   r'   r   r   r6   r   r(   r"   r
   Zoutput_typesr   r	   r   strr   r   r   r   rH   r   r   r   r   r1   :   s   	
 "r1   )"rB   pathlibr   typingr   r   r   r   r   r   Zhaystackr	   r
   r   r   r   Z$haystack.components.converters.utilsr   r   Zhaystack.dataclassesr   Zhaystack.lazy_importsr   Z!haystack.utils.type_serializationr   r4   Zpypdfr   	getLoggerr$   r@   r   r)   r1   r   r   r   r   <module>   s    

