U
    DAf?                  	   @   s   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z
mZmZmZmZ d dlmZmZ d dlmZ d dlmZ eeZedZd d	lmZ W 5 Q R X eG d
d dZdS )    N)Path)AnyDictListOptionalUnion)Document	componentdefault_from_dictdefault_to_dictlogging)get_bytestream_from_sourcenormalize_metadata)
ByteStream)
LazyImportzRun 'pip install trafilatura')extractc                
   @   s   e Zd ZdZdee ee eeeef  dddZ	eeef dddZ
eeeef d d	d
dZejee ddeeeeef  eeeeef eeeef  f  eeeef  dddZdS )HTMLToDocumentae  
    Converts an HTML file to a Document.

    Usage example:
    ```python
    from haystack.components.converters import HTMLToDocument

    converter = HTMLToDocument()
    results = converter.run(sources=["path/to/sample.html"])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the HTML file.'
    ```
    N)extractor_type
try_othersextraction_kwargsc                 C   s>   t   |dk	rtdt |dk	r0tdt |p6i | _dS )a  
        Create an HTMLToDocument component.

        :param extractor_type: Ignored. This parameter is kept for compatibility with previous versions. It will be
            removed in Haystack 2.4.0. To customize the extraction, use the `extraction_kwargs` parameter.
        :param try_others: Ignored. This parameter is kept for compatibility with previous versions. It will be
            removed in Haystack 2.4.0.
        :param extraction_kwargs: A dictionary containing keyword arguments to customize the extraction process. These
            are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see
            the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
        NzThe `extractor_type` parameter is ignored and will be removed in Haystack 2.4.0. To customize the extraction, use the `extraction_kwargs` parameter.zMThe `try_others` parameter is ignored and will be removed in Haystack 2.4.0. )trafilatura_importcheckwarningswarnDeprecationWarningr   )selfr   r   r    r   G/tmp/pip-unpacked-wheel-z752163x/haystack/components/converters/html.py__init__%   s     zHTMLToDocument.__init__)returnc                 C   s   t | | jdS )z{
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        )r   )r   r   )r   r   r   r   to_dictD   s    zHTMLToDocument.to_dict)datar   c                 C   s
   t | |S )z
        Deserializes the component from a dictionary.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized component.
        )r
   )clsr!   r   r   r   	from_dictM   s    
zHTMLToDocument.from_dict)	documents)sourcesmetar   c                 C   s   | j |p
i }g }t|t|d}t||D ]\}}zt|d}	W n: tk
r| }
 ztjd||
d W Y q,W 5 d}
~
X Y nX zt|	j	
df|}W n: tk
r } ztjd||d W Y q,W 5 d}~X Y nX t||	j|d}|| q,d	|iS )
ap  
        Converts a list of HTML files to Documents.

        :param sources:
            List of HTML file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
        :param extraction_kwargs:
            Additional keyword arguments to customize the extraction process.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        )r&   Zsources_count)sourcez4Could not read {source}. Skipping it. Error: {error})r'   errorNzutf-8zAFailed to extract text from {source}. Skipping it. Error: {error})contentr&   r$   )r   r   lenzipr   	Exceptionloggerwarningr   r!   decoder   r&   append)r   r%   r&   r   Zmerged_extraction_kwargsr$   Z	meta_listr'   metadataZ
bytestreametextZconversion_edocumentr   r   r   runY   s*    zHTMLToDocument.run)NNN)NN)__name__
__module____qualname____doc__r   strboolr   r   r   r    classmethodr#   r	   Zoutput_typesr   r   r   r   r   r5   r   r   r   r   r      s(      	  "r   )r   pathlibr   typingr   r   r   r   r   Zhaystackr   r	   r
   r   r   Z$haystack.components.converters.utilsr   r   Zhaystack.dataclassesr   Zhaystack.lazy_importsr   	getLoggerr6   r-   r   Ztrafilaturar   r   r   r   r   r   <module>   s   

