U
    DAfZO                     @   sl   d dl mZ d dlmZmZmZmZmZmZ d dl	m
Z d dlmZmZmZ eeZeG dd dZdS )    )defaultdict)AnyCallableDictListLiteralOptional)parse)Document	componentloggingc                   @   sB  e Zd ZdZd"eeee ed ed ed	 eed
  dddZ	eee ed ed ed	 eed
  dddZ
ejee dd#ee ee ee eed  eed  eed	  eed
  dddZee eed
  ee dddZee ee eed ee dddZed$eeedddZeeeedd d!ZdS )%MetaFieldRankerav  
    Ranks Documents based on the value of their specific meta field.

    The ranking can be performed in descending order or ascending order.

    Usage example:
    ```
    from haystack import Document
    from haystack.components.rankers import MetaFieldRanker

    ranker = MetaFieldRanker(meta_field="rating")
    docs = [
        Document(content="Paris", meta={"rating": 1.3}),
        Document(content="Berlin", meta={"rating": 0.7}),
        Document(content="Barcelona", meta={"rating": 2.1}),
    ]

    output = ranker.run(documents=docs)
    docs = output["documents"]
    assert docs[0].content == "Barcelona"
          ?Nreciprocal_rank_fusion
descendingbottomr   linear_scoreZ	ascendingr   Zdroptopr   )floatintdate)
meta_fieldweighttop_kranking_mode
sort_ordermissing_metameta_value_typec                 C   sN   || _ || _|| _|| _|| _|| _| j| j| j| j| j| j|d || _dS )a  
        Creates an instance of MetaFieldRanker.

        :param meta_field:
            The name of the meta field to rank by.
        :param weight:
            In range [0,1].
            0 disables ranking by a meta field.
            0.5 ranking from previous component and based on meta field have the same weight.
            1 ranking by a meta field only.
        :param top_k:
            The maximum number of Documents to return per query.
            If not provided, the Ranker returns all documents it receives in the new ranking order.
        :param ranking_mode:
            The mode used to combine the Retriever's and Ranker's scores.
            Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
            Use the 'linear_score' mode only with Retrievers or Rankers that return a score in range [0,1].
        :param sort_order:
            Whether to sort the meta field by ascending or descending order.
            Possible values are `descending` (default) and `ascending`.
        :param missing_meta:
            What to do with documents that are missing the sorting metadata field.
            Possible values are:
            - 'drop' will drop the documents entirely.
            - 'top' will place the documents at the top of the metadata-sorted list
                (regardless of 'ascending' or 'descending').
            - 'bottom' will place the documents at the bottom of metadata-sorted list
                (regardless of 'ascending' or 'descending').
        :param meta_value_type:
            Parse the meta value into the data type specified before sorting.
            This will only work if all meta values stored under `meta_field` in the provided documents are strings.
            For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
            we would parse the string into a datetime object and then sort the documents by date.
            The available options are:
            - 'float' will parse the meta values into floats.
            - 'int' will parse the meta values into integers.
            - 'date' will parse the meta values into datetime objects.
            - 'None' (default) will do no parsing.
        r   r   r   r   r   r    N)r   r   r   r   r   r   _validate_paramsr    )selfr   r   r   r   r   r   r     r$   J/tmp/pip-unpacked-wheel-z752163x/haystack/components/rankers/meta_field.py__init__'   s    2zMetaFieldRanker.__init__r!   c                 C   s   |d k	r|dkrt d| |dk s,|dkr8t d| |dkrLt d| |dkr`t d| |d	krtt d
| |dkrt d| d S )Nr   ztop_k must be > 0, but got %s   aC  Parameter <weight> must be in range [0,1] but is currently set to '%s'.
'0' disables sorting by a meta field, '0.5' assigns equal weight to the previous relevance scores and the meta field, and '1' ranks by the meta field only.
Change the <weight> parameter to a value in range 0 to 1 when initializing the MetaFieldRanker.r   zThe value of parameter <ranking_mode> must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '%s'.
Change the <ranking_mode> value to 'reciprocal_rank_fusion' or 'linear_score' when initializing the MetaFieldRanker.r   zThe value of parameter <sort_order> must be 'ascending' or 'descending', but is currently set to '%s'.
Change the <sort_order> value to 'ascending' or 'descending' when initializing the MetaFieldRanker.r   zThe value of parameter <missing_meta> must be 'drop', 'top', or 'bottom', but is currently set to '%s'.
Change the <missing_meta> value to 'drop', 'top', or 'bottom' when initializing the MetaFieldRanker.)r   r   r   NzThe value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is currently set to '%s'.
Change the <meta_value_type> value to 'float', 'int', 'date' or None when initializing the MetaFieldRanker.)
ValueError)r#   r   r   r   r   r   r    r$   r$   r%   r"   i   s@    	z MetaFieldRanker._validate_params)	documents)r)   r   r   r   r   r   r    c              
      sb  |sdg iS |p j }|dk	r"|n j}|p0 j}|p: j}|pD j}|pN j} j||||||d |dkr~d|d| iS  fdd|D } fdd|D }	t|dkrtj	d j
d	d
d |D d d|d| iS t|	dkrTd j
 dd	dd |	D  d}
|dkr,tj	d|
d n(|dkrFtj	d|
d ntj	d|
d  j||d}tt||}|dk}zt|dd |d}W nX tk
r } z8tj	dd	dd |D |d d|d| i W Y S d}~X Y nX dd |D }|dkr||	 } ||||}n8|dkr>|	| } ||||}n|} ||||}d|d| iS )a  
        Ranks a list of Documents based on the selected meta field by:

        1. Sorting the Documents by the meta field in descending or ascending order.
        2. Merging the rankings from the previous component and based on the meta field according to ranking mode and
        weight.
        3. Returning the top-k documents.

        :param documents:
            Documents to be ranked.
        :param top_k:
            The maximum number of Documents to return per query.
            If not provided, the top_k provided at initialization time is used.
        :param weight:
            In range [0,1].
            0 disables ranking by a meta field.
            0.5 ranking from previous component and based on meta field have the same weight.
            1 ranking by a meta field only.
            If not provided, the weight provided at initialization time is used.
        :param ranking_mode:
            (optional) The mode used to combine the Retriever's and Ranker's scores.
            Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
            Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
            If not provided, the ranking_mode provided at initialization time is used.
        :param sort_order:
            Whether to sort the meta field by ascending or descending order.
            Possible values are `descending` (default) and `ascending`.
            If not provided, the sort_order provided at initialization time is used.
        :param missing_meta:
            What to do with documents that are missing the sorting metadata field.
            Possible values are:
            - 'drop' will drop the documents entirely.
            - 'top' will place the documents at the top of the metadata-sorted list
                (regardless of 'ascending' or 'descending').
            - 'bottom' will place the documents at the bottom of metadata-sorted list
                (regardless of 'ascending' or 'descending').
            If not provided, the missing_meta provided at initialization time is used.
        :param meta_value_type:
            Parse the meta value into the data type specified before sorting.
            This will only work if all meta values stored under `meta_field` in the provided documents are strings.
            For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
            we would parse the string into a datetime object and then sort the documents by date.
            The available options are:
            -'float' will parse the meta values into floats.
            -'int' will parse the meta values into integers.
            -'date' will parse the meta values into datetime objects.
            -'None' (default) will do no parsing.
        :returns:
            A dictionary with the following keys:
            - `documents`: List of Documents sorted by the specified meta field.

        :raises ValueError:
            If `top_k` is not > 0.
            If `weight` is not in range [0,1].
            If `ranking_mode` is not 'reciprocal_rank_fusion' or 'linear_score'.
            If `sort_order` is not 'ascending' or 'descending'.
            If `meta_value_type` is not 'float', 'int', 'date' or `None`.
        r)   Nr!   r   c                    s   g | ]} j |jkr|qS r$   r   meta.0docr#   r$   r%   
<listcomp>   s      z'MetaFieldRanker.run.<locals>.<listcomp>c                    s   g | ]} j |jkr|qS r$   r*   r,   r/   r$   r%   r0      s      a7  The parameter <meta_field> is currently set to '{meta_field}', but none of the provided Documents with IDs {document_ids} have this meta key.
Set <meta_field> to the name of a field that is present within the provided Documents.
Returning the <top_k> of the original Documents since there are no values to rank.,c                 S   s   g | ]
}|j qS r$   idr,   r$   r$   r%   r0     s     r   document_idsz0The parameter <meta_field> is currently set to 'z' but the Documents with IDs c                 S   s   g | ]
}|j qS r$   r2   r,   r$   r$   r%   r0     s     z don't have this meta key.
r   z{warning_start}Because the parameter <missing_meta> is set to 'bottom', these Documents will be placed at the end of the sorting order.)warning_startr   z{warning_start}Because the parameter <missing_meta> is set to 'top', these Documents will be placed at the top of the sorting order.z{warning_start}Because the parameter <missing_meta> is set to 'drop', these Documents will be removed from the list of retrieved Documents.)docs_with_meta_fieldr    r   c                 S   s   | d S )Nr   r$   )xr$   r$   r%   <lambda>&      z%MetaFieldRanker.run.<locals>.<lambda>keyreversezTried to sort Documents with IDs {document_ids}, but got TypeError with the message: {error}
Returning the <top_k> of the original Documents since meta field ranking is not possible.c                 S   s   g | ]
}|j qS r$   r2   r,   r$   r$   r%   r0   ,  s     r5   errorc                 S   s   g | ]\}}|qS r$   r$   )r-   r+   r.   r$   r$   r%   r0   2  s     )r   r   r   r   r   r    r"   lenloggerwarningr   join_parse_metalistzipsorted	TypeError_merge_rankings)r#   r)   r   r   r   r   r   r    r7   Zdocs_missing_meta_fieldr6   Zparsed_metaZtuple_parsed_meta_and_docsr=   Ztuple_sorted_by_metar?   Zsorted_by_metasorted_documentsr$   r/   r%   run   s    E





"

&

zMetaFieldRanker.run)r7   r    returnc              
      s  |dkrfdd|D S fdd|D }t dd |D sptjd|d	d
d |D d fdd|D S |dkr~t n|dkrt nt z fdd|D }W nT tk
r } z6tjdd	dd |D |d fdd|D }W 5 d}~X Y nX |S )z|
        Parse the meta values stored under `self.meta_field` for the Documents provided in `docs_with_meta_field`.
        Nc                    s   g | ]}|j  j qS r$   r+   r   r-   dr/   r$   r%   r0   F  s     z/MetaFieldRanker._parse_meta.<locals>.<listcomp>c                    s   h | ]}|j  j qS r$   rM   r,   r/   r$   r%   	<setcomp>H  s     z.MetaFieldRanker._parse_meta.<locals>.<setcomp>c                 s   s   | ]}t |tV  qd S )N)
isinstancestr)r-   Z
meta_valuer$   r$   r%   	<genexpr>I  s     z.MetaFieldRanker._parse_meta.<locals>.<genexpr>a"  The parameter <meta_value_type> is currently set to '{meta_field}', but not all of meta values in the provided Documents with IDs {document_ids} are strings.
Skipping parsing of the meta values.
Set all meta values found under the <meta_field> parameter to strings to use <meta_value_type>.r1   c                 S   s   g | ]
}|j qS r$   r2   r,   r$   r$   r%   r0   P  s     r4   c                    s   g | ]}|j  j qS r$   rM   rN   r/   r$   r%   r0   R  s     r   r   c                    s   g | ]} |j j qS r$   rM   rN   Zparse_fnr#   r$   r%   r0   ]  s     zTried to parse the meta values of Documents with IDs {document_ids}, but got ValueError with the message: {error}
Skipping parsing of the meta values.c                 S   s   g | ]
}|j qS r$   r2   r,   r$   r$   r%   r0   c  s     r>   c                    s   g | ]}|j  j qS r$   rM   rN   r/   r$   r%   r0   f  s     )allrA   rB   rC   r   r   
date_parser(   )r#   r7   r    Zunique_meta_valuesZmeta_valuesr?   r$   rT   r%   rD   ?  s2    $zMetaFieldRanker._parse_meta)r)   rJ   r   r   rL   c                 C   sN  t t}|dkrntt||D ]L\}\}}||j  | j|dd|  7  < ||j  | j|d| 7  < qn|dkr tt||D ]\}\}}td}	|jdkrt	d n0|jdk s|jdkrtj	d|j|jd	 n|j}	||j  |	d|  7  < ||j  | j
|t|d
| 7  < q|D ]}||j |_q$t|dd dd}
|
S )zv
        Merge the two different rankings for Documents sorted both by their content and by their meta field.
        r   )rankr'   r   r   Nz+The score wasn't provided; defaulting to 0.zXThe score {score} for Document {document_id} is outside the [0,1] range; defaulting to 0)scoreZdocument_idrW   amountc                 S   s   | j r| j S dS )N)rX   )r.   r$   r$   r%   r9     r:   z1MetaFieldRanker._merge_rankings.<locals>.<lambda>Tr;   )r   r   	enumeraterF   r3   _calculate_rrfr   rX   rA   rB   _calc_linear_scorer@   rG   )r#   r)   rJ   r   r   Z
scores_mapidocumentZ
sorted_docrX   Znew_sorted_documentsr$   r$   r%   rI   j  s.    
""

&zMetaFieldRanker._merge_rankings=   )rW   krL   c                 C   s   d||   S )a  
        Calculates the reciprocal rank fusion.

        The constant K is set to 61 (60 was suggested by the original paper, plus 1 as python lists are 0-based and
        the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking).
        r'   r$   )rW   rb   r$   r$   r%   r]     s    zMetaFieldRanker._calculate_rrf)rW   rZ   rL   c                 C   s   ||  | S )a\  
        Calculate the meta field score as a linear score between the greatest and the lowest score in the list.

        This linear scaling is useful for:
        - Reducing the effect of outliers
        - Creating scores that are meaningfully distributed in the range [0,1],
        similar to scores coming from a Retriever or Ranker.
        r$   rY   r$   r$   r%   r^     s    
z"MetaFieldRanker._calc_linear_score)r   Nr   r   r   N)NNNNNN)ra   )__name__
__module____qualname____doc__rR   r   r   r   r   r&   r"   r   Zoutput_typesr   r
   rK   r   rD   rI   staticmethodr]   r^   r$   r$   r$   r%   r      sl         
D
3      



 $ 
-'	r   N)collectionsr   typingr   r   r   r   r   r   Zdateutil.parserr	   rV   Zhaystackr
   r   r   	getLoggerrc   rA   r   r$   r$   r$   r%   <module>   s    
