import heapq
import logging
import os
from contextlib import nullcontext
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Union

import numpy as np
import torch
from torch import Tensor
from tqdm import trange

from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.util import cos_sim, dot_score

if TYPE_CHECKING:
    from sentence_transformers.SentenceTransformer import SentenceTransformer

logger = logging.getLogger(__name__)


class InformationRetrievalEvaluator(SentenceEvaluator):
    """
eef dddZee dddZdd Zedd Z  ZS )!InformationRetrievalEvaluatora  
    This class evaluates an Information Retrieval (IR) setting.

    Given a set of queries and a large corpus set, it will retrieve for each query the top-k most similar documents. It measures
    Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG).

    Example:
        ::

            import random
            from sentence_transformers import SentenceTransformer
            from sentence_transformers.evaluation import InformationRetrievalEvaluator
            from datasets import load_dataset

            # Load a model
            model = SentenceTransformer('all-mpnet-base-v2')

            # Load the Quora IR dataset (https://huggingface.co/datasets/BeIR/quora, https://huggingface.co/datasets/BeIR/quora-qrels)
            corpus = load_dataset("BeIR/quora", "corpus", split="corpus")
            queries = load_dataset("BeIR/quora", "queries", split="queries")
            relevant_docs_data = load_dataset("BeIR/quora-qrels", split="validation")

            # Shrink the corpus size heavily to only the relevant documents + 10,000 random documents
            required_corpus_ids = list(map(str, relevant_docs_data["corpus-id"]))
            required_corpus_ids += random.sample(corpus["_id"], k=10_000)
            corpus = corpus.filter(lambda x: x["_id"] in required_corpus_ids)

            # Convert the datasets to dictionaries
            corpus = dict(zip(corpus["_id"], corpus["text"]))  # Our corpus (cid => document)
            queries = dict(zip(queries["_id"], queries["text"]))  # Our queries (qid => question)
            relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids]))
            for qid, corpus_ids in zip(relevant_docs_data["query-id"], relevant_docs_data["corpus-id"]):
                qid = str(qid)
                corpus_ids = str(corpus_ids)
                if qid not in relevant_docs:
                    relevant_docs[qid] = set()
                relevant_docs[qid].add(corpus_ids)

            # Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.
            ir_evaluator = InformationRetrievalEvaluator(
                queries=queries,
                corpus=corpus,
                relevant_docs=relevant_docs,
                name="BeIR-quora-dev",
            )
            results = ir_evaluator(model)
            '''
            Information Retrieval Evaluation of the model on the BeIR-quora-dev dataset:
            Queries: 5000
            Corpus: 17476

            Score-Function: cosine
            Accuracy@1: 96.26%
            Accuracy@3: 99.38%
            Accuracy@5: 99.74%
            Accuracy@10: 99.94%
            Precision@1: 96.26%
            Precision@3: 43.01%
            Precision@5: 27.66%
            Precision@10: 14.58%
            Recall@1: 82.93%
            Recall@3: 96.28%
            Recall@5: 98.38%
            Recall@10: 99.55%
            MRR@10: 0.9782
            NDCG@10: 0.9807
            MAP@100: 0.9732
            Score-Function: dot
            Accuracy@1: 96.26%
            Accuracy@3: 99.38%
            Accuracy@5: 99.74%
            Accuracy@10: 99.94%
            Precision@1: 96.26%
            Precision@3: 43.01%
            Precision@5: 27.66%
            Precision@10: 14.58%
            Recall@1: 82.93%
            Recall@3: 96.28%
            Recall@5: 98.38%
            Recall@10: 99.55%
            MRR@10: 0.9782
            NDCG@10: 0.9807
            MAP@100: 0.9732
            '''
            print(ir_evaluator.primary_metric)
            # => "BeIR-quora-dev_cosine_map@100"
            print(results[ir_evaluator.primary_metric])
            # => 0.9732046108457585
    """

    def __init__(
        self,
        queries: Dict[str, str],  # qid => query
        corpus: Dict[str, str],  # cid => doc
        relevant_docs: Dict[str, Set[str]],  # qid => Set[cid]
        corpus_chunk_size: int = 50000,
        mrr_at_k: List[int] = [10],
        ndcg_at_k: List[int] = [10],
        accuracy_at_k: List[int] = [1, 3, 5, 10],
        precision_recall_at_k: List[int] = [1, 3, 5, 10],
        map_at_k: List[int] = [100],
        show_progress_bar: bool = False,
        batch_size: int = 32,
        name: str = "",
        write_csv: bool = True,
        truncate_dim: Optional[int] = None,
        score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = {
            SimilarityFunction.COSINE.value: cos_sim,  # "cosine"
            SimilarityFunction.DOT_PRODUCT.value: dot_score,  # "dot"
        },
        main_score_function: Optional[Union[str, SimilarityFunction]] = None,
    ) -> None:
        """
        Initializes the InformationRetrievalEvaluator.

        Args:
            queries (Dict[str, str]): A dictionary mapping query IDs to queries.
            corpus (Dict[str, str]): A dictionary mapping document IDs to documents.
            relevant_docs (Dict[str, Set[str]]): A dictionary mapping query IDs to a set of relevant document IDs.
            corpus_chunk_size (int): The size of each chunk of the corpus. Defaults to 50000.
            mrr_at_k (List[int]): A list of integers representing the values of k for MRR calculation. Defaults to [10].
            ndcg_at_k (List[int]): A list of integers representing the values of k for NDCG calculation. Defaults to [10].
            accuracy_at_k (List[int]): A list of integers representing the values of k for accuracy calculation. Defaults to [1, 3, 5, 10].
            precision_recall_at_k (List[int]): A list of integers representing the values of k for precision and recall calculation. Defaults to [1, 3, 5, 10].
            map_at_k (List[int]): A list of integers representing the values of k for MAP calculation. Defaults to [100].
            show_progress_bar (bool): Whether to show a progress bar during evaluation. Defaults to False.
            batch_size (int): The batch size for evaluation. Defaults to 32.
            name (str): A name for the evaluation. Defaults to "".
            write_csv (bool): Whether to write the evaluation results to a CSV file. Defaults to True.
            truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
            score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]]): A dictionary mapping score function names to score functions. Defaults to {SimilarityFunction.COSINE.value: cos_sim, SimilarityFunction.DOT_PRODUCT.value: dot_score}.
            main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
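
        Example:
            An illustrative sketch, not an exhaustive configuration: ``queries``, ``corpus`` and
            ``relevant_docs`` are assumed to be dictionaries built as in the class-level example
            above, and the k values, ``truncate_dim`` and ``name`` below are arbitrary choices.

            ::

                from sentence_transformers.evaluation import InformationRetrievalEvaluator
                from sentence_transformers.similarity_functions import SimilarityFunction
                from sentence_transformers.util import cos_sim

                ir_evaluator = InformationRetrievalEvaluator(
                    queries=queries,
                    corpus=corpus,
                    relevant_docs=relevant_docs,
                    accuracy_at_k=[1, 10],
                    precision_recall_at_k=[1, 10],
                    truncate_dim=256,
                    score_functions={SimilarityFunction.COSINE.value: cos_sim},
                    name="quora-dev-cosine-256",
                )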
        """
        super().__init__()
        self.queries_ids = []
        for qid in queries:
            # Only keep queries that have at least one relevant document
            if qid in relevant_docs and len(relevant_docs[qid]) > 0:
                self.queries_ids.append(qid)

        self.queries = [queries[qid] for qid in self.queries_ids]

        self.corpus_ids = list(corpus.keys())
        self.corpus = [corpus[cid] for cid in self.corpus_ids]

        self.relevant_docs = relevant_docs
        self.corpus_chunk_size = corpus_chunk_size
        self.mrr_at_k = mrr_at_k
        self.ndcg_at_k = ndcg_at_k
        self.accuracy_at_k = accuracy_at_k
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k

        self.show_progress_bar = show_progress_bar
        self.batch_size = batch_size
        self.name = name
        self.write_csv = write_csv
        self.score_functions = score_functions
        self.score_function_names = sorted(list(self.score_functions.keys()))
        self.main_score_function = SimilarityFunction(main_score_function) if main_score_function else None
        self.truncate_dim = truncate_dim

        if name:
            name = "_" + name

        self.csv_file: str = "Information-Retrieval_evaluation" + name + "_results.csv"
        self.csv_headers = ["epoch", "steps"]

        for score_name in self.score_function_names:
            for k in accuracy_at_k:
                self.csv_headers.append("{}-Accuracy@{}".format(score_name, k))

            for k in precision_recall_at_k:
                self.csv_headers.append("{}-Precision@{}".format(score_name, k))
                self.csv_headers.append("{}-Recall@{}".format(score_name, k))

            for k in mrr_at_k:
                self.csv_headers.append("{}-MRR@{}".format(score_name, k))

            for k in ndcg_at_k:
                self.csv_headers.append("{}-NDCG@{}".format(score_name, k))

            for k in map_at_k:
                self.csv_headers.append("{}-MAP@{}".format(score_name, k))

    def __call__(
        self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs
    ) -> Dict[str, float]:
        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps"
        else:
            out_txt = ""
        if self.truncate_dim is not None:
            out_txt += f" (truncated to {self.truncate_dim})"

        logger.info(f"Information Retrieval Evaluation of the model on the {self.name} dataset{out_txt}:")

        scores = self.compute_metrices(model, *args, **kwargs)

        # Write results to disc
        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                fOut = open(csv_path, mode="w", encoding="utf-8")
                fOut.write(",".join(self.csv_headers))
                fOut.write("\n")
            else:
                fOut = open(csv_path, mode="a", encoding="utf-8")

            output_data = [epoch, steps]
            for name in self.score_function_names:
                for k in self.accuracy_at_k:
                    output_data.append(scores[name]["accuracy@k"][k])

                for k in self.precision_recall_at_k:
                    output_data.append(scores[name]["precision@k"][k])
                    output_data.append(scores[name]["recall@k"][k])

                for k in self.mrr_at_k:
                    output_data.append(scores[name]["mrr@k"][k])

                for k in self.ndcg_at_k:
                    output_data.append(scores[name]["ndcg@k"][k])

                for k in self.map_at_k:
                    output_data.append(scores[name]["map@k"][k])

            fOut.write(",".join(map(str, output_data)))
            fOut.write("\n")
            fOut.close()

        if not self.primary_metric:
            if self.main_score_function is None:
                score_function = max(
                    [(name, scores[name]["map@k"][max(self.map_at_k)]) for name in self.score_function_names],
                    key=lambda x: x[1],
                )[0]
                self.primary_metric = f"{score_function}_map@{max(self.map_at_k)}"
            else:
                self.primary_metric = f"{self.main_score_function.value}_map@{max(self.map_at_k)}"

        metrics = {
            f"{score_function}_{metric_name.replace('@k', '@' + str(k))}": value
            for score_function, values_dict in scores.items()
            for metric_name, values in values_dict.items()
            for k, value in values.items()
        }
        metrics = self.prefix_name_to_metrics(metrics, self.name)
        self.store_metrics_in_model_card_data(model, metrics)
        return metrics

    def compute_metrices(
        self, model: "SentenceTransformer", corpus_model=None, corpus_embeddings: Tensor = None
    ) -> Dict[str, float]:
        if corpus_model is None:
            corpus_model = model

        max_k = max(
            max(self.mrr_at_k),
            max(self.ndcg_at_k),
            max(self.accuracy_at_k),
            max(self.precision_recall_at_k),
            max(self.map_at_k),
        )

        # Compute embeddings for the queries
        with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
            query_embeddings = model.encode(
                self.queries,
                show_progress_bar=self.show_progress_bar,
                batch_size=self.batch_size,
                convert_to_tensor=True,
            )

        queries_result_list = {}
        for name in self.score_functions:
            queries_result_list[name] = [[] for _ in range(len(query_embeddings))]

        # Iterate over chunks of the corpus
        for corpus_start_idx in trange(
            0, len(self.corpus), self.corpus_chunk_size, desc="Corpus Chunks", disable=not self.show_progress_bar
        ):
            corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(self.corpus))

            # Encode this chunk of the corpus, unless precomputed embeddings were passed in
            if corpus_embeddings is None:
                with nullcontext() if self.truncate_dim is None else corpus_model.truncate_sentence_embeddings(
                    self.truncate_dim
                ):
                    sub_corpus_embeddings = corpus_model.encode(
                        self.corpus[corpus_start_idx:corpus_end_idx],
                        show_progress_bar=False,
                        batch_size=self.batch_size,
                        convert_to_tensor=True,
                    )
            else:
                sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx]

            # Compute similarities with every configured score function
            for name, score_function in self.score_functions.items():
                pair_scores = score_function(query_embeddings, sub_corpus_embeddings)

                # Get the top-k values per query for this chunk
                pair_scores_top_k_values, pair_scores_top_k_idx = torch.topk(
                    pair_scores, min(max_k, len(pair_scores[0])), dim=1, largest=True, sorted=False
                )
                pair_scores_top_k_values = pair_scores_top_k_values.cpu().tolist()
                pair_scores_top_k_idx = pair_scores_top_k_idx.cpu().tolist()

                for query_itr in range(len(query_embeddings)):
                    for sub_corpus_id, score in zip(
                        pair_scores_top_k_idx[query_itr], pair_scores_top_k_values[query_itr]
                    ):
                        corpus_id = self.corpus_ids[corpus_start_idx + sub_corpus_id]
                        # Keep only the overall top max_k hits per query across chunks via a heap
                        if len(queries_result_list[name][query_itr]) < max_k:
                            heapq.heappush(queries_result_list[name][query_itr], (score, corpus_id))
                        else:
                            heapq.heappushpop(queries_result_list[name][query_itr], (score, corpus_id))

        # Convert the heap entries into result dictionaries
        for name in queries_result_list:
            for query_itr in range(len(queries_result_list[name])):
                for doc_itr in range(len(queries_result_list[name][query_itr])):
                    score, corpus_id = queries_result_list[name][query_itr][doc_itr]
                    queries_result_list[name][query_itr][doc_itr] = {"corpus_id": corpus_id, "score": score}

        logger.info("Queries: {}".format(len(self.queries)))
        logger.info("Corpus: {}\n".format(len(self.corpus)))

        # Compute the scores per score function
        scores = {name: self.compute_metrics(queries_result_list[name]) for name in self.score_functions}

        # Output
        for name in self.score_function_names:
            logger.info("Score-Function: {}".format(name))
            self.output_scores(scores[name])

        return scores

    def compute_metrics(self, queries_result_list: List[object]):
        # Init score computation values
        num_hits_at_k = {k: 0 for k in self.accuracy_at_k}
        precisions_at_k = {k: [] for k in self.precision_recall_at_k}
        recall_at_k = {k: [] for k in self.precision_recall_at_k}
        MRR = {k: 0 for k in self.mrr_at_k}
        ndcg = {k: [] for k in self.ndcg_at_k}
        AveP_at_k = {k: [] for k in self.map_at_k}

        # Compute scores on results
        for query_itr in range(len(queries_result_list)):
            query_id = self.queries_ids[query_itr]

            # Sort hits by descending score
            top_hits = sorted(queries_result_list[query_itr], key=lambda x: x["score"], reverse=True)
            query_relevant_docs = self.relevant_docs[query_id]

            # Accuracy@k - the result counts as correct if at least one relevant doc is among the top-k documents
            for k_val in self.accuracy_at_k:
                for hit in top_hits[0:k_val]:
                    if hit["corpus_id"] in query_relevant_docs:
                        num_hits_at_k[k_val] += 1
                        break

            # Precision and Recall@k
            for k_val in self.precision_recall_at_k:
                num_correct = 0
                for hit in top_hits[0:k_val]:
                    if hit["corpus_id"] in query_relevant_docs:
                        num_correct += 1

                precisions_at_k[k_val].append(num_correct / k_val)
                recall_at_k[k_val].append(num_correct / len(query_relevant_docs))

            # MRR@k
            for k_val in self.mrr_at_k:
                for rank, hit in enumerate(top_hits[0:k_val]):
                    if hit["corpus_id"] in query_relevant_docs:
                        MRR[k_val] += 1.0 / (rank + 1)
                        break

            # NDCG@k
            for k_val in self.ndcg_at_k:
                predicted_relevance = [
                    1 if top_hit["corpus_id"] in query_relevant_docs else 0 for top_hit in top_hits[0:k_val]
                ]
                true_relevances = [1] * len(query_relevant_docs)

                ndcg_value = self.compute_dcg_at_k(predicted_relevance, k_val) / self.compute_dcg_at_k(
                    true_relevances, k_val
                )
                ndcg[k_val].append(ndcg_value)

            # MAP@k
            for k_val in self.map_at_k:
                num_correct = 0
                sum_precisions = 0

                for rank, hit in enumerate(top_hits[0:k_val]):
                    if hit["corpus_id"] in query_relevant_docs:
                        num_correct += 1
                        sum_precisions += num_correct / (rank + 1)

                avg_precision = sum_precisions / min(k_val, len(query_relevant_docs))
                AveP_at_k[k_val].append(avg_precision)

        # Compute averages over all queries
        for k in num_hits_at_k:
            num_hits_at_k[k] /= len(self.queries_ids)

        for k in precisions_at_k:
            precisions_at_k[k] = np.mean(precisions_at_k[k])

        for k in recall_at_k:
            recall_at_k[k] = np.mean(recall_at_k[k])

        for k in ndcg:
            ndcg[k] = np.mean(ndcg[k])

        for k in MRR:
            MRR[k] /= len(self.queries_ids)

        for k in AveP_at_k:
            AveP_at_k[k] = np.mean(AveP_at_k[k])

        return {
            "accuracy@k": num_hits_at_k,
            "precision@k": precisions_at_k,
            "recall@k": recall_at_k,
            "ndcg@k": ndcg,
            "mrr@k": MRR,
            "map@k": AveP_at_k,
        }

    def output_scores(self, scores):
        for k in scores["accuracy@k"]:
            logger.info("Accuracy@{}: {:.2f}%".format(k, scores["accuracy@k"][k] * 100))

        for k in scores["precision@k"]:
            logger.info("Precision@{}: {:.2f}%".format(k, scores["precision@k"][k] * 100))

        for k in scores["recall@k"]:
            logger.info("Recall@{}: {:.2f}%".format(k, scores["recall@k"][k] * 100))

        for k in scores["mrr@k"]:
            logger.info("MRR@{}: {:.4f}".format(k, scores["mrr@k"][k]))

        for k in scores["ndcg@k"]:
            logger.info("NDCG@{}: {:.4f}".format(k, scores["ndcg@k"][k]))

        for k in scores["map@k"]:
            logger.info("MAP@{}: {:.4f}".format(k, scores["map@k"][k]))

    @staticmethod
    def compute_dcg_at_k(relevances, k):
        dcg = 0
        for i in range(min(len(relevances), k)):
            dcg += relevances[i] / np.log2(i + 2)  # +2 as we start our idx at 0
        return dcg
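

# Worked example (illustrative only; not part of the upstream module): how NDCG@k is
# assembled from compute_dcg_at_k above. The relevance vectors below are made up.
# predicted relevances mark which of the top-k hits are relevant (binary), and the
# ideal DCG uses a vector of ones with one entry per relevant document, exactly as in
# InformationRetrievalEvaluator.compute_metrics.
#
#   DCG@3  = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.0 + 0.0 + 0.5 = 1.5
#   IDCG@3 = 1/log2(2) + 1/log2(3)             = 1.0 + 0.631     = 1.631
#   NDCG@3 = DCG@3 / IDCG@3                    = 0.92
if __name__ == "__main__":
    # Query with two relevant documents; hits 1 and 3 of the top-3 are relevant.
    dcg = InformationRetrievalEvaluator.compute_dcg_at_k([1, 0, 1], 3)
    idcg = InformationRetrievalEvaluator.compute_dcg_at_k([1, 1], 3)
    print(round(dcg / idcg, 2))  # -> 0.92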