haystack
haystack copied to clipboard
Add Distribution-based rank fusion in JoinDocuments
Is your feature request related to a problem? Please describe: `JoinDocuments` currently has no distribution-based rank fusion mode, and this request is to add one.
Describe the solution you'd like
def _distribution_based_rank_fusion(self, document_lists):
    """
    Merge multiple lists of Documents and assign scores based on Distribution-Based Score Fusion.
    (https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18)

    Each list is normalized independently: a score is mapped linearly so that
    ``mean - 3 * std_dev`` of that list becomes 0 and ``mean + 3 * std_dev``
    becomes 1 (population standard deviation). A score equal to the list's
    mean therefore always maps to 0.5.

    The ``score`` attribute of every Document is overwritten in place.

    Edge cases:
    - An empty list is skipped (there is no distribution to normalize against).
    - A list whose scores are all identical (including a single-document list)
      has zero spread; every Document in it receives the midpoint 0.5 instead
      of triggering a division by zero.

    :param document_lists: Iterable of lists of Documents, one list per
        retriever; each Document must carry a numeric ``score``.
    :return: Whatever ``self._concatenate(document_lists=...)`` returns for the
        rescaled lists. Handling of Documents that appear in more than one list
        is delegated to that helper (presumably the highest-scoring copy is
        kept -- confirm against ``_concatenate``).
    """
    for documents in document_lists:
        if not documents:
            # Nothing to normalize; also avoids dividing by len() == 0.
            continue

        scores = [doc.score for doc in documents]
        count = len(scores)
        mean_score = sum(scores) / count
        std_dev = (sum((x - mean_score) ** 2 for x in scores) / count) ** 0.5
        min_score = mean_score - 3 * std_dev
        max_score = mean_score + 3 * std_dev
        score_range = max_score - min_score

        # Compare the raw scores as well as the computed range: rounding in the
        # mean can leave a tiny non-zero std_dev for identical scores, which
        # would otherwise produce meaningless normalized values.
        if score_range == 0 or min(scores) == max(scores):
            for doc in documents:
                doc.score = 0.5
            continue

        for doc in documents:
            doc.score = (doc.score - min_score) / score_range

    return self._concatenate(document_lists=document_lists)