haystack icon indicating copy to clipboard operation
haystack copied to clipboard

Add Distribution-based rank fusion in JoinDocuments

Open nickprock opened this issue 1 year ago • 0 comments

Is your feature request related to a problem? Please describe.

Add Distribution-Based Score Fusion (DBSF) as a rank-fusion option in JoinDocuments.

Describe the solution you'd like

def _distribution_based_rank_fusion(self, document_lists):
        """
        Merge multiple lists of Documents using Distribution-Based Score Fusion (DBSF).

        (https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18)

        Each list is normalized independently: its scores are rescaled with
        ``mean - 3 * std_dev`` as the lower bound and ``mean + 3 * std_dev`` as
        the upper bound, so lists with different score distributions become
        comparable. The ``score`` attribute of every Document is overwritten
        in place with the normalized value.

        Edge cases:
        - An empty list is skipped (there is nothing to normalize).
        - A ``score`` of ``None`` is treated as ``0.0``.
        - If every score in a list is identical (including a single-Document
          list) the standard deviation is zero; the scores carry no ranking
          information, so they are all set to ``0.0`` instead of dividing by zero.

        The normalized lists are then merged by ``self._concatenate``. Per the
        original description, if a Document is returned by more than one
        retriever, the one with the highest score is used; that behavior is
        implemented by ``self._concatenate``, not here.

        :param document_lists: iterable of lists of objects exposing a numeric
            (or ``None``) ``score`` attribute.
        :return: whatever ``self._concatenate(document_lists=...)`` returns.
        """
        for documents in document_lists:
            if not documents:
                # Nothing to normalize; also avoids dividing by len() == 0.
                continue

            scores_list = [
                doc.score if doc.score is not None else 0.0 for doc in documents
            ]
            count = len(scores_list)

            mean_score = sum(scores_list) / count
            # Population standard deviation (divides by N, not N - 1).
            std_dev = (sum((x - mean_score) ** 2 for x in scores_list) / count) ** 0.5
            min_score = mean_score - 3 * std_dev
            max_score = mean_score + 3 * std_dev
            delta_score = max_score - min_score

            for doc, score in zip(documents, scores_list):
                # delta_score == 0 means all scores were equal: the list is
                # uninformative for ranking, so fall back to 0.0.
                doc.score = (
                    (score - min_score) / delta_score if delta_score != 0.0 else 0.0
                )

        output = self._concatenate(document_lists=document_lists)

        return output

nickprock avatar Jun 23 '24 15:06 nickprock