2 tháng trước cách đây · ce7cf62a55
--- a/backend/open_webui/retrieval/utils.py
+++ b/backend/open_webui/retrieval/utils.py
@@ -5,6 +5,7 @@ from typing import Optional, Union
 
				 
			
 
				 import asyncio
			
 
				 import requests
			
 
				+import hashlib
			
 
				 
			
 
				 from huggingface_hub import snapshot_download
			
 
				 from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
			
@@ -175,46 +176,41 @@ def merge_get_results(get_results: list[dict]) -> dict:
 
				 
			
 
				 def merge_and_sort_query_results(
			
 
				     query_results: list[dict], k: int, reverse: bool = False
			
 
				-) -> list[dict]:
			
 
				+) -> dict:
			
 
				     # Initialize lists to store combined data
			
 
				-    combined_distances = []
			
 
				-    combined_documents = []
			
 
				-    combined_metadatas = []
			
 
				+    combined = []
			
 
				+    seen_hashes = set()  # To store unique document hashes
			
 
				 
			
 
				     for data in query_results:
			
 
				-        combined_distances.extend(data["distances"][0])
			
 
				-        combined_documents.extend(data["documents"][0])
			
 
				-        combined_metadatas.extend(data["metadatas"][0])
			
 
				+        distances = data["distances"][0]
			
 
				+        documents = data["documents"][0]
			
 
				+        metadatas = data["metadatas"][0]
			
 
				+
			
 
				+        for distance, document, metadata in zip(distances, documents, metadatas):
			
 
				+            if isinstance(document, str):
			
 
				+                doc_hash = hashlib.md5(
			
 
				+                    document.encode()
			
 
				+                ).hexdigest()  # Compute a hash for uniqueness
			
 
				 
			
 
				-    # Create a list of tuples (distance, document, metadata)
			
 
				-    combined = list(zip(combined_distances, combined_documents, combined_metadatas))
			
 
				+                if doc_hash not in seen_hashes:
			
 
				+                    seen_hashes.add(doc_hash)
			
 
				+                    combined.append((distance, document, metadata))
			
 
				 
			
 
				     # Sort the list based on distances
			
 
				     combined.sort(key=lambda x: x[0], reverse=reverse)
			
 
				 
			
 
				-    # We don't have anything :-(
			
 
				-    if not combined:
			
 
				-        sorted_distances = []
			
 
				-        sorted_documents = []
			
 
				-        sorted_metadatas = []
			
 
				-    else:
			
 
				-        # Unzip the sorted list
			
 
				-        sorted_distances, sorted_documents, sorted_metadatas = zip(*combined)
			
 
				-
			
 
				-        # Slicing the lists to include only k elements
			
 
				-        sorted_distances = list(sorted_distances)[:k]
			
 
				-        sorted_documents = list(sorted_documents)[:k]
			
 
				-        sorted_metadatas = list(sorted_metadatas)[:k]
			
 
				+    # Slice to keep only the top k elements
			
 
				+    sorted_distances, sorted_documents, sorted_metadatas = (
			
 
				+        zip(*combined[:k]) if combined else ([], [], [])
			
 
				+    )
			
 
				 
			
 
				-    # Create the output dictionary
			
 
				-    result = {
			
 
				-        "distances": [sorted_distances],
			
 
				-        "documents": [sorted_documents],
			
 
				-        "metadatas": [sorted_metadatas],
			
 
				+    # Create and return the output dictionary
			
 
				+    return {
			
 
				+        "distances": [list(sorted_distances)],
			
 
				+        "documents": [list(sorted_documents)],
			
 
				+        "metadatas": [list(sorted_metadatas)],
			
 
				     }
			
 
				 
			
 
				-    return result
			
 
				-
			
 
				 
			
 
				 def get_all_items_from_collections(collection_names: list[str]) -> dict:
			
 
				     results = []