|
@@ -5,6 +5,7 @@ from typing import Optional, Union
|
|
|
|
|
|
import asyncio
|
|
|
import requests
|
|
|
+import hashlib
|
|
|
|
|
|
from huggingface_hub import snapshot_download
|
|
|
from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
|
|
@@ -175,46 +176,41 @@ def merge_get_results(get_results: list[dict]) -> dict:
|
|
|
|
|
|
def merge_and_sort_query_results(
    query_results: list[dict], k: int, reverse: bool = False
) -> dict:
    """Merge several query results into one de-duplicated, ranked top-k result.

    Each entry of ``query_results`` is read as
    ``data["distances"][0]``, ``data["documents"][0]`` and
    ``data["metadatas"][0]``; the three lists are walked in lockstep.

    Documents with identical text are collapsed into a single hit. When the
    same text shows up more than once, the occurrence that ranks best under
    the requested ordering is kept (lowest distance by default, highest when
    ``reverse`` is true), so a duplicate never hides a better score.
    Entries whose document is not a ``str`` are skipped.

    Args:
        query_results: Per-collection results in the nested-list layout
            described above.
        k: Maximum number of hits to return. Values <= 0 yield no hits.
        reverse: Sort descending by distance instead of ascending.

    Returns:
        A dict with ``"distances"``, ``"documents"`` and ``"metadatas"``,
        each holding one inner list of at most ``k`` items, aligned by index.

    Raises:
        KeyError: If an entry lacks one of the three expected keys.
    """
    # Keyed by the document text itself: strings are hashable, so no digest is
    # needed, and dicts preserve insertion order, which keeps equal-distance
    # hits in first-seen order after the stable sort below.
    best_by_document: dict[str, tuple] = {}

    for data in query_results:
        distances = data["distances"][0]
        documents = data["documents"][0]
        metadatas = data["metadatas"][0]

        for distance, document, metadata in zip(distances, documents, metadatas):
            if not isinstance(document, str):
                continue

            current = best_by_document.get(document)
            if current is None:
                best_by_document[document] = (distance, document, metadata)
                continue

            # "Better" means "would sort earlier" under the requested order.
            is_better = distance > current[0] if reverse else distance < current[0]
            if is_better:
                best_by_document[document] = (distance, document, metadata)

    ranked = sorted(
        best_by_document.values(), key=lambda item: item[0], reverse=reverse
    )

    # Clamp k so a non-positive value means "no hits" instead of slicing from
    # the end; building the columns by comprehension also copes with an empty
    # slice, which unpacking zip(*[]) into three names would not.
    top = ranked[: max(k, 0)]

    return {
        "distances": [[distance for distance, _, _ in top]],
        "documents": [[document for _, document, _ in top]],
        "metadatas": [[metadata for _, _, metadata in top]],
    }
|
|
|
|
|
|
def get_all_items_from_collections(collection_names: list[str]) -> dict:
|
|
|
results = []
|