瀏覽代碼

revert: faulty dedup code

Timothy Jaeryang Baek 2 月之前
父節點
當前提交
93d486d50e
共有 2 個文件被更改,包括 16 次插入和 29 次删除
  1. 1 1
      backend/open_webui/config.py
  2. 15 28
      backend/open_webui/retrieval/utils.py

+ 1 - 1
backend/open_webui/config.py

@@ -1714,7 +1714,7 @@ Respond to the user query using the provided context, incorporating inline citat
 - Respond in the same language as the user's query.
 - Respond in the same language as the user's query.
 - If the context is unreadable or of poor quality, inform the user and provide the best possible answer.
 - If the context is unreadable or of poor quality, inform the user and provide the best possible answer.
 - If the answer isn't present in the context but you possess the knowledge, explain this to the user and provide the answer using your own understanding.
 - If the answer isn't present in the context but you possess the knowledge, explain this to the user and provide the answer using your own understanding.
-- **Only include inline citations using [source_id] when a <source_id> tag is explicitly provided in the context.**  
+- **Only include inline citations using [source_id] (e.g., [1], [2]) when a `<source_id>` tag is explicitly provided in the context.**
 - Do not cite if the <source_id> tag is not provided in the context.  
 - Do not cite if the <source_id> tag is not provided in the context.  
 - Do not use XML tags in your response.
 - Do not use XML tags in your response.
 - Ensure citations are concise and directly related to the information provided.
 - Ensure citations are concise and directly related to the information provided.

+ 15 - 28
backend/open_webui/retrieval/utils.py

@@ -14,7 +14,8 @@ from langchain_core.documents import Document
 
 
 from open_webui.config import VECTOR_DB
 from open_webui.config import VECTOR_DB
 from open_webui.retrieval.vector.connector import VECTOR_DB_CLIENT
 from open_webui.retrieval.vector.connector import VECTOR_DB_CLIENT
-from open_webui.utils.misc import get_last_user_message
+from open_webui.utils.misc import get_last_user_message, calculate_sha256_string
+
 from open_webui.models.users import UserModel
 from open_webui.models.users import UserModel
 
 
 from open_webui.env import (
 from open_webui.env import (
@@ -178,45 +179,31 @@ def merge_and_sort_query_results(
     combined_distances = []
     combined_distances = []
     combined_documents = []
     combined_documents = []
     combined_metadatas = []
     combined_metadatas = []
-    combined_ids = []
 
 
     for data in query_results:
     for data in query_results:
         combined_distances.extend(data["distances"][0])
         combined_distances.extend(data["distances"][0])
         combined_documents.extend(data["documents"][0])
         combined_documents.extend(data["documents"][0])
         combined_metadatas.extend(data["metadatas"][0])
         combined_metadatas.extend(data["metadatas"][0])
-        # DISTINCT(chunk_id,file_id) - in case if id (chunk_ids) become ordinals
-        combined_ids.extend(
-            [
-                f"{id}-{meta['file_id']}"
-                for id, meta in zip(data["ids"][0], data["metadatas"][0])
-            ]
-        )
 
 
-    # Create a list of tuples (distance, document, metadata, ids)
-    combined = list(
-        zip(combined_distances, combined_documents, combined_metadatas, combined_ids)
-    )
+    # Create a list of tuples (distance, document, metadata)
+    combined = list(zip(combined_distances, combined_documents, combined_metadatas))
 
 
     # Sort the list based on distances
     # Sort the list based on distances
     combined.sort(key=lambda x: x[0], reverse=reverse)
     combined.sort(key=lambda x: x[0], reverse=reverse)
 
 
-    sorted_distances = []
-    sorted_documents = []
-    sorted_metadatas = []
-    # Otherwise we don't have anything :-(
-    if combined:
+    # We don't have anything :-(
+    if not combined:
+        sorted_distances = []
+        sorted_documents = []
+        sorted_metadatas = []
+    else:
         # Unzip the sorted list
         # Unzip the sorted list
-        all_distances, all_documents, all_metadatas, all_ids = zip(*combined)
-        seen_ids = set()
+        sorted_distances, sorted_documents, sorted_metadatas = zip(*combined)
+
         # Slicing the lists to include only k elements
         # Slicing the lists to include only k elements
-        for index, id in enumerate(all_ids):
-            if id not in seen_ids:
-                sorted_distances.append(all_distances[index])
-                sorted_documents.append(all_documents[index])
-                sorted_metadatas.append(all_metadatas[index])
-                seen_ids.add(id)
-                if len(sorted_distances) >= k:
-                    break
+        sorted_distances = list(sorted_distances)[:k]
+        sorted_documents = list(sorted_documents)[:k]
+        sorted_metadatas = list(sorted_metadatas)[:k]
 
 
     # Create the output dictionary
     # Create the output dictionary
     result = {
     result = {