milvus.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. from pymilvus import MilvusClient as Client
  2. from pymilvus import FieldSchema, DataType
  3. import json
  4. from typing import Optional
  5. from open_webui.retrieval.vector.main import VectorItem, SearchResult, GetResult
  6. from open_webui.config import (
  7. MILVUS_URI,
  8. MILVUS_DB,
  9. )
  10. class MilvusClient:
  11. def __init__(self):
  12. self.collection_prefix = "open_webui"
  13. self.client = Client(uri=MILVUS_URI, database=MILVUS_DB)
  14. def _result_to_get_result(self, result) -> GetResult:
  15. ids = []
  16. documents = []
  17. metadatas = []
  18. for match in result:
  19. _ids = []
  20. _documents = []
  21. _metadatas = []
  22. for item in match:
  23. _ids.append(item.get("id"))
  24. _documents.append(item.get("data", {}).get("text"))
  25. _metadatas.append(item.get("metadata"))
  26. ids.append(_ids)
  27. documents.append(_documents)
  28. metadatas.append(_metadatas)
  29. return GetResult(
  30. **{
  31. "ids": ids,
  32. "documents": documents,
  33. "metadatas": metadatas,
  34. }
  35. )
  36. def _result_to_search_result(self, result) -> SearchResult:
  37. ids = []
  38. distances = []
  39. documents = []
  40. metadatas = []
  41. for match in result:
  42. _ids = []
  43. _distances = []
  44. _documents = []
  45. _metadatas = []
  46. for item in match:
  47. _ids.append(item.get("id"))
  48. _distances.append(item.get("distance"))
  49. _documents.append(item.get("entity", {}).get("data", {}).get("text"))
  50. _metadatas.append(item.get("entity", {}).get("metadata"))
  51. ids.append(_ids)
  52. distances.append(_distances)
  53. documents.append(_documents)
  54. metadatas.append(_metadatas)
  55. return SearchResult(
  56. **{
  57. "ids": ids,
  58. "distances": distances,
  59. "documents": documents,
  60. "metadatas": metadatas,
  61. }
  62. )
  63. def _create_collection(self, collection_name: str, dimension: int):
  64. schema = self.client.create_schema(
  65. auto_id=False,
  66. enable_dynamic_field=True,
  67. )
  68. schema.add_field(
  69. field_name="id",
  70. datatype=DataType.VARCHAR,
  71. is_primary=True,
  72. max_length=65535,
  73. )
  74. schema.add_field(
  75. field_name="vector",
  76. datatype=DataType.FLOAT_VECTOR,
  77. dim=dimension,
  78. description="vector",
  79. )
  80. schema.add_field(field_name="data", datatype=DataType.JSON, description="data")
  81. schema.add_field(
  82. field_name="metadata", datatype=DataType.JSON, description="metadata"
  83. )
  84. index_params = self.client.prepare_index_params()
  85. index_params.add_index(
  86. field_name="vector",
  87. index_type="HNSW",
  88. metric_type="COSINE",
  89. params={"M": 16, "efConstruction": 100},
  90. )
  91. self.client.create_collection(
  92. collection_name=f"{self.collection_prefix}_{collection_name}",
  93. schema=schema,
  94. index_params=index_params,
  95. )
  96. def has_collection(self, collection_name: str) -> bool:
  97. # Check if the collection exists based on the collection name.
  98. collection_name = collection_name.replace("-", "_")
  99. return self.client.has_collection(
  100. collection_name=f"{self.collection_prefix}_{collection_name}"
  101. )
  102. def delete_collection(self, collection_name: str):
  103. # Delete the collection based on the collection name.
  104. collection_name = collection_name.replace("-", "_")
  105. return self.client.drop_collection(
  106. collection_name=f"{self.collection_prefix}_{collection_name}"
  107. )
  108. def search(
  109. self, collection_name: str, vectors: list[list[float | int]], limit: int
  110. ) -> Optional[SearchResult]:
  111. # Search for the nearest neighbor items based on the vectors and return 'limit' number of results.
  112. collection_name = collection_name.replace("-", "_")
  113. result = self.client.search(
  114. collection_name=f"{self.collection_prefix}_{collection_name}",
  115. data=vectors,
  116. limit=limit,
  117. output_fields=["data", "metadata"],
  118. )
  119. return self._result_to_search_result(result)
  120. def query(self, collection_name: str, filter: dict, limit: Optional[int] = None):
  121. # Construct the filter string for querying
  122. collection_name = collection_name.replace("-", "_")
  123. if not self.has_collection(collection_name):
  124. return None
  125. filter_string = " && ".join(
  126. [
  127. f'metadata["{key}"] == {json.dumps(value)}'
  128. for key, value in filter.items()
  129. ]
  130. )
  131. max_limit = 16383 # The maximum number of records per request
  132. all_results = []
  133. if limit is None:
  134. limit = float("inf") # Use infinity as a placeholder for no limit
  135. # Initialize offset and remaining to handle pagination
  136. offset = 0
  137. remaining = limit
  138. try:
  139. # Loop until there are no more items to fetch or the desired limit is reached
  140. while remaining > 0:
  141. print("remaining", remaining)
  142. current_fetch = min(
  143. max_limit, remaining
  144. ) # Determine how many items to fetch in this iteration
  145. results = self.client.query(
  146. collection_name=f"{self.collection_prefix}_{collection_name}",
  147. filter=filter_string,
  148. output_fields=["*"],
  149. limit=current_fetch,
  150. offset=offset,
  151. )
  152. if not results:
  153. break
  154. all_results.extend(results)
  155. results_count = len(results)
  156. remaining -= (
  157. results_count # Decrease remaining by the number of items fetched
  158. )
  159. offset += results_count
  160. # Break the loop if the results returned are less than the requested fetch count
  161. if results_count < current_fetch:
  162. break
  163. print(all_results)
  164. return self._result_to_get_result([all_results])
  165. except Exception as e:
  166. print(e)
  167. return None
  168. def get(self, collection_name: str) -> Optional[GetResult]:
  169. # Get all the items in the collection.
  170. collection_name = collection_name.replace("-", "_")
  171. result = self.client.query(
  172. collection_name=f"{self.collection_prefix}_{collection_name}",
  173. filter='id != ""',
  174. )
  175. return self._result_to_get_result([result])
  176. def insert(self, collection_name: str, items: list[VectorItem]):
  177. # Insert the items into the collection, if the collection does not exist, it will be created.
  178. collection_name = collection_name.replace("-", "_")
  179. if not self.client.has_collection(
  180. collection_name=f"{self.collection_prefix}_{collection_name}"
  181. ):
  182. self._create_collection(
  183. collection_name=collection_name, dimension=len(items[0]["vector"])
  184. )
  185. return self.client.insert(
  186. collection_name=f"{self.collection_prefix}_{collection_name}",
  187. data=[
  188. {
  189. "id": item["id"],
  190. "vector": item["vector"],
  191. "data": {"text": item["text"]},
  192. "metadata": item["metadata"],
  193. }
  194. for item in items
  195. ],
  196. )
  197. def upsert(self, collection_name: str, items: list[VectorItem]):
  198. # Update the items in the collection, if the items are not present, insert them. If the collection does not exist, it will be created.
  199. collection_name = collection_name.replace("-", "_")
  200. if not self.client.has_collection(
  201. collection_name=f"{self.collection_prefix}_{collection_name}"
  202. ):
  203. self._create_collection(
  204. collection_name=collection_name, dimension=len(items[0]["vector"])
  205. )
  206. return self.client.upsert(
  207. collection_name=f"{self.collection_prefix}_{collection_name}",
  208. data=[
  209. {
  210. "id": item["id"],
  211. "vector": item["vector"],
  212. "data": {"text": item["text"]},
  213. "metadata": item["metadata"],
  214. }
  215. for item in items
  216. ],
  217. )
  218. def delete(
  219. self,
  220. collection_name: str,
  221. ids: Optional[list[str]] = None,
  222. filter: Optional[dict] = None,
  223. ):
  224. # Delete the items from the collection based on the ids.
  225. collection_name = collection_name.replace("-", "_")
  226. if ids:
  227. return self.client.delete(
  228. collection_name=f"{self.collection_prefix}_{collection_name}",
  229. ids=ids,
  230. )
  231. elif filter:
  232. # Convert the filter dictionary to a string using JSON_CONTAINS.
  233. filter_string = " && ".join(
  234. [
  235. f'metadata["{key}"] == {json.dumps(value)}'
  236. for key, value in filter.items()
  237. ]
  238. )
  239. return self.client.delete(
  240. collection_name=f"{self.collection_prefix}_{collection_name}",
  241. filter=filter_string,
  242. )
  243. def reset(self):
  244. # Resets the database. This will delete all collections and item entries.
  245. collection_names = self.client.list_collections()
  246. for collection_name in collection_names:
  247. if collection_name.startswith(self.collection_prefix):
  248. self.client.drop_collection(collection_name=collection_name)