# milvus.py — Milvus-backed vector database client for Open WebUI.
  1. from pymilvus import MilvusClient as Client
  2. from pymilvus import FieldSchema, DataType
  3. import json
  4. from typing import Optional
  5. from open_webui.apps.retrieval.vector.main import VectorItem, SearchResult, GetResult
  6. from open_webui.config import (
  7. MILVUS_URI,
  8. )
  9. class MilvusClient:
  10. def __init__(self):
  11. self.collection_prefix = "open_webui"
  12. self.client = Client(uri=MILVUS_URI)
  13. def _result_to_get_result(self, result) -> GetResult:
  14. ids = []
  15. documents = []
  16. metadatas = []
  17. for match in result:
  18. _ids = []
  19. _documents = []
  20. _metadatas = []
  21. for item in match:
  22. _ids.append(item.get("id"))
  23. _documents.append(item.get("data", {}).get("text"))
  24. _metadatas.append(item.get("metadata"))
  25. ids.append(_ids)
  26. documents.append(_documents)
  27. metadatas.append(_metadatas)
  28. return GetResult(
  29. **{
  30. "ids": ids,
  31. "documents": documents,
  32. "metadatas": metadatas,
  33. }
  34. )
  35. def _result_to_search_result(self, result) -> SearchResult:
  36. ids = []
  37. distances = []
  38. documents = []
  39. metadatas = []
  40. for match in result:
  41. _ids = []
  42. _distances = []
  43. _documents = []
  44. _metadatas = []
  45. for item in match:
  46. _ids.append(item.get("id"))
  47. _distances.append(item.get("distance"))
  48. _documents.append(item.get("entity", {}).get("data", {}).get("text"))
  49. _metadatas.append(item.get("entity", {}).get("metadata"))
  50. ids.append(_ids)
  51. distances.append(_distances)
  52. documents.append(_documents)
  53. metadatas.append(_metadatas)
  54. return SearchResult(
  55. **{
  56. "ids": ids,
  57. "distances": distances,
  58. "documents": documents,
  59. "metadatas": metadatas,
  60. }
  61. )
  62. def _create_collection(self, collection_name: str, dimension: int):
  63. schema = self.client.create_schema(
  64. auto_id=False,
  65. enable_dynamic_field=True,
  66. )
  67. schema.add_field(
  68. field_name="id",
  69. datatype=DataType.VARCHAR,
  70. is_primary=True,
  71. max_length=65535,
  72. )
  73. schema.add_field(
  74. field_name="vector",
  75. datatype=DataType.FLOAT_VECTOR,
  76. dim=dimension,
  77. description="vector",
  78. )
  79. schema.add_field(field_name="data", datatype=DataType.JSON, description="data")
  80. schema.add_field(
  81. field_name="metadata", datatype=DataType.JSON, description="metadata"
  82. )
  83. index_params = self.client.prepare_index_params()
  84. index_params.add_index(
  85. field_name="vector",
  86. index_type="HNSW",
  87. metric_type="COSINE",
  88. params={"M": 16, "efConstruction": 100},
  89. )
  90. self.client.create_collection(
  91. collection_name=f"{self.collection_prefix}_{collection_name}",
  92. schema=schema,
  93. index_params=index_params,
  94. )
  95. def has_collection(self, collection_name: str) -> bool:
  96. # Check if the collection exists based on the collection name.
  97. collection_name = collection_name.replace("-", "_")
  98. return self.client.has_collection(
  99. collection_name=f"{self.collection_prefix}_{collection_name}"
  100. )
  101. def delete_collection(self, collection_name: str):
  102. # Delete the collection based on the collection name.
  103. collection_name = collection_name.replace("-", "_")
  104. return self.client.drop_collection(
  105. collection_name=f"{self.collection_prefix}_{collection_name}"
  106. )
  107. def search(
  108. self, collection_name: str, vectors: list[list[float | int]], limit: int
  109. ) -> Optional[SearchResult]:
  110. # Search for the nearest neighbor items based on the vectors and return 'limit' number of results.
  111. collection_name = collection_name.replace("-", "_")
  112. result = self.client.search(
  113. collection_name=f"{self.collection_prefix}_{collection_name}",
  114. data=vectors,
  115. limit=limit,
  116. output_fields=["data", "metadata"],
  117. )
  118. return self._result_to_search_result(result)
  119. def query(self, collection_name: str, filter: dict, limit: Optional[int] = None):
  120. # Construct the filter string for querying
  121. collection_name = collection_name.replace("-", "_")
  122. if not self.has_collection(collection_name):
  123. return None
  124. filter_string = " && ".join(
  125. [
  126. f'metadata["{key}"] == {json.dumps(value)}'
  127. for key, value in filter.items()
  128. ]
  129. )
  130. max_limit = 16383 # The maximum number of records per request
  131. all_results = []
  132. if limit is None:
  133. limit = float("inf") # Use infinity as a placeholder for no limit
  134. # Initialize offset and remaining to handle pagination
  135. offset = 0
  136. remaining = limit
  137. try:
  138. # Loop until there are no more items to fetch or the desired limit is reached
  139. while remaining > 0:
  140. print("remaining", remaining)
  141. current_fetch = min(
  142. max_limit, remaining
  143. ) # Determine how many items to fetch in this iteration
  144. results = self.client.query(
  145. collection_name=f"{self.collection_prefix}_{collection_name}",
  146. filter=filter_string,
  147. output_fields=["*"],
  148. limit=current_fetch,
  149. offset=offset,
  150. )
  151. if not results:
  152. break
  153. all_results.extend(results)
  154. results_count = len(results)
  155. remaining -= (
  156. results_count # Decrease remaining by the number of items fetched
  157. )
  158. offset += results_count
  159. # Break the loop if the results returned are less than the requested fetch count
  160. if results_count < current_fetch:
  161. break
  162. print(all_results)
  163. return self._result_to_get_result([all_results])
  164. except Exception as e:
  165. print(e)
  166. return None
  167. def get(self, collection_name: str) -> Optional[GetResult]:
  168. # Get all the items in the collection.
  169. collection_name = collection_name.replace("-", "_")
  170. result = self.client.query(
  171. collection_name=f"{self.collection_prefix}_{collection_name}",
  172. filter='id != ""',
  173. )
  174. return self._result_to_get_result([result])
  175. def insert(self, collection_name: str, items: list[VectorItem]):
  176. # Insert the items into the collection, if the collection does not exist, it will be created.
  177. collection_name = collection_name.replace("-", "_")
  178. if not self.client.has_collection(
  179. collection_name=f"{self.collection_prefix}_{collection_name}"
  180. ):
  181. self._create_collection(
  182. collection_name=collection_name, dimension=len(items[0]["vector"])
  183. )
  184. return self.client.insert(
  185. collection_name=f"{self.collection_prefix}_{collection_name}",
  186. data=[
  187. {
  188. "id": item["id"],
  189. "vector": item["vector"],
  190. "data": {"text": item["text"]},
  191. "metadata": item["metadata"],
  192. }
  193. for item in items
  194. ],
  195. )
  196. def upsert(self, collection_name: str, items: list[VectorItem]):
  197. # Update the items in the collection, if the items are not present, insert them. If the collection does not exist, it will be created.
  198. collection_name = collection_name.replace("-", "_")
  199. if not self.client.has_collection(
  200. collection_name=f"{self.collection_prefix}_{collection_name}"
  201. ):
  202. self._create_collection(
  203. collection_name=collection_name, dimension=len(items[0]["vector"])
  204. )
  205. return self.client.upsert(
  206. collection_name=f"{self.collection_prefix}_{collection_name}",
  207. data=[
  208. {
  209. "id": item["id"],
  210. "vector": item["vector"],
  211. "data": {"text": item["text"]},
  212. "metadata": item["metadata"],
  213. }
  214. for item in items
  215. ],
  216. )
  217. def delete(
  218. self,
  219. collection_name: str,
  220. ids: Optional[list[str]] = None,
  221. filter: Optional[dict] = None,
  222. ):
  223. # Delete the items from the collection based on the ids.
  224. collection_name = collection_name.replace("-", "_")
  225. if ids:
  226. return self.client.delete(
  227. collection_name=f"{self.collection_prefix}_{collection_name}",
  228. ids=ids,
  229. )
  230. elif filter:
  231. # Convert the filter dictionary to a string using JSON_CONTAINS.
  232. filter_string = " && ".join(
  233. [
  234. f'metadata["{key}"] == {json.dumps(value)}'
  235. for key, value in filter.items()
  236. ]
  237. )
  238. return self.client.delete(
  239. collection_name=f"{self.collection_prefix}_{collection_name}",
  240. filter=filter_string,
  241. )
  242. def reset(self):
  243. # Resets the database. This will delete all collections and item entries.
  244. collection_names = self.client.list_collections()
  245. for collection_name in collection_names:
  246. if collection_name.startswith(self.collection_prefix):
  247. self.client.drop_collection(collection_name=collection_name)