from typing import Optional

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan

from open_webui.retrieval.vector.main import VectorItem, SearchResult, GetResult
from open_webui.config import (
    ELASTICSEARCH_URL,
    ELASTICSEARCH_CA_CERTS,
    ELASTICSEARCH_API_KEY,
    ELASTICSEARCH_USERNAME,
    ELASTICSEARCH_PASSWORD,
    ELASTICSEARCH_CLOUD_ID,
    ELASTICSEARCH_INDEX_PREFIX,
    SSL_ASSERT_FINGERPRINT,
)


class ElasticsearchClient:
    """
    Important:
    To keep the number of indices small, and because the embedding vector length is
    fixed per model, we avoid creating an index per file. Instead, documents are
    stored with their content as a text field and are partitioned into separate
    indices based on the embedding length.
    """

    def __init__(self):
        self.index_prefix = ELASTICSEARCH_INDEX_PREFIX
        self.client = Elasticsearch(
            hosts=[ELASTICSEARCH_URL],
            ca_certs=ELASTICSEARCH_CA_CERTS,
            api_key=ELASTICSEARCH_API_KEY,
            cloud_id=ELASTICSEARCH_CLOUD_ID,
            basic_auth=(
                (ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD)
                if ELASTICSEARCH_USERNAME and ELASTICSEARCH_PASSWORD
                else None
            ),
            ssl_assert_fingerprint=SSL_ASSERT_FINGERPRINT,
        )

    # Status: works
    def _get_index_name(self, dimension: int) -> str:
        return f"{self.index_prefix}_d{dimension}"

    # Status: works
    def _scan_result_to_get_result(self, result) -> Optional[GetResult]:
        if not result:
            return None
        ids = []
        documents = []
        metadatas = []
        for hit in result:
            ids.append(hit["_id"])
            documents.append(hit["_source"].get("text"))
            metadatas.append(hit["_source"].get("metadata"))
        # GetResult fields are nested lists (one inner list per query), hence the
        # extra level of wrapping.
        return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas])

    # Status: works
    def _result_to_get_result(self, result) -> Optional[GetResult]:
        if not result["hits"]["hits"]:
            return None
        ids = []
        documents = []
        metadatas = []
        for hit in result["hits"]["hits"]:
            ids.append(hit["_id"])
            documents.append(hit["_source"].get("text"))
            metadatas.append(hit["_source"].get("metadata"))
        return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas])

    # Status: works
    def _result_to_search_result(self, result) -> SearchResult:
        ids = []
        distances = []
        documents = []
        metadatas = []
        for hit in result["hits"]["hits"]:
            ids.append(hit["_id"])
            distances.append(hit["_score"])
            documents.append(hit["_source"].get("text"))
            metadatas.append(hit["_source"].get("metadata"))
        return SearchResult(
            ids=[ids],
            distances=[distances],
            documents=[documents],
            metadatas=[metadatas],
        )

    # Status: works
    def _create_index(self, dimension: int):
        body = {
            "mappings": {
                "dynamic_templates": [
                    {
                        "strings": {
                            "match_mapping_type": "string",
                            "mapping": {"type": "keyword"},
                        }
                    }
                ],
                "properties": {
                    "collection": {"type": "keyword"},
                    "id": {"type": "keyword"},
                    "vector": {
                        "type": "dense_vector",
                        "dims": dimension,  # one index per embedding dimension
                        "index": True,
                        "similarity": "cosine",
                    },
                    "text": {"type": "text"},
                    "metadata": {"type": "object"},
                },
            }
        }
        self.client.indices.create(index=self._get_index_name(dimension), body=body)

    # Status: works
    def _create_batches(self, items: list[VectorItem], batch_size=100):
        # Slicing already clamps to the list length, so no min() is needed.
        for i in range(0, len(items), batch_size):
            yield items[i : i + batch_size]

    # Status: works
    def has_collection(self, collection_name: str) -> bool:
        query_body = {
            "query": {"bool": {"filter": [{"term": {"collection": collection_name}}]}}
        }
        try:
            result = self.client.count(index=f"{self.index_prefix}*", body=query_body)
            return result.body["count"] > 0
        except Exception:
            # Treat any failure (e.g. no matching index yet) as "collection absent".
            return False

    def delete_collection(self, collection_name: str):
        query = {"query": {"term": {"collection": collection_name}}}
        self.client.delete_by_query(index=f"{self.index_prefix}*", body=query)

    # Status: works
    def search(
        self, collection_name: str, vectors: list[list[float]], limit: int
    ) -> Optional[SearchResult]:
        query = {
            "size": limit,
            "_source": ["text", "metadata"],
            "query": {
                "script_score": {
                    "query": {
                        "bool": {"filter": [{"term": {"collection": collection_name}}]}
                    },
                    "script": {
                        "source": "cosineSimilarity(params.vector, 'vector') + 1.0",
                        # Assumes a single query vector.
                        "params": {"vector": vectors[0]},
                    },
                }
            },
        }
        result = self.client.search(
            index=self._get_index_name(len(vectors[0])), body=query
        )
        return self._result_to_search_result(result)
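
    # Note: the Painless cosineSimilarity() function returns values in [-1, 1],
    # and script_score queries must not produce negative scores, hence the
    # "+ 1.0" above; a returned _score of s corresponds to a raw cosine
    # similarity of s - 1.0.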

    # Status: only tested halfway
    def query(
        self, collection_name: str, filter: dict, limit: Optional[int] = None
    ) -> Optional[GetResult]:
        if not self.has_collection(collection_name):
            return None
        query_body = {
            "query": {"bool": {"filter": []}},
            "_source": ["text", "metadata"],
        }
        for field, value in filter.items():
            query_body["query"]["bool"]["filter"].append({"term": {field: value}})
        query_body["query"]["bool"]["filter"].append(
            {"term": {"collection": collection_name}}
        )
        size = limit if limit else 10
        try:
            result = self.client.search(
                index=f"{self.index_prefix}*",
                body=query_body,
                size=size,
            )
            return self._result_to_get_result(result)
        except Exception:
            return None

    # Status: works
    def _has_index(self, dimension: int):
        return self.client.indices.exists(
            index=self._get_index_name(dimension=dimension)
        )

    def get_or_create_index(self, dimension: int):
        if not self._has_index(dimension=dimension):
            self._create_index(dimension=dimension)

    # Status: works
    def get(self, collection_name: str) -> Optional[GetResult]:
        # Get all the items in the collection.
        query = {
            "query": {"bool": {"filter": [{"term": {"collection": collection_name}}]}},
            "_source": ["text", "metadata"],
        }
        results = list(scan(self.client, index=f"{self.index_prefix}*", query=query))
        return self._scan_result_to_get_result(results)

    # Status: works
    def insert(self, collection_name: str, items: list[VectorItem]):
        if not self._has_index(dimension=len(items[0]["vector"])):
            self._create_index(dimension=len(items[0]["vector"]))
        for batch in self._create_batches(items):
            actions = [
                {
                    # Route each item by its own vector dimension (mirrors upsert()).
                    "_index": self._get_index_name(dimension=len(item["vector"])),
                    "_id": item["id"],
                    "_source": {
                        "collection": collection_name,
                        "vector": item["vector"],
                        "text": item["text"],
                        "metadata": item["metadata"],
                    },
                }
                for item in batch
            ]
            bulk(self.client, actions)

    # Upsert documents using the update API with doc_as_upsert=True.
    def upsert(self, collection_name: str, items: list[VectorItem]):
        if not self._has_index(dimension=len(items[0]["vector"])):
            self._create_index(dimension=len(items[0]["vector"]))
        for batch in self._create_batches(items):
            actions = [
                {
                    "_op_type": "update",
                    "_index": self._get_index_name(dimension=len(item["vector"])),
                    "_id": item["id"],
                    "doc": {
                        "collection": collection_name,
                        "vector": item["vector"],
                        "text": item["text"],
                        "metadata": item["metadata"],
                    },
                    "doc_as_upsert": True,
                }
                for item in batch
            ]
            bulk(self.client, actions)
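
    # Note: with doc_as_upsert, an existing document gets a partial update (the
    # fields in "doc" are merged in), while a missing document is created from
    # "doc" as-is. The plain index actions used by insert() above would instead
    # replace existing documents wholesale.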

    # Delete specific documents from a collection, filtering on the collection
    # plus either document IDs or metadata fields.
    def delete(
        self,
        collection_name: str,
        ids: Optional[list[str]] = None,
        filter: Optional[dict] = None,
    ):
        query = {
            "query": {"bool": {"filter": [{"term": {"collection": collection_name}}]}}
        }
        # Precedence of ids over the metadata filter mirrors ChromaDB's behavior.
        if ids:
            query["query"]["bool"]["filter"].append({"terms": {"_id": ids}})
        elif filter:
            for field, value in filter.items():
                query["query"]["bool"]["filter"].append(
                    {"term": {f"metadata.{field}": value}}
                )
        self.client.delete_by_query(index=f"{self.index_prefix}*", body=query)

    def reset(self):
        # Drop every index created under this prefix.
        indices = self.client.indices.get(index=f"{self.index_prefix}*")
        for index in indices:
            self.client.indices.delete(index=index)
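

# A minimal usage sketch, not part of the client: it assumes a reachable
# Elasticsearch instance configured through the open_webui config values
# imported above. The collection name and the 4-dim vectors are made up for
# illustration, and items are plain dicts carrying the fields the client reads.
if __name__ == "__main__":
    client = ElasticsearchClient()
    items = [
        {
            "id": "doc-1",
            "text": "hello world",
            "vector": [0.1, 0.2, 0.3, 0.4],
            "metadata": {"source": "example.txt"},
        }
    ]
    client.insert("example-collection", items)
    # Refresh so newly indexed documents are immediately searchable.
    client.client.indices.refresh(index=f"{client.index_prefix}*")
    result = client.search("example-collection", [[0.1, 0.2, 0.3, 0.4]], limit=1)
    print(result)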