Timothy J. Baek 7 tháng trước cách đây
mục cha
commit
17c772831d

+ 56 - 18
backend/open_webui/apps/retrieval/main.py

@@ -700,17 +700,25 @@ def save_docs_to_vector_db(
             list(map(lambda x: x.replace("\n", " "), texts))
         )
 
+        items = [
+            {
+                "id": str(uuid.uuid4()),
+                "text": text,
+                "vector": embeddings[idx],
+                "metadata": metadatas[idx],
+            }
+            for idx, text in enumerate(texts)
+        ]
+
+        if file_id:
+            VECTOR_DB_CLIENT.insert(
+                collection_name=f"file-{file_id}",
+                items=items,
+            )
+
         VECTOR_DB_CLIENT.insert(
             collection_name=collection_name,
-            items=[
-                {
-                    "id": str(uuid.uuid4()),
-                    "text": text,
-                    "vector": embeddings[idx],
-                    "metadata": metadatas[idx],
-                }
-                for idx, text in enumerate(texts)
-            ],
+            items=items,
         )
 
         return True
@@ -721,6 +729,7 @@ def save_docs_to_vector_db(
 
 class ProcessFileForm(BaseModel):
     file_id: str
+    content: Optional[str] = None
     collection_name: Optional[str] = None
 
 
@@ -742,29 +751,58 @@ def process_file(
             PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
         )
 
-        file_path = file.meta.get("path", None)
-        if file_path:
-            docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
-        else:
+        if form_data.content:
+            docs = [
+                Document(
+                    page_content=form_data.content,
+                    metadata={
+                        "name": file.meta.get("name", file.filename),
+                        "created_by": file.user_id,
+                        **file.meta,
+                    },
+                )
+            ]
+
+            text_content = form_data.content
+        elif file.data.get("content", None):
             docs = [
                 Document(
                     page_content=file.data.get("content", ""),
                     metadata={
-                        "name": file.filename,
+                        "name": file.meta.get("name", file.filename),
                         "created_by": file.user_id,
                         **file.meta,
                     },
                 )
             ]
+            text_content = file.data.get("content", "")
+        else:
+            file_path = file.meta.get("path", None)
+            if file_path:
+                docs = loader.load(
+                    file.filename, file.meta.get("content_type"), file_path
+                )
+            else:
+                docs = [
+                    Document(
+                        page_content=file.data.get("content", ""),
+                        metadata={
+                            "name": file.filename,
+                            "created_by": file.user_id,
+                            **file.meta,
+                        },
+                    )
+                ]
+
+            text_content = " ".join([doc.page_content for doc in docs])
 
-        text_content = " ".join([doc.page_content for doc in docs])
         log.debug(f"text_content: {text_content}")
-        hash = calculate_sha256_string(text_content)
-
         Files.update_file_data_by_id(
             file.id,
             {"content": text_content},
         )
+
+        hash = calculate_sha256_string(text_content)
         Files.update_file_hash_by_id(file.id, hash)
 
         try:
@@ -772,7 +810,7 @@ def process_file(
                 docs=docs,
                 collection_name=collection_name,
                 metadata={
-                    "file_id": form_data.file_id,
+                    "file_id": file.id,
                     "name": file.meta.get("name", file.filename),
                     "hash": hash,
                 },

+ 50 - 13
backend/open_webui/apps/webui/routers/files.py

@@ -4,6 +4,7 @@ import shutil
 import uuid
 from pathlib import Path
 from typing import Optional
+from pydantic import BaseModel
 
 from open_webui.apps.webui.models.files import FileForm, FileModel, Files
 from open_webui.apps.retrieval.main import process_file, ProcessFileForm
@@ -154,6 +155,55 @@ async def get_file_by_id(id: str, user=Depends(get_verified_user)):
         )
 
 
+############################
+# Get File Data Content By Id
+############################
+
+
+@router.get("/{id}/data/content")
+async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
+    file = Files.get_file_by_id(id)
+
+    if file and (file.user_id == user.id or user.role == "admin"):
+        return {"content": file.data.get("content", "")}
+    else:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=ERROR_MESSAGES.NOT_FOUND,
+        )
+
+
+############################
+# Update File Data Content By Id
+############################
+
+
+class ContentForm(BaseModel):
+    content: str
+
+
+@router.post("/{id}/data/content/update")
+async def update_file_data_content_by_id(
+    id: str, form_data: ContentForm, user=Depends(get_verified_user)
+):
+    file = Files.get_file_by_id(id)
+
+    if file and (file.user_id == user.id or user.role == "admin"):
+        try:
+            process_file(ProcessFileForm(file_id=id, content=form_data.content))
+            file = Files.get_file_by_id(id=id)
+        except Exception as e:
+            log.exception(e)
+            log.error(f"Error processing file: {file.id}")
+
+        return {"content": file.data.get("content", "")}
+    else:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=ERROR_MESSAGES.NOT_FOUND,
+        )
+
+
 ############################
 # Get File Content By Id
 ############################
@@ -182,19 +232,6 @@ async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
         )
 
 
-@router.get("/{id}/content/text")
-async def get_file_text_content_by_id(id: str, user=Depends(get_verified_user)):
-    file = Files.get_file_by_id(id)
-
-    if file and (file.user_id == user.id or user.role == "admin"):
-        return {"text": file.data.get("content")}
-    else:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=ERROR_MESSAGES.NOT_FOUND,
-        )
-
-
 @router.get("/{id}/content/{file_name}", response_model=Optional[FileModel])
 async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
     file = Files.get_file_by_id(id)