Forráskód Böngészése

refac: process docs dir

Timothy J. Baek 7 hónapja
szülő
commit
a6c797d4c2

+ 0 - 63
backend/open_webui/apps/retrieval/main.py

@@ -44,7 +44,6 @@ from open_webui.apps.retrieval.utils import (
     query_doc_with_hybrid_search,
 )
 
-from open_webui.apps.webui.models.documents import DocumentForm, Documents
 from open_webui.apps.webui.models.files import Files
 from open_webui.config import (
     BRAVE_SEARCH_API_KEY,
@@ -1100,68 +1099,6 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
         )
 
 
-@app.get("/process/dir")
-def process_docs_dir(user=Depends(get_admin_user)):
-    for path in Path(DOCS_DIR).rglob("./**/*"):
-        try:
-            if path.is_file() and not path.name.startswith("."):
-                tags = extract_folders_after_data_docs(path)
-                filename = path.name
-                file_content_type = mimetypes.guess_type(path)
-
-                with open(path, "rb") as f:
-                    collection_name = calculate_sha256(f)[:63]
-
-                loader = Loader(
-                    engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
-                    TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
-                    PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
-                )
-                docs = loader.load(filename, file_content_type[0], str(path))
-
-                try:
-                    result = save_docs_to_vector_db(docs, collection_name)
-
-                    if result:
-                        sanitized_filename = sanitize_filename(filename)
-                        doc = Documents.get_doc_by_name(sanitized_filename)
-
-                        if doc is None:
-                            doc = Documents.insert_new_doc(
-                                user.id,
-                                DocumentForm(
-                                    **{
-                                        "name": sanitized_filename,
-                                        "title": filename,
-                                        "collection_name": collection_name,
-                                        "filename": filename,
-                                        "content": (
-                                            json.dumps(
-                                                {
-                                                    "tags": list(
-                                                        map(
-                                                            lambda name: {"name": name},
-                                                            tags,
-                                                        )
-                                                    )
-                                                }
-                                            )
-                                            if len(tags)
-                                            else "{}"
-                                        ),
-                                    }
-                                ),
-                            )
-                except Exception as e:
-                    log.exception(e)
-                    pass
-
-        except Exception as e:
-            log.exception(e)
-
-    return True
-
-
 class QueryDocForm(BaseModel):
     collection_name: str
     query: str

+ 51 - 2
backend/open_webui/apps/webui/routers/files.py

@@ -5,17 +5,21 @@ import uuid
 from pathlib import Path
 from typing import Optional
 from pydantic import BaseModel
+import mimetypes
+
 
 from open_webui.apps.webui.models.files import FileForm, FileModel, Files
 from open_webui.apps.retrieval.main import process_file, ProcessFileForm
 
-from open_webui.config import UPLOAD_DIR
-from open_webui.constants import ERROR_MESSAGES
+from open_webui.config import UPLOAD_DIR, DOCS_DIR
 from open_webui.env import SRC_LOG_LEVELS
+from open_webui.constants import ERROR_MESSAGES
 
 
 from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 from fastapi.responses import FileResponse, StreamingResponse
+
+
 from open_webui.utils.utils import get_admin_user, get_verified_user
 
 log = logging.getLogger(__name__)
@@ -86,6 +90,51 @@ def upload_file(file: UploadFile = File(...), user=Depends(get_verified_user)):
         )
 
 
+@router.post("/upload/dir")
+def upload_dir(user=Depends(get_admin_user)):
+    for path in Path(DOCS_DIR).rglob("./**/*"):
+        if path.is_file() and not path.name.startswith("."):
+            try:
+                log.debug(f"Processing file from path: {path}")
+
+                filename = path.name
+                file_content_type = mimetypes.guess_type(path)
+
+                # replace filename with uuid
+                id = str(uuid.uuid4())
+                name = filename
+
+                contents = path.read_bytes()
+                file_path = str(path)
+
+                file = Files.insert_new_file(
+                    user.id,
+                    FileForm(
+                        **{
+                            "id": id,
+                            "filename": filename,
+                            "meta": {
+                                "name": name,
+                                "content_type": file_content_type,
+                                "size": len(contents),
+                                "path": file_path,
+                            },
+                        }
+                    ),
+                )
+
+                try:
+                    process_file(ProcessFileForm(file_id=id))
+                    log.debug(f"File processed: {path}, {file.id}")
+                except Exception as e:
+                    log.exception(e)
+                    log.error(f"Error processing file: {file.id}")
+            except Exception as e:
+                log.exception(e)
+                pass
+    return True
+
+
 ############################
 # List Files
 ############################

+ 26 - 0
src/lib/apis/files/index.ts

@@ -30,6 +30,32 @@ export const uploadFile = async (token: string, file: File) => {
 	return res;
 };
 
+// Ask the backend to scan its documents directory (POST /files/upload/dir).
+// Resolves with the parsed JSON body; throws the response's `detail` on failure.
+export const uploadDir = async (token: string) => {
+	let failure = null;
+	let payload = null;
+
+	try {
+		const response = await fetch(`${WEBUI_API_BASE_URL}/files/upload/dir`, {
+			method: 'POST',
+			headers: {
+				Accept: 'application/json',
+				authorization: `Bearer ${token}`
+			}
+		});
+		const body = await response.json();
+		if (!response.ok) throw body;
+		payload = body;
+	} catch (err: any) {
+		// Only `detail` is kept; an error without it leaves the result as null.
+		failure = err.detail;
+	}
+
+	if (failure) throw failure;
+	return payload;
+};
+
 export const getFiles = async (token: string = '') => {
 	let error = null;
 

+ 0 - 26
src/lib/apis/retrieval/index.ts

@@ -342,32 +342,6 @@ export const processFile = async (
 	return res;
 };
 
-export const processDocsDir = async (token: string) => {
-	let error = null;
-
-	const res = await fetch(`${RETRIEVAL_API_BASE_URL}/process/dir`, {
-		method: 'GET',
-		headers: {
-			Accept: 'application/json',
-			authorization: `Bearer ${token}`
-		}
-	})
-		.then(async (res) => {
-			if (!res.ok) throw await res.json();
-			return res.json();
-		})
-		.catch((err) => {
-			error = err.detail;
-			return null;
-		});
-
-	if (error) {
-		throw error;
-	}
-
-	return res;
-};
-
 export const processYoutubeVideo = async (token: string, url: string) => {
 	let error = null;
 

+ 2 - 4
src/lib/components/admin/Settings/Documents.svelte

@@ -7,7 +7,6 @@
 
 	import {
 		getQuerySettings,
-		processDocsDir,
 		updateQuerySettings,
 		resetVectorDB,
 		getEmbeddingConfig,
@@ -21,7 +20,7 @@
 
 	import { knowledge, models } from '$lib/stores';
 	import { getKnowledgeItems } from '$lib/apis/knowledge';
-	import { deleteAllFiles, deleteFileById } from '$lib/apis/files';
+	import { uploadDir, deleteAllFiles, deleteFileById } from '$lib/apis/files';
 
 	import ResetUploadDirConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
 	import ResetVectorDBConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
@@ -65,11 +64,10 @@
 
 	const scanHandler = async () => {
 		scanDirLoading = true;
-		const res = await processDocsDir(localStorage.token);
+		const res = await uploadDir(localStorage.token);
 		scanDirLoading = false;
 
 		if (res) {
-			await knowledge.set(await getKnowledgeItems(localStorage.token));
 			toast.success($i18n.t('Scan complete!'));
 		}
 	};