Timothy J. Baek hai 7 meses
pai
achega
92dd173b27

+ 77 - 64
backend/open_webui/apps/retrieval/main.py

@@ -726,7 +726,6 @@ def process_file(
         )
         docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
         text_content = " ".join([doc.page_content for doc in docs])
-
         log.debug(f"text_content: {text_content}")
 
         Files.update_files_metadata_by_id(
@@ -795,10 +794,17 @@ def process_text(
             metadata={"name": form_data.name, "created_by": user.id},
         )
     ]
+    text_content = form_data.content
+    log.debug(f"text_content: {text_content}")
+
     result = save_docs_to_vector_db(docs, collection_name)
 
     if result:
-        return {"status": True, "collection_name": collection_name}
+        return {
+            "status": True,
+            "collection_name": collection_name,
+            "content": text_content,
+        }
     else:
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -806,68 +812,6 @@ def process_text(
         )
 
 
-@app.get("/process/dir")
-def process_docs_dir(user=Depends(get_admin_user)):
-    for path in Path(DOCS_DIR).rglob("./**/*"):
-        try:
-            if path.is_file() and not path.name.startswith("."):
-                tags = extract_folders_after_data_docs(path)
-                filename = path.name
-                file_content_type = mimetypes.guess_type(path)
-
-                with open(path, "rb") as f:
-                    collection_name = calculate_sha256(f)[:63]
-
-                loader = Loader(
-                    engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
-                    TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
-                    PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
-                )
-                docs = loader.load(filename, file_content_type[0], str(path))
-
-                try:
-                    result = save_docs_to_vector_db(docs, collection_name)
-
-                    if result:
-                        sanitized_filename = sanitize_filename(filename)
-                        doc = Documents.get_doc_by_name(sanitized_filename)
-
-                        if doc is None:
-                            doc = Documents.insert_new_doc(
-                                user.id,
-                                DocumentForm(
-                                    **{
-                                        "name": sanitized_filename,
-                                        "title": filename,
-                                        "collection_name": collection_name,
-                                        "filename": filename,
-                                        "content": (
-                                            json.dumps(
-                                                {
-                                                    "tags": list(
-                                                        map(
-                                                            lambda name: {"name": name},
-                                                            tags,
-                                                        )
-                                                    )
-                                                }
-                                            )
-                                            if len(tags)
-                                            else "{}"
-                                        ),
-                                    }
-                                ),
-                            )
-                except Exception as e:
-                    log.exception(e)
-                    pass
-
-        except Exception as e:
-            log.exception(e)
-
-    return True
-
-
 @app.post("/process/youtube")
 def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
     try:
@@ -882,12 +826,15 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
             translation=app.state.YOUTUBE_LOADER_TRANSLATION,
         )
         docs = loader.load()
+        text_content = " ".join([doc.page_content for doc in docs])
+        log.debug(f"text_content: {text_content}")
         save_docs_to_vector_db(docs, collection_name, overwrite=True)
 
         return {
             "status": True,
             "collection_name": collection_name,
             "filename": form_data.url,
+            "content": text_content,
         }
     except Exception as e:
         log.exception(e)
@@ -910,12 +857,15 @@ def process_web(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
             requests_per_second=app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
         )
         docs = loader.load()
+        text_content = " ".join([doc.page_content for doc in docs])
+        log.debug(f"text_content: {text_content}")
         save_docs_to_vector_db(docs, collection_name, overwrite=True)
 
         return {
             "status": True,
             "collection_name": collection_name,
             "filename": form_data.url,
+            "content": text_content,
         }
     except Exception as e:
         log.exception(e)
@@ -1067,6 +1017,7 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
 
         loader = get_web_loader(urls)
         docs = loader.load()
+
         save_docs_to_vector_db(docs, collection_name, overwrite=True)
 
         return {
@@ -1082,6 +1033,68 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
         )
 
 
+@app.get("/process/dir")
+def process_docs_dir(user=Depends(get_admin_user)):  # walks DOCS_DIR and indexes each file; user comes from the get_admin_user dependency
+    for path in Path(DOCS_DIR).rglob("./**/*"):  # recursive walk; entries are filtered by the is_file() check below
+        try:
+            if path.is_file() and not path.name.startswith("."):  # only regular files whose name does not start with "."
+                tags = extract_folders_after_data_docs(path)  # helper defined elsewhere; result is iterated as tag names below
+                filename = path.name
+                file_content_type = mimetypes.guess_type(path)  # (type, encoding) tuple; only element [0] is passed to the loader
+
+                with open(path, "rb") as f:
+                    collection_name = calculate_sha256(f)[:63]  # first 63 characters of the value calculate_sha256 returns for this file
+
+                loader = Loader(
+                    engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
+                    TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
+                    PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
+                )
+                docs = loader.load(filename, file_content_type[0], str(path))
+
+                try:  # inner try: a failure while storing/registering this file is logged and the walk goes on
+                    result = save_docs_to_vector_db(docs, collection_name)
+
+                    if result:  # register a document record only when the save returned a truthy result
+                        sanitized_filename = sanitize_filename(filename)
+                        doc = Documents.get_doc_by_name(sanitized_filename)
+
+                        if doc is None:  # insert only when the lookup by sanitized name returned None
+                            doc = Documents.insert_new_doc(
+                                user.id,
+                                DocumentForm(
+                                    **{
+                                        "name": sanitized_filename,
+                                        "title": filename,
+                                        "collection_name": collection_name,
+                                        "filename": filename,
+                                        "content": (  # JSON string {"tags": [{"name": ...}, ...]} when tags is non-empty, else "{}"
+                                            json.dumps(
+                                                {
+                                                    "tags": list(
+                                                        map(
+                                                            lambda name: {"name": name},
+                                                            tags,
+                                                        )
+                                                    )
+                                                }
+                                            )
+                                            if len(tags)
+                                            else "{}"
+                                        ),
+                                    }
+                                ),
+                            )
+                except Exception as e:
+                    log.exception(e)
+                    pass  # NOTE(review): redundant after log.exception(e)
+
+        except Exception as e:  # outer handler: errors while reading/loading one path are logged; the loop continues
+            log.exception(e)
+
+    return True  # always True once the walk finishes, even if individual files failed
+
+
 class QueryDocForm(BaseModel):
     collection_name: str
     query: str

+ 11 - 8
src/lib/components/chat/MessageInput/Commands.svelte

@@ -30,7 +30,7 @@
 	const uploadWeb = async (url) => {
 		console.log(url);
 
-		const doc = {
+		const fileItem = {
 			type: 'doc',
 			name: url,
 			collection_name: '',
@@ -40,12 +40,14 @@
 		};
 
 		try {
-			files = [...files, doc];
+			files = [...files, fileItem];
 			const res = await processWeb(localStorage.token, '', url);
 
 			if (res) {
-				doc.status = 'processed';
-				doc.collection_name = res.collection_name;
+				fileItem.status = 'processed';
+				fileItem.collection_name = res.collection_name;
+				fileItem.content = res.content;
+
 				files = files;
 			}
 		} catch (e) {
@@ -58,7 +60,7 @@
 	const uploadYoutubeTranscription = async (url) => {
 		console.log(url);
 
-		const doc = {
+		const fileItem = {
 			type: 'doc',
 			name: url,
 			collection_name: '',
@@ -68,12 +70,13 @@
 		};
 
 		try {
-			files = [...files, doc];
+			files = [...files, fileItem];
 			const res = await processYoutubeVideo(localStorage.token, url);
 
 			if (res) {
-				doc.status = 'processed';
-				doc.collection_name = res.collection_name;
+				fileItem.status = 'processed';
+				fileItem.collection_name = res.collection_name;
+				fileItem.content = res.content;
 				files = files;
 			}
 		} catch (e) {

+ 2 - 0
src/lib/components/common/FileItem.svelte

@@ -39,6 +39,8 @@
 				if (url) {
 					if (type === 'file') {
 						window.open(`${url}/content`, '_blank').focus();
+					} else {
+						window.open(`${url}`, '_blank').focus();
 					}
 				}
 			}

+ 8 - 2
src/lib/components/common/FileItemModal.svelte

@@ -20,8 +20,14 @@
 	<div class="font-primary px-6 py-5 w-full flex flex-col justify-center dark:text-gray-400">
 		<div class="flex items-start justify-between pb-2">
 			<div>
-				<div class=" font-medium text-lg line-clamp-1 dark:text-gray-100">
-					{file?.name ?? 'File'}
+				<div class=" font-medium text-lg dark:text-gray-100">
+					<a
+						href={file.url ? (file.type === 'file' ? `${file.url}/content` : `${file.url}`) : '#'}
+						target="_blank"
+						class="hover:underline line-clamp-1"
+					>
+						{file?.name ?? 'File'}
+					</a>
 				</div>
 
 				<div>