Browse Source

feat: Add epub support

Dave Bauman 1 year ago
parent
commit
f559068186

+ 5 - 0
Dockerfile

@@ -28,6 +28,11 @@ ENV WEBUI_JWT_SECRET_KEY "SECRET_KEY"
 
 WORKDIR /app
 
+# Install the pandoc system binary (used through pypandoc when loading EPUB documents)
+RUN apt-get update \
+    && apt-get install -y pandoc \
+    && rm -rf /var/lib/apt/lists/*
+
 # copy embedding weight from build
 RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2
 COPY --from=build /app/onnx.tar.gz /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2

+ 13 - 4
backend/apps/rag/main.py

@@ -19,6 +19,7 @@ from langchain_community.document_loaders import (
     PyPDFLoader,
     CSVLoader,
     Docx2txtLoader,
+    UnstructuredEPubLoader,
     UnstructuredWordDocumentLoader,
     UnstructuredMarkdownLoader,
     UnstructuredXMLLoader,
@@ -184,6 +185,8 @@ def store_doc(
             loader = TextLoader(file_path)
         elif file_ext in octet_markdown:
             loader = UnstructuredMarkdownLoader(file_path)
+        elif file.content_type == "application/epub+zip":
+            loader = UnstructuredEPubLoader(file_path)
         else:
             loader = TextLoader(file_path)
             known_type=False
@@ -206,10 +209,16 @@ def store_doc(
             )
     except Exception as e:
         print(e)
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.DEFAULT(e),
-        )
+        if "No pandoc was found" in str(e):
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
+            )
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=ERROR_MESSAGES.DEFAULT(e),
+            )
 
 
 @app.get("/reset/db")

+ 2 - 0
backend/constants.py

@@ -42,3 +42,5 @@ class ERROR_MESSAGES(str, Enum):
     USER_NOT_FOUND = "We could not find what you're looking for :/"
     API_KEY_NOT_FOUND = "Oops! It looks like there's a hiccup. The API key is missing. Please make sure to provide a valid API key to access this feature."
     MALICIOUS = "Unusual activities detected, please try again in a few minutes."
+
+    PANDOC_NOT_INSTALLED = "Pandoc is not installed on the server. Please contact your administrator for assistance."

+ 2 - 1
backend/requirements.txt

@@ -24,8 +24,9 @@ pypdf
 docx2txt
 unstructured
 markdown
+pypandoc
 
 PyJWT
 pyjwt[crypto]
 
-black
+black

+ 13 - 7
src/lib/components/chat/MessageInput.svelte

@@ -121,13 +121,19 @@
 			error: ''
 		};
 
-		files = [...files, doc];
-		const res = await uploadDocToVectorDB(localStorage.token, '', file);
-
-		if (res) {
-			doc.upload_status = true;
-			doc.collection_name = res.collection_name;
-			files = files;
+		try {
+			files = [...files, doc];
+			const res = await uploadDocToVectorDB(localStorage.token, '', file);
+
+			if (res) {
+				doc.upload_status = true;
+				doc.collection_name = res.collection_name;
+				files = files;
+			}
+		} catch (e) {
+			// Upload failed: drop every entry whose name matches this file from the files array
+			files = files.filter((f) => f.name !== file.name);
+			toast.error(e);
 		}
 	};
 

+ 1 - 0
src/lib/constants.ts

@@ -12,6 +12,7 @@ export const WEB_UI_VERSION = 'v1.0.0-alpha-static';
 export const REQUIRED_OLLAMA_VERSION = '0.1.16';
 
 export const SUPPORTED_FILE_TYPE = [
+	'application/epub+zip',
 	'application/pdf',
 	'text/plain',
 	'text/csv',