Timothy J. Baek 11 bulan lalu
induk
komit
322db31dc9
3 file diubah dengan 10 penambahan dan 1 penghapusan
  1. 6 0
      backend/apps/rag/main.py
  2. 1 0
      backend/requirements.txt
  3. 3 1
      src/lib/constants.ts

+ 6 - 0
backend/apps/rag/main.py

@@ -28,6 +28,7 @@ from langchain_community.document_loaders import (
     UnstructuredXMLLoader,
     UnstructuredXMLLoader,
     UnstructuredRSTLoader,
     UnstructuredRSTLoader,
     UnstructuredExcelLoader,
     UnstructuredExcelLoader,
+    UnstructuredPowerPointLoader,
     YoutubeLoader,
     YoutubeLoader,
 )
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -768,6 +769,11 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
     ] or file_ext in ["xls", "xlsx"]:
     ] or file_ext in ["xls", "xlsx"]:
         loader = UnstructuredExcelLoader(file_path)
         loader = UnstructuredExcelLoader(file_path)
+    elif file_content_type in [
+        "application/vnd.ms-powerpoint",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    ] or file_ext in ["ppt", "pptx"]:
+        loader = UnstructuredPowerPointLoader(file_path)
     elif file_ext in known_source_ext or (
     elif file_ext in known_source_ext or (
         file_content_type and file_content_type.find("text/") >= 0
         file_content_type and file_content_type.find("text/") >= 0
     ):
     ):

+ 1 - 0
backend/requirements.txt

@@ -35,6 +35,7 @@ chromadb==0.4.24
 sentence-transformers==2.7.0
 sentence-transformers==2.7.0
 pypdf==4.2.0
 pypdf==4.2.0
 docx2txt==0.8
 docx2txt==0.8
+python-pptx==0.6.23
 unstructured==0.11.8
 unstructured==0.11.8
 Markdown==3.6
 Markdown==3.6
 pypandoc==1.13
 pypandoc==1.13

+ 3 - 1
src/lib/constants.ts

@@ -86,7 +86,9 @@ export const SUPPORTED_FILE_EXTENSIONS = [
 	'csv',
 	'csv',
 	'txt',
 	'txt',
 	'xls',
 	'xls',
-	'xlsx'
+	'xlsx',
+	'pptx',
+	'ppt'
 ];
 ];
 
 
 // Source: https://kit.svelte.dev/docs/modules#$env-static-public
 // Source: https://kit.svelte.dev/docs/modules#$env-static-public