Переглянути джерело

Add Excel document support

Marclass 1 рік тому
батько
коміт
8bfda730d9
2 змінені файли: 12 додано та 1 видалено
  1. +6 -0
      backend/apps/rag/main.py
  2. +6 -1
      backend/requirements.txt

+ 6 - 0
backend/apps/rag/main.py

@@ -23,6 +23,7 @@ from langchain_community.document_loaders import (
     UnstructuredMarkdownLoader,
     UnstructuredXMLLoader,
     UnstructuredRSTLoader,
+    UnstructuredExcelLoader,
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
@@ -157,6 +158,9 @@ def store_doc(
         ]
     docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
     known_doc_ext=["doc","docx"]
+    excel_types=["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
+    known_excel_ext=["xls", "xlsx"]
+
     file_ext=file.filename.split(".")[-1].lower()
     known_type=True
     
@@ -179,6 +183,8 @@ def store_doc(
             loader = Docx2txtLoader(file_path)
         elif file_ext=="csv":
             loader = CSVLoader(file_path)
+        elif (file.content_type in excel_types or file_ext in known_excel_ext):
+            loader = UnstructuredExcelLoader(file_path)
         elif file_ext=="rst":
             loader = UnstructuredRSTLoader(file_path, mode="elements")
         elif file_ext in text_xml:

+ 6 - 1
backend/requirements.txt

@@ -28,4 +28,9 @@ markdown
 PyJWT
 pyjwt[crypto]
 
-black
+black
+
+pandas
+openpyxl
+pyxlsb
+xlrd