Ver código fonte

Merge pull request #2923 from mindspawn/outlook-msg

Support Outlook Message File Format
Timothy Jaeryang Baek 10 meses atrás
pai
commit
dbde628141
3 arquivos alterados com 17 adições e 2 exclusões
  1. 12 0
      backend/apps/rag/main.py
  2. 3 1
      backend/requirements.txt
  3. 2 1
      src/lib/constants.ts

+ 12 - 0
backend/apps/rag/main.py

@@ -9,6 +9,7 @@ from fastapi import (
 )
 from fastapi.middleware.cors import CORSMiddleware
 import os, shutil, logging, re
+from datetime import datetime
 
 from pathlib import Path
 from typing import List, Union, Sequence
@@ -30,6 +31,7 @@ from langchain_community.document_loaders import (
     UnstructuredExcelLoader,
     UnstructuredPowerPointLoader,
     YoutubeLoader,
+    OutlookMessageLoader,
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -879,6 +881,13 @@ def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> b
     texts = [doc.page_content for doc in docs]
     metadatas = [doc.metadata for doc in docs]
 
+    # ChromaDB rejects datetime objects as metadata values,
+    # so convert them to strings before storing.
+    for metadata in metadatas:
+        for key, value in metadata.items():
+            if isinstance(value, datetime):
+                metadata[key] = str(value)
+
     try:
         if overwrite:
             for collection in CHROMA_CLIENT.list_collections():
@@ -965,6 +974,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
         "swift",
         "vue",
         "svelte",
+        "msg",
     ]
 
     if file_ext == "pdf":
@@ -999,6 +1009,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     ] or file_ext in ["ppt", "pptx"]:
         loader = UnstructuredPowerPointLoader(file_path)
+    elif file_ext == "msg":
+        loader = OutlookMessageLoader(file_path)
     elif file_ext in known_source_ext or (
         file_content_type and file_content_type.find("text/") >= 0
     ):

+ 3 - 1
backend/requirements.txt

@@ -56,4 +56,6 @@ PyJWT[crypto]==2.8.0
 black==24.4.2
 langfuse==2.33.0
 youtube-transcript-api==0.6.2
-pytube==15.0.0
+pytube==15.0.0
+
+extract_msg

+ 2 - 1
src/lib/constants.ts

@@ -89,7 +89,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
 	'xls',
 	'xlsx',
 	'pptx',
-	'ppt'
+	'ppt',
+	'msg'
 ];
 
 // Source: https://kit.svelte.dev/docs/modules#$env-static-public