소스 검색

Allow any file to be used for RAG.

Changed RAG parser to prefer file extensions over MIME content types. If the type of file is not recognized, assume it's a text file.
Marclass 1 년 전
부모
커밋
aa1d386042
3개의 변경된 파일, 27개의 추가 그리고 43개의 삭제
  1. 18 38
      backend/apps/rag/main.py
  2. 5 3
      src/lib/components/chat/MessageInput.svelte
  3. 4 2
      src/routes/(app)/documents/+page.svelte

+ 18 - 38
backend/apps/rag/main.py

@@ -144,37 +144,21 @@ def store_doc(
     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
 
     print(file.content_type)
-    if file.content_type not in [
-        "application/pdf",
-        "text/plain",
-        "text/csv",
-        "text/xml",
-        "text/x-python",
-        "text/css",
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/octet-stream",
-        "application/x-javascript",
-    ]:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
-        )
-    text_xml=["text/xml"]
+    
+    text_xml=["xml"]
     octet_markdown=["md"]
-    octet_plain=[
+    known_source_ext=[
         "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", 
         "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
         "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs",
         "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
         "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
         ]
+    docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    known_doc_ext=["doc","docx"]
     file_ext=file.filename.split(".")[-1].lower()
-    if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain):
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
-        )
-
+    known_type=True
+    
     try:
         filename = file.filename
         file_path = f"{UPLOAD_DIR}/{filename}"
@@ -188,27 +172,22 @@ def store_doc(
             collection_name = calculate_sha256(f)[:63]
         f.close()
 
-        if file.content_type == "application/pdf":
+        if file_ext=="pdf":
             loader = PyPDFLoader(file_path)
-        elif (
-            file.content_type
-            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ):
+        elif (file.content_type ==docx_type or file_ext in known_doc_ext):
             loader = Docx2txtLoader(file_path)
-        
-        elif file.content_type == "text/csv":
+        elif file_ext=="csv":
             loader = CSVLoader(file_path)
-        elif file.content_type in text_xml:
+        elif file_ext in text_xml:
             loader=UnstructuredXMLLoader(file_path)
-        elif file.content_type == "text/plain" or file.content_type.find("text/")>=0:
+        elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
             loader = TextLoader(file_path)
-        elif file.content_type == "application/octet-stream":
-            if file_ext in octet_markdown:
-                loader = UnstructuredMarkdownLoader(file_path)
-            if file_ext in octet_plain:
-                loader = TextLoader(file_path)
-        elif file.content_type == "application/x-javascript":
+        elif file_ext in octet_markdown:
+            loader = UnstructuredMarkdownLoader(file_path)
+        else:
             loader = TextLoader(file_path)
+            known_type=False
+
 
         data = loader.load()
         result = store_data_in_vector_db(data, collection_name)
@@ -218,6 +197,7 @@ def store_doc(
                 "status": True,
                 "collection_name": collection_name,
                 "filename": filename,
+                "known_type":known_type,
             }
         else:
             raise HTTPException(

+ 5 - 3
src/lib/components/chat/MessageInput.svelte

@@ -173,7 +173,8 @@
 					) {
 						uploadDoc(file);
 					} else {
-						toast.error(`Unsupported File Type '${file['type']}'.`);
+						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+						uploadDoc(file);
 					}
 				} else {
 					toast.error(`File not found.`);
@@ -308,8 +309,9 @@
 								uploadDoc(file);
 								filesInputElement.value = '';
 							} else {
-								toast.error(`Unsupported File Type '${file['type']}'.`);
-								inputFiles = null;
+								toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+								uploadDoc(file);
+								filesInputElement.value = '';
 							}
 						} else {
 							toast.error(`File not found.`);

+ 4 - 2
src/routes/(app)/documents/+page.svelte

@@ -73,7 +73,8 @@
 				) {
 					uploadDoc(file);
 				} else {
-					toast.error(`Unsupported File Type '${file['type']}'.`);
+					toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+					uploadDoc(file);
 				}
 			} else {
 				toast.error(`File not found.`);
@@ -153,7 +154,8 @@
 						) {
 							uploadDoc(file);
 						} else {
-							toast.error(`Unsupported File Type '${file['type']}'.`);
+							toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+							uploadDoc(file);
 						}
 
 						inputFiles = null;