Browse Source

Allow any file to be used for RAG.

Changed RAG parser to prefer file extensions over MIME content types. If the type of file is not recognized assume it's a text file.
Marclass 1 year ago
parent
commit
aa1d386042

+ 18 - 38
backend/apps/rag/main.py

@@ -144,37 +144,21 @@ def store_doc(
     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
 
 
     print(file.content_type)
     print(file.content_type)
-    if file.content_type not in [
-        "application/pdf",
-        "text/plain",
-        "text/csv",
-        "text/xml",
-        "text/x-python",
-        "text/css",
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/octet-stream",
-        "application/x-javascript",
-    ]:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
-        )
-    text_xml=["text/xml"]
+    
+    text_xml=["xml"]
     octet_markdown=["md"]
     octet_markdown=["md"]
-    octet_plain=[
+    known_source_ext=[
         "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", 
         "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", 
         "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
         "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
         "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs",
         "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs",
         "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
         "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
         "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
         "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
         ]
         ]
+    docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    known_doc_ext=["doc","docx"]
     file_ext=file.filename.split(".")[-1].lower()
     file_ext=file.filename.split(".")[-1].lower()
-    if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain):
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
-        )
-
+    known_type=True
+    
     try:
     try:
         filename = file.filename
         filename = file.filename
         file_path = f"{UPLOAD_DIR}/{filename}"
         file_path = f"{UPLOAD_DIR}/{filename}"
@@ -188,27 +172,22 @@ def store_doc(
             collection_name = calculate_sha256(f)[:63]
             collection_name = calculate_sha256(f)[:63]
         f.close()
         f.close()
 
 
-        if file.content_type == "application/pdf":
+        if file_ext=="pdf":
             loader = PyPDFLoader(file_path)
             loader = PyPDFLoader(file_path)
-        elif (
-            file.content_type
-            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ):
+        elif (file.content_type ==docx_type or file_ext in known_doc_ext):
             loader = Docx2txtLoader(file_path)
             loader = Docx2txtLoader(file_path)
-        
-        elif file.content_type == "text/csv":
+        elif file_ext=="csv":
             loader = CSVLoader(file_path)
             loader = CSVLoader(file_path)
-        elif file.content_type in text_xml:
+        elif file_ext in text_xml:
             loader=UnstructuredXMLLoader(file_path)
             loader=UnstructuredXMLLoader(file_path)
-        elif file.content_type == "text/plain" or file.content_type.find("text/")>=0:
+        elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
             loader = TextLoader(file_path)
             loader = TextLoader(file_path)
-        elif file.content_type == "application/octet-stream":
-            if file_ext in octet_markdown:
-                loader = UnstructuredMarkdownLoader(file_path)
-            if file_ext in octet_plain:
-                loader = TextLoader(file_path)
-        elif file.content_type == "application/x-javascript":
+        elif file_ext in octet_markdown:
+            loader = UnstructuredMarkdownLoader(file_path)
+        else:
             loader = TextLoader(file_path)
             loader = TextLoader(file_path)
+            known_type=False
+
 
 
         data = loader.load()
         data = loader.load()
         result = store_data_in_vector_db(data, collection_name)
         result = store_data_in_vector_db(data, collection_name)
@@ -218,6 +197,7 @@ def store_doc(
                 "status": True,
                 "status": True,
                 "collection_name": collection_name,
                 "collection_name": collection_name,
                 "filename": filename,
                 "filename": filename,
+                "known_type":known_type,
             }
             }
         else:
         else:
             raise HTTPException(
             raise HTTPException(

+ 5 - 3
src/lib/components/chat/MessageInput.svelte

@@ -173,7 +173,8 @@
 					) {
 					) {
 						uploadDoc(file);
 						uploadDoc(file);
 					} else {
 					} else {
-						toast.error(`Unsupported File Type '${file['type']}'.`);
+						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+						uploadDoc(file);
 					}
 					}
 				} else {
 				} else {
 					toast.error(`File not found.`);
 					toast.error(`File not found.`);
@@ -308,8 +309,9 @@
 								uploadDoc(file);
 								uploadDoc(file);
 								filesInputElement.value = '';
 								filesInputElement.value = '';
 							} else {
 							} else {
-								toast.error(`Unsupported File Type '${file['type']}'.`);
-								inputFiles = null;
+								toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+								uploadDoc(file);
+								filesInputElement.value = '';
 							}
 							}
 						} else {
 						} else {
 							toast.error(`File not found.`);
 							toast.error(`File not found.`);

+ 4 - 2
src/routes/(app)/documents/+page.svelte

@@ -73,7 +73,8 @@
 				) {
 				) {
 					uploadDoc(file);
 					uploadDoc(file);
 				} else {
 				} else {
-					toast.error(`Unsupported File Type '${file['type']}'.`);
+					toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+					uploadDoc(file);
 				}
 				}
 			} else {
 			} else {
 				toast.error(`File not found.`);
 				toast.error(`File not found.`);
@@ -153,7 +154,8 @@
 						) {
 						) {
 							uploadDoc(file);
 							uploadDoc(file);
 						} else {
 						} else {
-							toast.error(`Unsupported File Type '${file['type']}'.`);
+							toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+							uploadDoc(file);
 						}
 						}
 
 
 						inputFiles = null;
 						inputFiles = null;