Bladeren bron

Add error handling to load_single_document() in ingest.py (#4852)

load_single_document() now prints a message and skips the file, instead of raising, for:
- corrupt files
- empty (zero byte) files
- unsupported file extensions
dcasota 10 maanden geleden
bovenliggende
commit
5528dd9d11
1 gewijzigde bestanden met toevoegingen van 16 en 7 verwijderingen
  1. 16 7
      examples/langchain-python-rag-privategpt/ingest.py

+ 16 - 7
examples/langchain-python-rag-privategpt/ingest.py

@@ -77,13 +77,21 @@ LOADER_MAPPING = {
 
 
 def load_single_document(file_path: str) -> List[Document]:
-    ext = "." + file_path.rsplit(".", 1)[-1]
-    if ext in LOADER_MAPPING:
-        loader_class, loader_args = LOADER_MAPPING[ext]
-        loader = loader_class(file_path, **loader_args)
-        return loader.load()
+    if os.path.getsize(file_path) != 0:
+        filename, ext = os.path.splitext(file_path)
+        if ext in LOADER_MAPPING:
+            loader_class, loader_args = LOADER_MAPPING[ext]
+            try:
+                loader = loader_class(file_path, **loader_args)
+                if loader:
+                    return loader.load()
+            except:
+                print(f"Corrupted file {file_path}. Ignoring it.")
+        else:
+            print(f"Unsupported file {file_path}. Ignoring it.")
+    else:
+        print(f"Empty file {file_path}. Ignoring it.")
 
-    raise ValueError(f"Unsupported file extension '{ext}'")
 
 def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
     """
@@ -100,7 +108,8 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
         results = []
         with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
             for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
-                results.extend(docs)
+                if docs:
+                    results.extend(docs)
                 pbar.update()
 
     return results