浏览代码

Fixed the issue where a single URL error disrupts the data loading process in Web Search mode

To address the unresolved issue in the LangChain library where a single URL error disrupts the data loading process, the lazy_load method in the WebBaseLoader class has been modified. The enhanced method now handles exceptions appropriately, logging errors and continuing with the remaining URLs.
Que Nguyen 10 月之前
父节点
当前提交
3bec60b80c
共有 1 个文件被更改,包括 26 次插入,3 次删除
  1. 26 3
      backend/apps/rag/main.py

+ 26 - 3
backend/apps/rag/main.py

@@ -12,9 +12,10 @@ import os, shutil, logging, re
 from datetime import datetime
 from datetime import datetime
 
 
 from pathlib import Path
 from pathlib import Path
-from typing import List, Union, Sequence
+from typing import List, Union, Sequence, Iterator, Any
 
 
 from chromadb.utils.batch_utils import create_batches
 from chromadb.utils.batch_utils import create_batches
+from langchain_core.documents import Document
 
 
 from langchain_community.document_loaders import (
 from langchain_community.document_loaders import (
     WebBaseLoader,
     WebBaseLoader,
@@ -701,7 +702,7 @@ def get_web_loader(url: Union[str, Sequence[str]], verify_ssl: bool = True):
     # Check if the URL is valid
     # Check if the URL is valid
     if not validate_url(url):
     if not validate_url(url):
         raise ValueError(ERROR_MESSAGES.INVALID_URL)
         raise ValueError(ERROR_MESSAGES.INVALID_URL)
-    return WebBaseLoader(
+    return SafeWebBaseLoader(
         url,
         url,
         verify_ssl=verify_ssl,
         verify_ssl=verify_ssl,
         requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
         requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
@@ -1237,7 +1238,29 @@ def reset(user=Depends(get_admin_user)) -> bool:
 
 
     return True
     return True
 
 
-
+class SafeWebBaseLoader(WebBaseLoader):
+    """WebBaseLoader subclass that tolerates per-URL failures.
+
+    The stock ``WebBaseLoader.lazy_load`` lets any scrape exception
+    propagate, which aborts loading of every remaining URL; this
+    variant logs the error and continues with the next URL instead.
+    """
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazily yield one Document per successfully scraped URL.
+
+        URLs whose scrape raises are logged via ``log.error`` and
+        skipped — no Document is yielded for them and iteration
+        continues with the remaining ``web_paths``.
+        """
+        for path in self.web_paths:
+            try:
+                # _scrape may raise (network error, bad HTTP status,
+                # parse failure); the except below keeps the loop alive.
+                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
+                text = soup.get_text(**self.bs_get_text_kwargs)
+
+                # Build metadata from the parsed page; each field is
+                # optional and only set when the corresponding tag exists.
+                metadata = {"source": path}
+                if title := soup.find("title"):
+                    metadata["title"] = title.get_text()
+                if description := soup.find("meta", attrs={"name": "description"}):
+                    metadata["description"] = description.get("content", "No description found.")
+                if html := soup.find("html"):
+                    metadata["language"] = html.get("lang", "No language found.")
+                
+                yield Document(page_content=text, metadata=metadata)
+            except Exception as e:
+                # Log the error and continue with the next URL
+                # (intentionally broad: any per-URL failure is best-effort).
+                log.error(f"Error loading {path}: {e}")
+                
 if ENV == "dev":
 if ENV == "dev":
 
 
     @app.get("/ef")
     @app.get("/ef")