
Merge pull request #9314 from roryeckel/validate-rag-urls

Validate URLs returned by search engine
Timothy Jaeryang Baek · 2 months ago
commit 1b8dc673e7

+ 4 - 0
backend/open_webui/retrieval/web/main.py

@@ -1,3 +1,5 @@
+import validators
+
 from typing import Optional
 from urllib.parse import urlparse
 
@@ -10,6 +12,8 @@ def get_filtered_results(results, filter_list):
     filtered_results = []
     for result in results:
         url = result.get("url") or result.get("link", "")
+        if not validators.url(url):
+            continue
         domain = urlparse(url).netloc
         if any(domain.endswith(filtered_domain) for filtered_domain in filter_list):
             filtered_results.append(result)

+ 13 - 4
backend/open_webui/retrieval/web/utils.py

@@ -42,6 +42,15 @@ def validate_url(url: Union[str, Sequence[str]]):
     else:
         return False
 
+def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
+    valid_urls = []
+    for u in url:
+        try:
+            if validate_url(u):
+                valid_urls.append(u)
+        except ValueError:
+            continue
+    return valid_urls
 
 def resolve_hostname(hostname):
     # Get address information
@@ -86,11 +95,11 @@ def get_web_loader(
     verify_ssl: bool = True,
     requests_per_second: int = 2,
 ):
-    # Check if the URL is valid
-    if not validate_url(urls):
-        raise ValueError(ERROR_MESSAGES.INVALID_URL)
+    # Check if the URLs are valid
+    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
+
     return SafeWebBaseLoader(
-        urls,
+        safe_urls,
         verify_ssl=verify_ssl,
         requests_per_second=requests_per_second,
         continue_on_failure=True,
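
The net behavior change: before this commit, a single invalid URL made `get_web_loader` raise `ValueError(ERROR_MESSAGES.INVALID_URL)` and abort the whole load; now `safe_validate_urls` drops invalid entries and `SafeWebBaseLoader` receives only the vetted list. The `try/except ValueError` exists because, as the except clause implies, `validate_url` can raise rather than return `False` on some inputs. A self-contained sketch of that wrapping pattern, with a stand-in `validate_url` (the real helper in `utils.py` does more, e.g. hostname resolution; this simplified version only assumes it raises `ValueError` on unusable input):

from typing import Sequence
from urllib.parse import urlparse

def validate_url(url: str) -> bool:
    # Stand-in for the real helper: assume it raises ValueError on
    # unusable input instead of returning False in every failure mode.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.hostname:
        raise ValueError("invalid URL")
    return True

def safe_validate_urls(urls: Sequence[str]) -> list[str]:
    valid_urls = []
    for u in urls:
        try:
            if validate_url(u):
                valid_urls.append(u)
        except ValueError:
            continue  # skip this URL instead of failing the whole batch
    return valid_urls

print(safe_validate_urls(["https://example.com/a", "not-a-url", "ftp://host/file"]))
# ['https://example.com/a'] -- bad entries are skipped, not fatal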