Browse Source

Merge pull request #9837 from silverriver/patch-1

feat: Make Google PSE search return more than 10 Google search results
Timothy Jaeryang Baek 2 months ago
parent
commit
a5bba20915
1 changed file with 32 additions and 16 deletions
  1. 32 16
      backend/open_webui/retrieval/web/google_pse.py

+ 32 - 16
backend/open_webui/retrieval/web/google_pse.py

@@ -8,7 +8,6 @@ from open_webui.env import SRC_LOG_LEVELS
 log = logging.getLogger(__name__)
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
 log.setLevel(SRC_LOG_LEVELS["RAG"])
 
 
-
 def search_google_pse(
 def search_google_pse(
     api_key: str,
     api_key: str,
     search_engine_id: str,
     search_engine_id: str,
@@ -17,34 +16,51 @@ def search_google_pse(
     filter_list: Optional[list[str]] = None,
     filter_list: Optional[list[str]] = None,
 ) -> list[SearchResult]:
 ) -> list[SearchResult]:
     """Search using Google's Programmable Search Engine API and return the results as a list of SearchResult objects.
     """Search using Google's Programmable Search Engine API and return the results as a list of SearchResult objects.
+    Handles pagination for counts greater than 10.
 
 
     Args:
     Args:
         api_key (str): A Programmable Search Engine API key
         api_key (str): A Programmable Search Engine API key
         search_engine_id (str): A Programmable Search Engine ID
         search_engine_id (str): A Programmable Search Engine ID
         query (str): The query to search for
         query (str): The query to search for
+        count (int): The number of results to return (max 100, as PSE max results per query is 10 and max page is 10)
+        filter_list (Optional[list[str]], optional): A list of keywords to filter out from results. Defaults to None.
+
+    Returns:
+        list[SearchResult]: A list of SearchResult objects.
     """
     """
     url = "https://www.googleapis.com/customsearch/v1"
     url = "https://www.googleapis.com/customsearch/v1"
-
     headers = {"Content-Type": "application/json"}
     headers = {"Content-Type": "application/json"}
-    params = {
-        "cx": search_engine_id,
-        "q": query,
-        "key": api_key,
-        "num": count,
-    }
-
-    response = requests.request("GET", url, headers=headers, params=params)
-    response.raise_for_status()
-
-    json_response = response.json()
-    results = json_response.get("items", [])
+    all_results = []
+    start_index = 1  # Google PSE start parameter is 1-based
+
+    while count > 0:
+        num_results_this_page = min(count, 10)  # Google PSE max results per page is 10
+        params = {
+            "cx": search_engine_id,
+            "q": query,
+            "key": api_key,
+            "num": num_results_this_page,
+            "start": start_index,
+        }
+        response = requests.request("GET", url, headers=headers, params=params)
+        response.raise_for_status()
+        json_response = response.json()
+        results = json_response.get("items", [])
+        if results: # check if results are returned. If not, no more pages to fetch.
+            all_results.extend(results)
+            count -= len(results) # Decrement count by the number of results fetched in this page.
+            start_index += 10 # Increment start index for the next page
+        else:
+            break # No more results from Google PSE, break the loop
+
     if filter_list:
     if filter_list:
-        results = get_filtered_results(results, filter_list)
+        all_results = get_filtered_results(all_results, filter_list)
+
     return [
     return [
         SearchResult(
         SearchResult(
             link=result["link"],
             link=result["link"],
             title=result.get("title"),
             title=result.get("title"),
             snippet=result.get("snippet"),
             snippet=result.get("snippet"),
         )
         )
-        for result in results
+        for result in all_results
     ]
     ]