google_pse.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. import logging
  2. from typing import Optional
  3. import requests
  4. from open_webui.retrieval.web.main import SearchResult, get_filtered_results
  5. from open_webui.env import SRC_LOG_LEVELS
# Module-level logger named after this module; its level is taken from the
# "RAG" entry of SRC_LOG_LEVELS (imported from open_webui.env).
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
  8. def search_google_pse(
  9. api_key: str,
  10. search_engine_id: str,
  11. query: str,
  12. count: int,
  13. filter_list: Optional[list[str]] = None,
  14. ) -> list[SearchResult]:
  15. """Search using Google's Programmable Search Engine API and return the results as a list of SearchResult objects.
  16. Handles pagination for counts greater than 10.
  17. Args:
  18. api_key (str): A Programmable Search Engine API key
  19. search_engine_id (str): A Programmable Search Engine ID
  20. query (str): The query to search for
  21. count (int): The number of results to return (max 100, as PSE max results per query is 10 and max page is 10)
  22. filter_list (Optional[list[str]], optional): A list of keywords to filter out from results. Defaults to None.
  23. Returns:
  24. list[SearchResult]: A list of SearchResult objects.
  25. """
  26. url = "https://www.googleapis.com/customsearch/v1"
  27. headers = {"Content-Type": "application/json"}
  28. all_results = []
  29. start_index = 1 # Google PSE start parameter is 1-based
  30. while count > 0:
  31. num_results_this_page = min(count, 10) # Google PSE max results per page is 10
  32. params = {
  33. "cx": search_engine_id,
  34. "q": query,
  35. "key": api_key,
  36. "num": num_results_this_page,
  37. "start": start_index,
  38. }
  39. response = requests.request("GET", url, headers=headers, params=params)
  40. response.raise_for_status()
  41. json_response = response.json()
  42. results = json_response.get("items", [])
  43. if results: # check if results are returned. If not, no more pages to fetch.
  44. all_results.extend(results)
  45. count -= len(
  46. results
  47. ) # Decrement count by the number of results fetched in this page.
  48. start_index += 10 # Increment start index for the next page
  49. else:
  50. break # No more results from Google PSE, break the loop
  51. if filter_list:
  52. all_results = get_filtered_results(all_results, filter_list)
  53. return [
  54. SearchResult(
  55. link=result["link"],
  56. title=result.get("title"),
  57. snippet=result.get("snippet"),
  58. )
  59. for result in all_results
  60. ]