# utils.py — RSS topic menu, article fetching, Ollama summarization, and KNN search helpers.
  1. import curses
  2. import feedparser
  3. import requests
  4. import unicodedata
  5. import json
  6. from newspaper import Article
  7. from bs4 import BeautifulSoup
  8. from nltk.tokenize import sent_tokenize, word_tokenize
  9. import numpy as np
  10. from sklearn.neighbors import NearestNeighbors
  11. from mattsollamatools import chunker
# Map each menu topic (shown to the user) to the RSS/Atom feed URL that
# serves its articles. Keys are the labels rendered by the curses menu.
topic_urls = {
    "Mac": "https://9to5mac.com/guides/mac/feed",
    "News": "http://www.npr.org/rss/rss.php?id=1001",
    "Nvidia": "https://nvidianews.nvidia.com/releases.xml",
    "Raspberry Pi": "https://www.raspberrypi.com/news/feed/",
    "Music": "https://www.billboard.com/c/music/music-news/feed/"
}
  20. # Use curses to create a menu of topics
  21. def menu(stdscr):
  22. chosen_topic = get_url_for_topic(stdscr)
  23. url = topic_urls[chosen_topic] if chosen_topic in topic_urls else "Topic not found"
  24. stdscr.addstr(len(topic_urls) + 3, 0, f"Selected URL for {chosen_topic}: {url}")
  25. stdscr.refresh()
  26. return chosen_topic
  27. # You have chosen a topic. Now return the url for that topic
  28. def get_url_for_topic(stdscr):
  29. curses.curs_set(0) # Hide the cursor
  30. stdscr.clear()
  31. stdscr.addstr(0, 0, "Choose a topic using the arrow keys (Press Enter to select):")
  32. # Create a list of topics
  33. topics = list(topic_urls.keys())
  34. current_topic = 0
  35. while True:
  36. for i, topic in enumerate(topics):
  37. if i == current_topic:
  38. stdscr.addstr(i + 2, 2, f"> {topic}")
  39. else:
  40. stdscr.addstr(i + 2, 2, f" {topic}")
  41. stdscr.refresh()
  42. key = stdscr.getch()
  43. if key == curses.KEY_DOWN and current_topic < len(topics) - 1:
  44. current_topic += 1
  45. elif key == curses.KEY_UP and current_topic > 0:
  46. current_topic -= 1
  47. elif key == 10: # Enter key
  48. return topic_urls[topics[current_topic]]
  49. # Get the last N URLs from an RSS feed
  50. def getUrls(feed_url, n=20):
  51. feed = feedparser.parse(feed_url)
  52. entries = feed.entries[-n:]
  53. urls = [entry.link for entry in entries]
  54. return urls
  55. # Often there are a bunch of ads and menus on pages for a news article. This uses newspaper3k to get just the text of just the article.
  56. def getArticleText(url):
  57. article = Article(url)
  58. article.download()
  59. article.parse()
  60. return article.text
  61. def get_summary(text):
  62. systemPrompt = "Write a concise summary of the text, return your responses with 5 lines that cover the key points of the text given."
  63. prompt = text
  64. url = "http://localhost:11434/api/generate"
  65. payload = {
  66. "model": "mistral-openorca",
  67. "prompt": prompt,
  68. "system": systemPrompt,
  69. "stream": False
  70. }
  71. payload_json = json.dumps(payload)
  72. headers = {"Content-Type": "application/json"}
  73. response = requests.post(url, data=payload_json, headers=headers)
  74. return json.loads(response.text)["response"]
  75. # Perform K-nearest neighbors (KNN) search
  76. def knn_search(question_embedding, embeddings, k=5):
  77. X = np.array([item['embedding'] for article in embeddings for item in article['embeddings']])
  78. source_texts = [item['source'] for article in embeddings for item in article['embeddings']]
  79. # Fit a KNN model on the embeddings
  80. knn = NearestNeighbors(n_neighbors=k, metric='cosine')
  81. knn.fit(X)
  82. # Find the indices and distances of the k-nearest neighbors
  83. distances, indices = knn.kneighbors(question_embedding, n_neighbors=k)
  84. # Get the indices and source texts of the best matches
  85. best_matches = [(indices[0][i], source_texts[indices[0][i]]) for i in range(k)]
  86. return best_matches