import curses
import json

import feedparser
import numpy as np
import requests
import unicodedata
from bs4 import BeautifulSoup
from mattsollamatools import chunker
from newspaper import Article
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.neighbors import NearestNeighbors
# Create a dictionary to store topics and their RSS feed URLs
topic_urls = {
    "Mac": "https://9to5mac.com/guides/mac/feed",
    "News": "http://www.npr.org/rss/rss.php?id=1001",
    "Nvidia": "https://nvidianews.nvidia.com/releases.xml",
    "Raspberry Pi": "https://www.raspberrypi.com/news/feed/",
    "Music": "https://www.billboard.com/c/music/music-news/feed/",
}
# Use curses to create a menu of topics
def menu(stdscr):
    # get_url_for_topic already returns the feed URL for the selected topic,
    # so there is no need to look it up in topic_urls again
    url = get_url_for_topic(stdscr)

    stdscr.addstr(len(topic_urls) + 3, 0, f"Selected URL: {url}")
    stdscr.refresh()

    return url
# Draw the list of topics and return the URL for the one the user selects
def get_url_for_topic(stdscr):
    curses.curs_set(0)  # Hide the cursor
    stdscr.clear()

    stdscr.addstr(0, 0, "Choose a topic using the arrow keys (Press Enter to select):")

    # Create a list of topics
    topics = list(topic_urls.keys())
    current_topic = 0

    while True:
        # Redraw the list, marking the highlighted topic with ">"
        for i, topic in enumerate(topics):
            if i == current_topic:
                stdscr.addstr(i + 2, 2, f"> {topic}")
            else:
                stdscr.addstr(i + 2, 2, f"  {topic}")
        stdscr.refresh()

        key = stdscr.getch()
        if key == curses.KEY_DOWN and current_topic < len(topics) - 1:
            current_topic += 1
        elif key == curses.KEY_UP and current_topic > 0:
            current_topic -= 1
        elif key == 10:  # Enter key
            return topic_urls[topics[current_topic]]
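# A minimal usage sketch (the script's entry point is not shown here, so this
# call site is an assumption): curses code should run under curses.wrapper so
# the terminal state is restored even if the menu raises.
#
#     feed_url = curses.wrapper(menu)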
# Get the last N URLs from an RSS feed
def getUrls(feed_url, n=20):
    feed = feedparser.parse(feed_url)
    entries = feed.entries[-n:]
    urls = [entry.link for entry in entries]
    return urls
# News pages are usually cluttered with ads and menus. Use newspaper3k to
# extract just the article text.
def getArticleText(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text
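# Pages disappear or time out in practice; newspaper3k surfaces a failed
# download as an ArticleException when parse() is called. A minimal sketch,
# assuming the policy is simply to skip pages that cannot be fetched
# (getArticleTextSafe is a hypothetical helper, not part of the original flow):
def getArticleTextSafe(url):
    from newspaper.article import ArticleException
    try:
        return getArticleText(url)
    except ArticleException:
        return None  # caller skips pages that cannot be fetched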
# Ask a local Ollama server to summarize the article text
def get_summary(text):
    systemPrompt = (
        "Write a concise summary of the text; return your response as "
        "5 lines that cover the key points of the text given."
    )

    url = "http://localhost:11434/api/generate"
    payload = {
        "model": "mistral-openorca",
        "prompt": text,
        "system": systemPrompt,
        "stream": False,
    }

    # requests serializes the payload and sets the Content-Type header itself
    response = requests.post(url, json=payload)
    return response.json()["response"]
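# knn_search (below) expects each article record to be shaped like
# {'embeddings': [{'embedding': [...], 'source': '<chunk text>'}, ...]}.
# A minimal sketch of building one such record with Ollama's /api/embeddings
# endpoint; treating chunker(text) as yielding text chunks and reusing the
# mistral-openorca model are assumptions, not part of the original flow.
def embed_article(text, model="mistral-openorca"):
    article = {"embeddings": []}
    for chunk in chunker(text):  # assumption: chunker(text) yields text chunks
        response = requests.post(
            "http://localhost:11434/api/embeddings",
            json={"model": model, "prompt": chunk},
        )
        article["embeddings"].append(
            {"embedding": response.json()["embedding"], "source": chunk}
        )
    return article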
# Perform K-nearest neighbors (KNN) search over all chunk embeddings.
# `embeddings` is a list of article records, each shaped like
# {'embeddings': [{'embedding': [...], 'source': '<chunk text>'}, ...]}.
def knn_search(question_embedding, embeddings, k=5):
    X = np.array([item['embedding'] for article in embeddings for item in article['embeddings']])
    source_texts = [item['source'] for article in embeddings for item in article['embeddings']]

    # Fit a KNN model on the embeddings
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(X)

    # scikit-learn expects a 2D array of query points, so reshape the single
    # question embedding into one row
    question_embedding = np.array(question_embedding).reshape(1, -1)

    # Find the indices and distances of the k-nearest neighbors
    distances, indices = knn.kneighbors(question_embedding, n_neighbors=k)

    # Return (index, source text) pairs for the best matches
    best_matches = [(indices[0][i], source_texts[indices[0][i]]) for i in range(k)]
    return best_matches
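# A sketch of how the pieces might fit together; the original file's entry
# point is not shown, so the flow below is an assumption.
if __name__ == "__main__":
    feed_url = curses.wrapper(menu)
    for article_url in getUrls(feed_url, n=5):
        text = getArticleText(article_url)
        print(get_summary(text))
        print("-" * 40)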