
Merge pull request #829 from jmorganca/mattw/example-summarize-news

added python rag news summary
Matt Williams, 1 year ago
Commit cc0bf96398

+ 22 - 0
examples/python-rag-newssummary/README.md

@@ -0,0 +1,22 @@
+# News Summarizer
+
+This example goes through a series of steps:
+
+  1. You choose a topic area (e.g., "news", "Nvidia", "music", etc.).
+  2. It fetches the most recent articles on that topic from a matching RSS feed.
+  3. It uses Ollama to summarize each article.
+  4. It splits each article into chunks of sentences.
+  5. It uses Sentence Transformers to generate an embedding for each of those chunks.
+  6. You enter a question about the summaries shown.
+  7. It uses Sentence Transformers to generate an embedding for that question.
+  8. It uses the question embedding to find the chunks most similar to the question.
+  9. It feeds the question and those chunks to Ollama, which answers using only the retrieved news articles.
+
+This example lets you pick from a few different topic areas, then summarizes the five most recent articles for that topic. It then splits each article into chunks of sentences and generates an embedding for each chunk, as sketched below.
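+
+To make the embedding step concrete, here is a minimal sketch of what Sentence Transformers does with those chunks (all-MiniLM-L6-v2 is the same model the example loads; the sample strings are invented):
+
+```python
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('all-MiniLM-L6-v2')
+chunks = ["Nvidia announced new GPUs.", "The festival lineup was revealed."]
+vectors = model.encode(chunks)  # one 384-dimensional vector per chunk
+print(vectors.shape)            # (2, 384)
+```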
+
+You can run the example like this:
+
+```bash
+pip install -r requirements.txt
+python summ.py
+```
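+
+Note: the sentence chunking relies on NLTK's tokenizers under the hood; if the first run fails with a missing-resource error, downloading the punkt data should fix it:
+
+```bash
+python -c "import nltk; nltk.download('punkt')"
+```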

+ 9 - 0
examples/python-rag-newssummary/requirements.txt

@@ -0,0 +1,9 @@
+beautifulsoup4==4.12.2
+feedparser==6.0.10
+mattsollamatools==0.0.8
+newspaper3k==0.2.8
+nltk==3.8.1
+numpy==1.24.3
+Requests==2.31.0
+scikit_learn==1.3.0
+sentence_transformers==2.2.2

+ 86 - 0
examples/python-rag-newssummary/summ.py

@@ -0,0 +1,86 @@
+import curses
+import requests
+from sentence_transformers import SentenceTransformer
+from mattsollamatools import chunker
+from utils import topic_urls, menu, getUrls, get_summary, getArticleText, knn_search
+
+if __name__ == "__main__":
+    chosen_topic = curses.wrapper(menu)
+    print("Here is your news summary:\n")
+    urls = getUrls(topic_urls[chosen_topic], n=5)  # menu() returns a topic name; look up its feed URL
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    allEmbeddings = []
+
+    for url in urls:
+        article = {}
+        article['embeddings'] = []
+        article['url'] = url
+        text = getArticleText(url)
+        summary = get_summary(text)
+        chunks = chunker(text)  # split the article into chunks of sentences (from mattsollamatools)
+        embeddings = model.encode(chunks)
+        for (chunk, embedding) in zip(chunks, embeddings):
+            item = {}
+            item['source'] = chunk
+            item['embedding'] = embedding.tolist()  # convert the NumPy array to a list
+            item['sourcelength'] = len(chunk)
+            article['embeddings'].append(item)
+
+        allEmbeddings.append(article)
+
+        print(f"{summary}\n")
+
+
+    # Initialize the Ollama context once, outside the loop, so that follow-up
+    # questions keep the conversation state the model returns.
+    context = []
+    while True:
+        # Input a question from the user
+        question = input("Enter your question about the news, or type quit: ")
+
+        if question.lower() == 'quit':
+            break
+
+        # Embed the user's question
+        question_embedding = model.encode([question])
+
+        # Perform KNN search to find the best matches (indices and source text)
+        best_matches = knn_search(question_embedding, allEmbeddings, k=10)
+
+        # Assemble the retrieved chunks into one block of source text
+        sourcetext = ""
+        for i, (index, source_text) in enumerate(best_matches, start=1):
+            sourcetext += f"{i}. Index: {index}, Source Text: {source_text}\n"
+
+        systemPrompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"
+
+        url = "http://localhost:11434/api/generate"
+
+        payload = {
+            "model": "mistral-openorca",
+            "prompt": question,
+            "system": systemPrompt,
+            "stream": False,
+            "context": context,
+        }
+
+        # Send the POST request; requests serializes the payload to JSON and
+        # sets the Content-Type header when the json= keyword is used.
+        response = requests.post(url, json=payload)
+
+        # Check the response
+        if response.status_code == 200:
+            output = response.json()
+            context = output['context']  # carry the conversation forward
+            print(output['response'] + "\n")
+        else:
+            print(f"Request failed with status code {response.status_code}")
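
For reference, the non-streaming call that summ.py makes can be exercised on its own. A minimal sketch, assuming a local Ollama server on port 11434 with the mistral-openorca model already pulled; `response` and `context` are the two fields the example relies on:

```python
import requests

# Assumes `ollama serve` is running locally and `ollama pull mistral-openorca` was done.
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "mistral-openorca",
        "prompt": "Say hello in five words.",
        "stream": False,  # return a single JSON object instead of a stream
    },
)
resp.raise_for_status()
body = resp.json()
print(body["response"])      # the generated text
print(len(body["context"]))  # opaque token list used to continue a conversation
```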

+ 108 - 0
examples/python-rag-newssummary/utils.py

@@ -0,0 +1,108 @@
+import curses
+import feedparser
+import requests
+from newspaper import Article
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+# Create a dictionary to store topics and their URLs
+topic_urls = {
+    "Mac": "https://9to5mac.com/guides/mac/feed",
+    "News": "http://www.npr.org/rss/rss.php?id=1001",
+    "Nvidia": "https://nvidianews.nvidia.com/releases.xml",
+    "Raspberry Pi": "https://www.raspberrypi.com/news/feed/", 
+    "Music": "https://www.billboard.com/c/music/music-news/feed/"
+}
+
+# Use curses to create a menu of topics
+def menu(stdscr):
+    chosen_topic = select_topic(stdscr)
+    url = topic_urls[chosen_topic] if chosen_topic in topic_urls else "Topic not found"
+
+    stdscr.addstr(len(topic_urls) + 3, 0, f"Selected URL for {chosen_topic}: {url}")
+    stdscr.refresh()
+
+    return chosen_topic
+
+# Show the list of topics and return the name of the topic the user selects
+def select_topic(stdscr):
+    curses.curs_set(0)  # Hide the cursor
+    stdscr.clear()
+
+    stdscr.addstr(0, 0, "Choose a topic using the arrow keys (Press Enter to select):")
+
+    # Create a list of topics
+    topics = list(topic_urls.keys())
+    current_topic = 0
+
+    while True:
+        for i, topic in enumerate(topics):
+            if i == current_topic:
+                stdscr.addstr(i + 2, 2, f"> {topic}")
+            else:
+                stdscr.addstr(i + 2, 2, f"  {topic}")
+
+        stdscr.refresh()
+
+        key = stdscr.getch()
+
+        if key == curses.KEY_DOWN and current_topic < len(topics) - 1:
+            current_topic += 1
+        elif key == curses.KEY_UP and current_topic > 0:
+            current_topic -= 1
+        elif key == 10:  # Enter key
+            return topics[current_topic]  # return the topic name; menu() looks up its URL
+
+# Get the N most recent article URLs from an RSS feed
+def getUrls(feed_url, n=20):
+    feed = feedparser.parse(feed_url)
+    entries = feed.entries[:n]  # feeds are typically ordered newest-first
+    urls = [entry.link for entry in entries]
+    return urls
+
+# News pages are often full of ads and navigation. Use newspaper3k to extract just the article text.
+def getArticleText(url):
+    article = Article(url)
+    article.download()
+    article.parse()
+    return article.text
+
+def get_summary(text):
+    systemPrompt = "Write a concise summary of the text, return your responses with 5 lines that cover the key points of the text given."
+    prompt = text
+
+    url = "http://localhost:11434/api/generate"
+
+    payload = {
+        "model": "mistral-openorca",
+        "prompt": prompt,
+        "system": systemPrompt,
+        "stream": False
+    }
+    # requests serializes the payload and sets the Content-Type header for us
+    response = requests.post(url, json=payload)
+
+    return response.json()["response"]
+
+# Perform K-nearest neighbors (KNN) search
+def knn_search(question_embedding, embeddings, k=5):
+    X = np.array([item['embedding'] for article in embeddings for item in article['embeddings']])
+    source_texts = [item['source'] for article in embeddings for item in article['embeddings']]
+
+    # Guard against asking for more neighbors than there are stored chunks
+    k = min(k, len(X))
+
+    # Fit a KNN model on the embeddings
+    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
+    knn.fit(X)
+
+    # Find the indices and distances of the k-nearest neighbors
+    distances, indices = knn.kneighbors(question_embedding)
+
+    # Return (index, source text) pairs for the best matches
+    best_matches = [(indices[0][i], source_texts[indices[0][i]]) for i in range(k)]
+
+    return best_matches
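
A quick way to sanity-check `knn_search` in isolation is to build a toy store in the same shape summ.py produces (run from this example's directory; the documents and URL below are invented):

```python
from sentence_transformers import SentenceTransformer
from utils import knn_search

model = SentenceTransformer('all-MiniLM-L6-v2')
docs = ["Apple released a new Mac.", "The orchestra toured Europe.", "GPU prices fell again."]
store = [{
    'url': 'https://example.com/demo',
    'embeddings': [
        {'source': d, 'embedding': e.tolist(), 'sourcelength': len(d)}
        for d, e in zip(docs, model.encode(docs))
    ],
}]
question = model.encode(["What happened with computer hardware?"])
for index, text in knn_search(question, store, k=2):
    print(index, text)
```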