Changes

tulane-cmps6730 · Apr 27, 2024 · b5480f2 · b5480f2
1 parent 478f72b
commit b5480f2
Show file tree

Hide file tree

Showing 5 changed files with 194 additions and 1 deletion.
diff --git a/nlp/app/research/AAPL.json b/nlp/app/research/AAPL.json
diff --git a/nlp/app/research/MSFT.json b/nlp/app/research/MSFT.json
diff --git a/nlp/app/research/NVDA.json b/nlp/app/research/NVDA.json
diff --git a/nlp/scrape.py b/nlp/scrape.py
@@ -0,0 +1,67 @@
+import yfinance as yf
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import time
+import json
+
+alexPath = "/Users/alex/Desktop/Tulane/S24/Repos/NLPP/"
+zachPath = ""
+relativePath = "project-finance/nlp/app/research/"
+
+def get_news_articles(ticker):
+    articles_list = []
+    stock = yf.Ticker(ticker)
+    news_items = stock.news
+
+    for item in news_items:
+        url = item['link']
+        try:
+            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, allow_redirects=True)
+            final_url = response.url  # Capture the final URL after any redirects
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                article_content = soup.find('div', {'class': 'caas-body'}) or soup.find('article')
+                if article_content:
+                    articles_list.append({
+                        'title': item['title'],
+                        'url': final_url,
+                        'content': article_content.text
+                    })
+                else:
+                    articles_list.append({
+                        'title': item['title'],
+                        'url': final_url,
+                        'content': "Content not found - page may use dynamic loading or have a different layout"
+                    })
+            else:
+                articles_list.append({
+                    'title': item['title'],
+                    'url': final_url,
+                    'content': f"Failed to fetch article: HTTP {response.status_code}"
+                })
+        except requests.exceptions.RequestException as e:
+            articles_list.append({
+                'title': item['title'],
+                'url': url,
+                'content': f"Error fetching the article: {str(e)}"
+            })
+
+    return articles_list
+
+def save_to_json(data, path):
+    with open(path, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+ticker = "AAPL"
+news_articles = get_news_articles(ticker)
+path = alexPath + relativePath + ticker + ".json"
+save_to_json(news_articles, path) 
+
+print("News articles saved to JSON.")
diff --git a/requirements.txt b/requirements.txt
@@ -11,4 +11,4 @@ tensorflow>=2.0.0
 WTForms>=2.2.1
 openai>=1.14.2
 python-dotenv>=1.0.1
-
+yfinance>=0.1.54