# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility module to:
- Retrieve top search terms from Google Trends dataset
- Query the GDELT API to retrieve news related to top search terms
- Summarize news articles
"""
from datetime import datetime
from typing import Any, Dict, List, Optional

import requests
from newspaper import Article
from newspaper import ArticleException
class GoogleTrends:
    """Retrieve top Google Trends search terms from the BigQuery public dataset."""

    def __init__(
            self,
            project_id: str,
            bq_client: Any):
        """
        Args:
            project_id: Google Cloud project id associated with the client.
            bq_client: An initialized BigQuery client used to run the query.
        """
        self.project_id = project_id
        self.bq_client = bq_client

    def run(self, refresh_date: str) -> List[str]:
        """Gets the top search term on a given date from the BigQuery
        `google_trends.top_terms` public dataset.

        Args:
            refresh_date: Date of the search terms to retrieve, as YYYY-MM-DD.

        Returns:
            The individual words of the highest-ranked search term on the
            given date (suitable as keywords for a news search), or an
            empty list if the dataset has no terms for that date.

        Raises:
            ValueError: If `refresh_date` is not a valid YYYY-MM-DD date.
        """
        # Validate up front: the value is interpolated into the SQL string
        # below, so rejecting anything that is not a plain date also
        # prevents SQL injection through this parameter.
        datetime.strptime(refresh_date, '%Y-%m-%d')
        query = f"""
        SELECT term, rank FROM `bigquery-public-data.google_trends.top_terms`
        WHERE refresh_date = '{refresh_date}'
        GROUP BY 1,2
        ORDER by rank ASC
        """
        query_job = self.bq_client.query(
            query,
            location="US",
        )
        df = query_job.to_dataframe()
        # Guard against an empty result set: the original code raised
        # KeyError on `df.loc[0]` when no rows came back.
        if df.empty:
            return []
        # Only the top-ranked term (first row) is used; split it into words.
        return df.loc[0].values[0].split(' ')
class GDELTRetriever:
    """Query the GDELT DOC 2.0 API to retrieve news related to top search terms."""

    def __init__(self, max_records: int = 10, tone: str = 'positive'):
        """
        Args:
            max_records: Maximum number of articles to request from GDELT.
            tone: 'positive' (articles with tone > 5) or 'negative'
                (articles with tone < -5).

        Raises:
            ValueError: If `tone` is neither 'positive' nor 'negative'.
        """
        self.gdelt_api_url: str = 'https://api.gdeltproject.org/api/v2/doc/doc'
        self.mode: str = 'ArtList'
        self.format: str = 'json'
        self.max_records: int = max_records
        self.n_near_words: int = 20
        self.source_country: str = 'US'
        self.source_lang: str = 'english'
        if tone == 'positive':
            self.tone = 'tone>5'
        elif tone == 'negative':
            self.tone = 'tone<-5'
        else:
            # The original code silently left `self.tone` unset for any other
            # value, deferring the failure to an AttributeError at query time.
            raise ValueError(
                f"tone must be 'positive' or 'negative', got {tone!r}")

    def _get_articles_info(
            self,
            keywords: List[str],
            startdate: datetime,
            enddate: datetime) -> Dict:
        """Get articles that match the given keywords.

        Args:
            keywords: Keywords to search for (combined with a NEAR operator).
            startdate: Start of the search window; a datetime, or a string
                already formatted as YYYYMMDDHHMMSS.
            enddate: End of the search window (same accepted formats).

        Returns:
            The decoded JSON response with articles matching the keywords.

        Raises:
            requests.HTTPError: If the GDELT API returns an error status.
        """
        query = f'near{self.n_near_words}:"{" ".join(keywords)}" '
        query += f'sourcecountry:{self.source_country} sourcelang:{self.source_lang} '
        query += f'{self.tone}'
        # The GDELT DOC API expects timestamps as YYYYMMDDHHMMSS; format
        # datetime objects accordingly and pass strings through untouched.
        if isinstance(startdate, datetime):
            startdate = startdate.strftime('%Y%m%d%H%M%S')
        if isinstance(enddate, datetime):
            enddate = enddate.strftime('%Y%m%d%H%M%S')
        params = {'query': query,
                  'format': self.format,
                  'mode': self.mode,
                  'maxrecords': str(self.max_records),
                  'startdatetime': startdate,
                  'enddatetime': enddate}
        response = requests.get(self.gdelt_api_url, params=params)
        response.raise_for_status()
        return response.json()

    def _parse_article(self, url: str) -> 'Optional[Article]':
        """Download and parse the article at the given URL.

        Args:
            url: The URL of the article to parse.

        Returns:
            The parsed `newspaper.Article`, or None if downloading or
            parsing fails. (The original annotation claimed `str`, but the
            function has always returned the Article object itself.)
        """
        article = Article(url)
        try:
            article.download()
            article.parse()
        except ArticleException:
            return None
        return article

    def _get_documents(self, articles: Dict) -> List[Dict]:
        """Convert GDELT article metadata into parsed documents.

        Args:
            articles: GDELT response dict containing an 'articles' list.

        Returns:
            One dict per successfully parsed article, de-duplicated by
            title, with keys: page_content, title, url, domain, date.
        """
        documents = []
        unique_titles = set()
        # `.get` so an empty GDELT response (no 'articles' key) yields [].
        for article in articles.get('articles', []):
            parsed_article = self._parse_article(article['url'])
            if not parsed_article or not parsed_article.text:
                continue
            if article['title'] in unique_titles:
                continue
            unique_titles.add(article['title'])
            documents.append({
                'page_content': parsed_article.text,
                'title': article['title'],
                'url': article['url'],
                'domain': article['domain'],
                'date': article['seendate'],
            })
        return documents

    def get_relevant_documents(self, query: Dict) -> List[Dict]:
        """Gets a list of relevant documents from a query.

        Args:
            query: Mapping with keys 'keywords', 'startdate' and 'enddate'.
                (The original annotation said `str`, but the value has
                always been indexed as a mapping.)

        Returns:
            A list of parsed article documents (see `_get_documents`).
        """
        articles = self._get_articles_info(
            query['keywords'], query['startdate'], query['enddate'])
        return self._get_documents(articles)
def summarize_news_article(document: Dict, llm):
    """Summarize a single news article in one sentence.

    Args:
        document: Mapping holding the article text under the key
            `page_content`.
        llm: Language model exposing `predict(prompt)`, whose result
            carries the generated text in a `.text` attribute.

    Returns:
        The same mapping, mutated in place with an added `summary` key
        containing the one-sentence summary.
    """
    article_text = document['page_content']
    prompt = f"""Write a one sentence summary of the following article delimited by triple backticks:
```{article_text}```
"""
    response = llm.predict(prompt)
    document['summary'] = response.text
    return document
def summarize_documents(documents: List[Dict], llm) -> List[Dict]:
    """Summarizes a list of news articles.

    Args:
        documents: A list of news articles, each a dictionary with at
            least a `page_content` key holding the article text. (The
            original annotation said `Dict`, but the value has always
            been iterated as a list.)
        llm: A language model that can be used to generate summaries.

    Returns:
        A list of dictionaries, each containing the original
        `page_content` plus a one-sentence `summary` key. The input
        dictionaries are mutated in place by `summarize_news_article`.
    """
    return [summarize_news_article(document, llm) for document in documents]