From 37d8fb596dc6553fdc8870634c6e0207270b6d9a Mon Sep 17 00:00:00 2001
From: Wagalidoom
Date: Thu, 21 Nov 2024 04:12:41 +0100
Subject: [PATCH] feat: add back python script

---
 src/generateYmal.js              |  42 +++
 src/scripts/.env.test            |   1 +
 src/scripts/requirements.txt     |   6 +
 src/scripts/setup_environment.sh |  22 ++
 src/scripts/snapshot_crawler.py  | 455 +++++++++++++++++++++++++++++++
 5 files changed, 526 insertions(+)
 create mode 100644 src/generateYmal.js
 create mode 100644 src/scripts/.env.test
 create mode 100644 src/scripts/requirements.txt
 create mode 100644 src/scripts/setup_environment.sh
 create mode 100644 src/scripts/snapshot_crawler.py

diff --git a/src/generateYmal.js b/src/generateYmal.js
new file mode 100644
index 00000000..510ccff3
--- /dev/null
+++ b/src/generateYmal.js
@@ -0,0 +1,42 @@
+import { spawn } from 'child_process';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import fs from 'fs';
+
+// Convert the URL path of the current module to a directory path
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+// Execute the Python script
+const runPythonScript = () => {
+  // Path to the Python executable in the virtual environment
+  const pythonExecutable = path.join(__dirname, 'scripts/gip_scraper/bin/python');
+
+  // Log file paths
+  const stdoutLog = path.join(__dirname, 'logs/python_stdout.log');
+  const stderrLog = path.join(__dirname, 'logs/python_stderr.log');
+
+  // Ensure log directory exists
+  fs.mkdirSync(path.join(__dirname, 'logs'), { recursive: true });
+
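+  // NOTE: the interpreter path above is resolved relative to this file,
+  // but the script argument below is resolved against the process's working
+  // directory, so this presumably expects to be launched from the repo root.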
+  // Spawn the Python process using the virtual environment's Python executable
+  const pythonProcess = spawn(pythonExecutable, ['src/scripts/snapshot_crawler.py']);
+
+  // Handle standard output
+  pythonProcess.stdout.on('data', (data) => {
+    console.log(data.toString());
+    fs.appendFileSync(stdoutLog, data);
+  });
+
+  // Handle error output
+  pythonProcess.stderr.on('data', (data) => {
+    console.error(data.toString());
+    fs.appendFileSync(stderrLog, data);
+  });
+
+  // Handle process exit
+  pythonProcess.on('close', (code) => {
+    console.log(`Python script exited with code ${code}`);
+  });
+};
+
+runPythonScript();
\ No newline at end of file
diff --git a/src/scripts/.env.test b/src/scripts/.env.test
new file mode 100644
index 00000000..1abeb1da
--- /dev/null
+++ b/src/scripts/.env.test
@@ -0,0 +1 @@
+GEMINI_API_KEY
\ No newline at end of file
diff --git a/src/scripts/requirements.txt b/src/scripts/requirements.txt
new file mode 100644
index 00000000..3ba9974d
--- /dev/null
+++ b/src/scripts/requirements.txt
@@ -0,0 +1,6 @@
+requests
+pyyaml
+beautifulsoup4
+html2text
+google-generativeai
+python-dotenv
\ No newline at end of file
diff --git a/src/scripts/setup_environment.sh b/src/scripts/setup_environment.sh
new file mode 100644
index 00000000..088f6702
--- /dev/null
+++ b/src/scripts/setup_environment.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Get the directory where the script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+
+# Setup or confirm virtual environment
+ENV_DIR="$SCRIPT_DIR/gip_scraper"
+if [ ! -d "$ENV_DIR" ]; then
+  echo "Setting up a new Python virtual environment..."
+  python3 -m venv "$ENV_DIR"
+else
+  echo "Using existing virtual environment."
+fi
+
+# Activate the virtual environment
+source "$ENV_DIR/bin/activate"
+
+# Install or update dependencies
+echo "Installing requirements from requirements.txt..."
+pip install -r "$SCRIPT_DIR/requirements.txt"
+
+echo "Environment setup is complete."
\ No newline at end of file
diff --git a/src/scripts/snapshot_crawler.py b/src/scripts/snapshot_crawler.py
new file mode 100644
index 00000000..a6e734b5
--- /dev/null
+++ b/src/scripts/snapshot_crawler.py
@@ -0,0 +1,455 @@
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+import requests
+import yaml
+import re
+import os
+import logging
+from bs4 import BeautifulSoup
+import html2text
+from datetime import datetime
+import time
+from dotenv import load_dotenv
+import google.generativeai as genai
+from collections import deque
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+print("Current working directory:", os.getcwd())
+
+load_dotenv()
+
+# Configure the Gemini API
+GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
+genai.configure(api_key=GEMINI_API_KEY)
+
+class CustomDumper(yaml.SafeDumper):
+    def increase_indent(self, flow=False, indentless=False):
+        return super(CustomDumper, self).increase_indent(flow, False)
+
+class RateLimiter:
+    def __init__(self, max_calls, period):
+        self.max_calls = max_calls
+        self.period = period
+        self.calls = deque()
+
+    def __call__(self, f):
+        def wrapped(*args, **kwargs):
+            now = time.time()
+
+            # Remove old calls
+            while self.calls and now - self.calls[0] >= self.period:
+                self.calls.popleft()
+
+            if len(self.calls) >= self.max_calls:
+                sleep_time = self.period - (now - self.calls[0])
+                logger.info(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds.")
+                time.sleep(sleep_time)
+                now = time.time()
+
+            self.calls.append(now)
+            return f(*args, **kwargs)
+        return wrapped
+
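+# The sliding-window limiter below is presumably sized for the Gemini
+# free-tier quota of 15 requests per minute; adjust max_calls/period if
+# your quota differs.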
+# Create a rate limiter for 15 calls per 60 seconds
+rate_limiter = RateLimiter(max_calls=15, period=60)
+
+@rate_limiter
+def extract_funding_info(text):
+    try:
+        model = genai.GenerativeModel('gemini-1.5-flash')
+        prompt = f"""
+        Extract funding information from the following text. If funding is mentioned, provide the amount and currency. If no funding is mentioned, say 'No funding mentioned'.
+
+        Text: {text[:1000]}
+
+        Respond in the following format:
+        Amount: [number or 'None']
+        Currency: [currency code or symbol, or 'None']
+        Confidence: [High/Medium/Low]
+        """
+
+        response = model.generate_content(prompt)
+        result = response.text
+
+        # Parse the result
+        amount_match = re.search(r'Amount: (.+)', result)
+        currency_match = re.search(r'Currency: (.+)', result)
+        confidence_match = re.search(r'Confidence: (.+)', result)
+
+        amount = amount_match.group(1) if amount_match else None
+        currency = currency_match.group(1) if currency_match else None
+        confidence = confidence_match.group(1) if confidence_match else 'Low'
+
+        if amount == 'None' or currency == 'None':
+            amount = None
+            currency = None
+
+        return {
+            'amount': amount,
+            'currency': currency,
+            'confidence': confidence,
+            'model_used': 'gemini-1.5-flash'
+        }
+
+    except Exception as e:
+        logger.error(f"Error in extract_funding_info: {str(e)}")
+        return {
+            'amount': None,
+            'currency': None,
+            'confidence': None,
+            'error': str(e),
+            'model_used': 'none'
+        }
+
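+# NOTE: the regex parsing in extract_funding_info assumes the model echoes the
+# requested "Amount:/Currency:/Confidence:" template; responses that deviate
+# simply fall back to None values rather than raising.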
+def fetch_html_content(url):
+    header = {'User-Agent': 'Mozilla/5.0'}
+    response = requests.get(url, headers=header)
+    if response.status_code == 200:
+        return response.text
+    return None
+
+def parse_html_content(html_content):
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Extract timestamp
+    time_element = soup.find('time', class_='post-time')
+    datetime_value = time_element['datetime'] if time_element else ""
+    unix_timestamp = convert_to_unix_timestamp(datetime_value)
+
+    # Extract tags
+    discourse_tags = [tag.text for tag in soup.find_all('a', class_='discourse-tag')]
+
+    # Extract meta content and title
+    meta_content = soup.find('meta', attrs={'name': 'description'})['content'] if soup.find('meta', attrs={'name': 'description'}) else ""
+    title_content = soup.title.string.split(':')[-1].replace('- GIPs - Gnosis', '').strip() if soup.title else ""
+
+    # Clean up HTML content
+    clean_html_content(soup)
+
+    # Remove title from body content
+    # Find and remove the first h1 or h2 that contains the title
+    title_elements = soup.find_all(['h1', 'h2'])
+    for element in title_elements:
+        if element.get_text().strip().lower() == title_content.lower() or \
+           element.get_text().strip().startswith(f"GIP-") or \
+           "Should" in element.get_text():
+            element.decompose()
+            break
+
+    # Convert remaining content to Markdown
+    markdown_text = html_to_markdown(str(soup.body))
+
+    # Remove any remaining title-like lines from the start of the markdown
+    markdown_lines = markdown_text.split('\n')
+    while markdown_lines and (
+        markdown_lines[0].strip().lower() == title_content.lower() or
+        markdown_lines[0].strip().startswith("# GIP-") or
+        "Should" in markdown_lines[0]
+    ):
+        markdown_lines.pop(0)
+
+    # Rejoin the markdown text, skipping empty lines at the start
+    while markdown_lines and not markdown_lines[0].strip():
+        markdown_lines.pop(0)
+
+    markdown_text = '\n'.join(markdown_lines)
+    cleaned_body = clean_body_content(markdown_text)
+
+    return cleaned_body, meta_content, title_content, unix_timestamp, discourse_tags
+
+def clean_body_content(body_text):
+    # Split into lines for processing
+    lines = body_text.split('\n')
+
+    # Find where the actual content starts
+    content_start = 0
+    in_header = False
+
+    for i, line in enumerate(lines):
+        line = line.strip()
+
+        # Skip empty lines
+        if not line:
+            continue
+
+        # Check for header section markers
+        if line.startswith('GIP:') or line == '0 voters':
+            in_header = True
+            continue
+
+        # Look for common content start markers
+        if (line.startswith('##') or
+                line.startswith('Category') or
+                line.startswith('Executive Summary') or
+                line.startswith('Simple Summary') or
+                line.startswith('Abstract') or
+                line.startswith('Motivation')):
+            content_start = i
+            break
+
+        # If we're not in a header section and find substantial content, start here
+        if not in_header and len(line) > 20 and not line.startswith('*'):
+            content_start = i
+            break
+
+    # Join remaining lines
+    cleaned_content = '\n'.join(lines[content_start:])
+
+    # Remove any leading empty lines
+    cleaned_content = cleaned_content.lstrip()
+
+    return cleaned_content
+
+def convert_to_unix_timestamp(datetime_value):
+    if datetime_value:
+        dt_obj = datetime.strptime(datetime_value, "%Y-%m-%dT%H:%M:%SZ")
+        # NOTE: time.mktime interprets this (UTC) datetime in local time, so the
+        # resulting epoch value can be off by the machine's UTC offset.
+        return int(time.mktime(dt_obj.timetuple()))
+    return ""
+
+def clean_html_content(soup):
+    elements_to_remove = [
+        ('div', {'itemprop': 'comment'}),
+        ('header', {}),
+        ('footer', {}),
+        ('div', {'id': 'topic-title'}),
+        ('span', {'class': 'creator'}),
+        ('span', {'class': 'crawler-post-infos'})
+    ]
+    for tag, attrs in elements_to_remove:
+        for element in soup.find_all(tag, attrs):
+            element.decompose()
+
+def html_to_markdown(html_content):
+    converter = html2text.HTML2Text()
+    converter.ignore_links = False
+    converter.body_width = 0
+    return converter.handle(html_content)
+
+def extract_info_from_meta(content):
+    patterns = {
+        'gip_number': r'GIP: (\d+)',
+        'author': r'author: ([^,]+)',
+        'state': r'status: (.*?)(?:, [a-z]+:|$)',
+        'type': r'type: ([^,]+)',
+        'created': r'created: (\d{4}-\d{2}-\d{2})'
+    }
+    return {key: re.search(pattern, content, re.IGNORECASE).group(1).strip() if re.search(pattern, content, re.IGNORECASE) else None
+            for key, pattern in patterns.items()}
+
+def fetch_forum_gips(base_url):
+    logger.info(f"Fetching GIPs from {base_url}")
+    max_gip = 0
+    topics = []
+    page = 0
+    try:
+        while True:
+            url = f"{base_url}?page={page}"
+            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
+            response.raise_for_status()
+            topic_list = response.json()['topic_list']
+            new_topics = topic_list['topics']
+            topics.extend(new_topics)
+
+            for topic in new_topics:
+                match = re.search(r'GIP-?(\d+)', topic['slug'], re.IGNORECASE)
+                if match:
+                    max_gip = max(max_gip, int(match.group(1)))
+
+            if 'more_topics_url' not in topic_list:
+                break
+            page += 1
+        logger.info(f"Fetched {len(topics)} topics, max GIP number: {max_gip}")
+        return max_gip, topics
+    except Exception as e:
+        logger.error(f"Error fetching forum GIPs: {str(e)}")
+        return 0, []
+
+def fetch_snapshot_proposals(max_gip):
+    logger.info(f"Fetching snapshot proposals for max GIP: {max_gip}")
+    url = 'https://hub.snapshot.org/graphql'
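+    # "first" below is set to the highest GIP number found on the forum, on the
+    # assumption that the gnosis.eth space holds at most that many proposals.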
+    payload = {
+        "operationName": "Proposals",
+        "variables": {
+            "first": max_gip,
+            "skip": 0,
+            "space_in": ["gnosis.eth"],
+            "state": "all",
+            "title_contains": "",
+            "flagged": False
+        },
+        "query": """query Proposals($first: Int!, $skip: Int!, $state: String!, $space: String, $space_in: [String], $author_in: [String], $title_contains: String, $space_verified: Boolean, $flagged: Boolean) {
+            proposals(first: $first, skip: $skip, where: {space: $space, state: $state, space_in: $space_in, author_in: $author_in, title_contains: $title_contains, space_verified: $space_verified, flagged: $flagged}) {
+                id
+                title
+                body
+                start
+                end
+                state
+                author
+                created
+                choices
+                scores_state
+                scores_total
+                scores
+                votes
+                quorum
+            }
+        }"""
+    }
+    try:
+        response = requests.post(url, json=payload)
+        response.raise_for_status()
+        proposals = response.json()['data']['proposals']
+        logger.info(f"Fetched {len(proposals)} proposals from Snapshot")
+        return proposals
+    except Exception as e:
+        logger.error(f"Error fetching snapshot proposals: {str(e)}")
+        return []
+
+def extract_and_clean_gip_number(title):
+    match = re.search(r'GIP[- ]?(\d+)', title, re.IGNORECASE)
+    if match:
+        clean_title = re.sub(r'\s*GIP[- ]?\d+:\s*', '', title, flags=re.IGNORECASE)
+        return match.group(1), clean_title
+    return None, title
+
+def integrate_missing_proposals(missing_gips, forum_topics):
+    missing_proposals = []
+    for topic in forum_topics:
+        slug = topic['slug']
+        match = re.search(r'GIP-?(\d+)', slug, re.IGNORECASE)
+        if match and int(match.group(1)) in missing_gips:
+            url = f'https://forum.gnosis.io/t/{slug}'
+            html_content = fetch_html_content(url)
+            if html_content:
+                full_content, meta_content, title_content, unix_timestamp, discourse_tags = parse_html_content(html_content)
+                proposal_info = extract_info_from_meta(meta_content)
+                proposal = create_proposal_dict(slug, int(match.group(1)), title_content, full_content, unix_timestamp, discourse_tags, proposal_info)
+                missing_proposals.append(proposal)
+    return missing_proposals
+
+def create_proposal_dict(slug, gip_number, title, body, start, state, proposal_info):
+    funding_info = extract_funding_info(body)
+    return {
+        'id': slug,
+        'gip_number': gip_number,
+        'title': title,
+        'body': body,
+        'start': start,
+        'end': None,
+        'state': state,
+        'author': proposal_info['author'],
+        'choices': ['For', 'Against', 'Abstain'],
+        'scores_state': None,
+        'scores_total': None,
+        'scores': [0, 0, 0],
+        'votes': None,
+        'quorum': None,
+        'funding': funding_info
+    }
+
+def create_yaml_content(proposal):
+    scores = proposal['scores']
+    if len(scores) == 2:
+        scores.append(0)
+    return {
+        'id': proposal['id'],
+        'gip_number': proposal['gip_number'],
+        'url': f'https://forum.gnosis.io/t/{proposal["id"]}' if proposal["id"][:3] == 'gip' else f'https://snapshot.org/#/gnosis.eth/proposal/{proposal["id"]}',
+        'title': proposal['title'],
+        'body': proposal['body'],
+        'start': proposal['start'],
+        'end': proposal['end'],
+        'state': proposal['state'],
+        'author': proposal['author'],
+        'choices': ['For', 'Against', 'Abstain'],
+        'scores_state': proposal['scores_state'],
+        'scores_total': proposal['scores_total'],
+        'scores': scores,
+        'votes': proposal['votes'],
+        'quorum': proposal['quorum'],
+        'funding': proposal.get('funding', {'amount': None, 'currency': None, 'confidence': None})
+    }
+
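+# gip_tracker maps a GIP number to (latest_start, redo_count): when several
+# proposals share a GIP number, the one with the latest start keeps the
+# canonical GIP-{n}.yml name and earlier ones are shifted to "-redo{n}" files.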
+def save_proposal_as_yaml(proposal, gip_tracker):
+    gip_number = proposal['gip_number']
+    start = proposal.get('start', 0)
+
+    if gip_number:
+        if gip_number in gip_tracker:
+            max_start, file_id = gip_tracker[gip_number]
+            if start > max_start:
+                gip_tracker[gip_number] = (start, file_id + 1)
+                file_suffix = ""
+                old_file_name = f"./public/GIPs/GIP-{gip_number}.yml"
+                new_file_name = f"./public/GIPs/GIP-{gip_number}-redo{file_id}.yml"
+                if os.path.exists(old_file_name):
+                    os.rename(old_file_name, new_file_name)
+            else:
+                file_suffix = f"-redo{file_id + 1}"
+        else:
+            gip_tracker[gip_number] = (start, 0)
+            file_suffix = ""
+    else:
+        file_suffix = "-unknown"
+
+    yaml_content = create_yaml_content(proposal)
+    file_name = f"./public/GIPs/GIP-{gip_number}{file_suffix}.yml"
+    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+
+    with open(file_name, 'w') as file:
+        yaml.dump(yaml_content, file, Dumper=CustomDumper, allow_unicode=True, sort_keys=False)
+
+def main():
+    try:
+        logger.info("Starting GIP scraping process")
+        base_url = 'https://forum.gnosis.io/c/dao/gips/20.json'
+
+        max_gip, forum_topics = fetch_forum_gips(base_url)
+        if max_gip == 0:
+            logger.error("Failed to fetch GIPs from forum. Exiting.")
+            return
+
+        proposals = fetch_snapshot_proposals(max_gip)
+        if not proposals:
+            logger.error("Failed to fetch proposals from Snapshot. Exiting.")
+            return
+
+        processed_proposals = []
+        for i, proposal in enumerate(proposals):
+            try:
+                gip_number, clean_title = extract_and_clean_gip_number(proposal['title'])
+                proposal['gip_number'] = gip_number
+                proposal['title'] = clean_title
+                proposal['funding'] = extract_funding_info(proposal['body'])
+                processed_proposals.append(proposal)
+                logger.info(f"Processed proposal {i+1}/{len(proposals)}: GIP-{gip_number}")
+            except Exception as e:
+                logger.error(f"Error processing proposal {proposal.get('id', 'Unknown')}: {str(e)}")
+
+        processed_proposals = sorted(processed_proposals, key=lambda p: p['created'])
+
+        gip_numbers_from_api = {int(p['gip_number']) for p in processed_proposals if p['gip_number']}
+        missing_gips = set(range(1, max_gip + 1)) - gip_numbers_from_api
+
+        additional_proposals = integrate_missing_proposals(missing_gips, forum_topics)
+        processed_proposals.extend(additional_proposals)
+
+        gip_tracker = {}
+        for proposal in processed_proposals:
+            try:
+                save_proposal_as_yaml(proposal, gip_tracker)
+            except Exception as e:
+                logger.error(f"Error saving proposal {proposal.get('gip_number', 'Unknown')}: {str(e)}")
+
+        logger.info(f"Generated {len(processed_proposals)} YAML files.")
+    except Exception as e:
+        logger.error(f"An error occurred in main: {str(e)}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file