# Add URL scraper: web2folder (#3)

Open · wants to merge 1 commit into `main`
README.md — 23 additions & 2 deletions

@@ -43,9 +43,9 @@ Read the example character file and validate it against the JSON schema

# Scripts

- You can use the scripts the generate a character file from your tweets, convert a folder of documents into a knowledge file, and add knowledge to your character file.
+ You can use the scripts to generate a character file from your tweets, convert web pages or a folder of documents into a knowledge file, and add knowledge to your character file.

- Most of these scripts require an OpenAI or Anthropic API key.
+ Most of these scripts require an OpenAI or Anthropic API key. The web2folder script requires a FireCrawl API key.

## tweets2character

@@ -70,6 +70,27 @@ node scripts/tweets2character.js twitter-2024-07-22-aed6e84e05e7976f87480bc36686

Note that the arguments are optional and will be prompted for if not provided.

## web2folder

Convert web pages into markdown files that can be processed by folder2knowledge.

You can run web2folder directly from your command line without cloning the repository:

```sh
npx web2folder https://github.com/ai16z/eliza
```

Or after cloning the repo:

```sh
npm install
node scripts/web2folder.js https://github.com/ai16z/eliza
```

Note: you will need a [FireCrawl API key](https://docs.firecrawl.dev/introduction) set in your environment as `FIRECRAWL_API_KEY`.
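For example, on macOS or Linux you might export the key in your shell before running the script (the key below is a placeholder; real keys start with `fc-`):

```sh
export FIRECRAWL_API_KEY=fc-your-key-here
npx web2folder https://github.com/ai16z/eliza
```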

The script will create a `web-content` directory with markdown files that you can then process using folder2knowledge.
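
A minimal follow-up sketch, assuming folder2knowledge takes the folder path as its argument:

```sh
node scripts/folder2knowledge.js web-content
```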

## folder2knowledge

Convert a folder of documents into a .knowledge file which you can use with [Eliza](https://github.com/lalalune/eliza). It converts text, markdown, and PDF into normalized text in JSON format.
package.json — 5 additions & 1 deletion

@@ -7,12 +7,14 @@
"bin": {
"tweets2character": "scripts/tweets2character.js",
"folder2knowledge": "scripts/folder2knowledge.js",
"knowledge2character": "scripts/knowledge2character.js"
"knowledge2character": "scripts/knowledge2character.js",
"web2folder": "scripts/web2folder.js"
},
"scripts": {
"tweets2character": "node scripts/tweets2character.js",
"folder2knowledge": "node scripts/folder2knowledge.js",
"knowledge2character": "node scripts/knowledge2character.js",
"web2folder": "node scripts/web2folder.js",
"example": "node examples/example.mjs",
"validate": "node examples/validate.mjs"
},
@@ -28,6 +30,8 @@
"node-fetch": "^3.3.2",
"node-llama-cpp": "^3.0.0-beta.44",
"node-stream-zip": "^1.15.0",
"pdfjs-dist": "2.x",
"sanitize-filename": "^1.6.3",
"systeminformation": "^5.23.5",
"tiktoken": "^1.0.16"
},
Expand Down
scripts/web2folder.js — 157 additions (new file)

@@ -0,0 +1,157 @@
#!/usr/bin/env node

import dotenv from 'dotenv';
import fs from 'fs/promises';
import path from 'path';
import sanitizeFilename from 'sanitize-filename';
import os from 'os';
import readline from 'readline';

dotenv.config();

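// Firecrawl v1 REST API; the /scrape endpoint returns page content as markdown.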
const FIRECRAWL_API_URL = 'https://api.firecrawl.dev/v1';

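// The API key is cached in ~/tmp/.eliza/.env so it only has to be entered once.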
const tmpDir = path.join(os.homedir(), 'tmp', '.eliza');
const envPath = path.join(tmpDir, '.env');

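// Create the cache directory and an empty .env file on first run.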
const ensureTmpDirAndEnv = async () => {
  await fs.mkdir(tmpDir, { recursive: true });
  if (!await fs.access(envPath).then(() => true).catch(() => false)) {
    await fs.writeFile(envPath, '');
  }
};

const saveApiKey = async (apiKey) => {
  const envConfig = dotenv.parse(await fs.readFile(envPath, 'utf-8'));
  envConfig.FIRECRAWL_API_KEY = apiKey;
  await fs.writeFile(envPath, Object.entries(envConfig).map(([key, value]) => `${key}=${value}`).join('\n'));
};

const loadApiKey = async () => {
  const envConfig = dotenv.parse(await fs.readFile(envPath, 'utf-8'));
  return envConfig.FIRECRAWL_API_KEY;
};

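// Firecrawl keys start with "fc-"; this is a cheap format check, not an API call.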
const validateApiKey = (apiKey) => {
  return apiKey && apiKey.trim().startsWith('fc-');
};

const promptForApiKey = () => {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });

  return new Promise((resolve) => {
    rl.question('Enter your Firecrawl API key: ', (answer) => {
      rl.close();
      resolve(answer);
    });
  });
};

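// Resolve the API key in order: environment variable, cached key, interactive prompt.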
const getApiKey = async () => {
  if (validateApiKey(process.env.FIRECRAWL_API_KEY)) {
    return process.env.FIRECRAWL_API_KEY;
  }

  const cachedKey = await loadApiKey();
  if (validateApiKey(cachedKey)) {
    return cachedKey;
  }

  const newKey = await promptForApiKey();
  if (validateApiKey(newKey)) {
    await saveApiKey(newKey);
    return newKey;
  } else {
    console.error('Invalid API key provided. Exiting.');
    process.exit(1);
  }
};

const promptForUrls = () => {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });

  return new Promise((resolve) => {
    rl.question('Enter URLs (separated by spaces): ', (answer) => {
      rl.close();
      resolve(answer.split(' ').filter(url => url.trim()));
    });
  });
};

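// POST a single URL to Firecrawl's /scrape endpoint and return its markdown along with
// a filesystem-safe filename. Relies on the global fetch available in Node 18+.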
const scrapeUrl = async (url, apiKey) => {
  try {
    const response = await fetch(`${FIRECRAWL_API_URL}/scrape`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ url })
    });

    const result = await response.json();

    if (!result.success) {
      throw new Error(`Failed to fetch ${url}: ${result.error}`);
    }

    const filename = sanitizeFilename(url.replace(/^https?:\/\//, '')) + '.md';

    return {
      url,
      content: result.data.markdown,
      filename
    };
  } catch (error) {
    console.error(`Error processing URL ${url}:`, error);
    return null;
  }
};

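// Write one markdown file per scraped page into the output directory.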
const saveToFolder = async (outputDir, webData) => {
  if (!webData) return;

  const outputPath = path.join(outputDir, webData.filename);
  await fs.writeFile(outputPath, webData.content, 'utf-8');
  console.log(`Saved ${webData.url} to ${outputPath}`);
};

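// Entry point: resolve the API key, collect URLs from argv (or prompt for them),
// then scrape each URL into OUTPUT_DIR (default: "web-content").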
const main = async () => {
  try {
    await ensureTmpDirAndEnv();
    const apiKey = await getApiKey();
    process.env.FIRECRAWL_API_KEY = apiKey;

    let urls = process.argv.slice(2);
    if (urls.length === 0) {
      urls = await promptForUrls();
      if (urls.length === 0) {
        console.error('No URLs provided. Exiting.');
        process.exit(1);
      }
    }

    const outputDir = path.join(process.cwd(), process.env.OUTPUT_DIR || 'web-content');
    await fs.mkdir(outputDir, { recursive: true });

    for (const url of urls) {
      const webData = await scrapeUrl(url, apiKey);
      if (webData) {
        await saveToFolder(outputDir, webData);
      }
    }

    console.log('Done processing web content.');
  } catch (error) {
    console.error('Error during script execution:', error);
    process.exit(1);
  }
};

main();