Skip to content

Commit

Permalink
cli: Added crawl command.
Browse files Browse the repository at this point in the history
  • Loading branch information
HavenDV committed Sep 8, 2024
1 parent 29119fe commit 5a62d9e
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 0 deletions.
125 changes: 125 additions & 0 deletions src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
using System.CommandLine;
using System.Diagnostics.CodeAnalysis;

namespace Firecrawl.Cli.Commands;

/// <summary>
/// CLI command that crawls a URL through the Firecrawl API, polls the crawl
/// job until it finishes, and writes every returned page to disk as markdown.
/// </summary>
public class CrawlCommand : Command
{
    public CrawlCommand() : base(name: "crawl", description: "Crawl a url and saves all pages as markdown")
    {
        var url = new Argument<string>(
            name: "url",
            getDefaultValue: () => string.Empty,
            description: "Input url");
        AddArgument(url);

        // Options carry the conventional "--" prefix so they can be passed as
        // e.g. `--limit 5`; the original prefix-less names were not usable as
        // ordinary command-line options.
        var outputPath = new Option<string>(
            name: "--outputPath",
            getDefaultValue: () => string.Empty,
            description: "Output path");
        AddOption(outputPath);

        var limit = new Option<int>(
            name: "--limit",
            getDefaultValue: () => 10,
            description: "Limit of pages to crawl");
        AddOption(limit);

        var maxDepth = new Option<int?>(
            name: "--maxDepth",
            getDefaultValue: () => null,
            description: "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.");
        AddOption(maxDepth);

        this.SetHandler(
            HandleAsync,
            url,
            outputPath,
            limit,
            maxDepth);
    }

    /// <summary>
    /// Starts a crawl job, waits for it to finish, then writes each crawled
    /// page's markdown into <paramref name="outputPath"/>.
    /// </summary>
    /// <param name="url">URL to crawl.</param>
    /// <param name="outputPath">Target directory; defaults to the URL's host when empty.</param>
    /// <param name="limit">Maximum number of pages to crawl.</param>
    /// <param name="maxDepth">Maximum crawl depth, or null to use the API default.</param>
    private static async Task HandleAsync(
        string url,
        string outputPath,
        int limit,
        int? maxDepth)
    {
        Console.WriteLine("Initializing...");

        var apiKey = await Helpers.GetApiKey().ConfigureAwait(false);
        using var api = new FirecrawlApp(apiKey);

        Console.WriteLine($"Crawling {url}...");

        // NOTE(review): job submission goes through `api.Crawling` while status
        // polling below goes through `api.Crawl` — confirm both sub-clients are
        // really intended; this looks like it may be the same endpoint group.
        var response = await api.Crawling.CrawlUrlsAsync(
            url: url,
            crawlerOptions: new CrawlUrlsRequestCrawlerOptions
            {
                MaxDepth = maxDepth,
                Limit = limit,
            },
            pageOptions: new CrawlUrlsRequestPageOptions
            {
                OnlyMainContent = true,
                WaitFor = 1000,
            }).ConfigureAwait(false);

        Console.WriteLine($"JobId: {response.JobId}");

        GetCrawlStatusResponse? statusResponse;
        while (true)
        {
            // The crawl runs server-side; poll its status every 5 seconds.
            await Task.Delay(TimeSpan.FromSeconds(5)).ConfigureAwait(false);

            statusResponse = await api.Crawl.GetCrawlStatusAsync(
                jobId: response.JobId!).ConfigureAwait(false);
            if (string.Equals(statusResponse.Status, "completed", StringComparison.Ordinal))
            {
                break;
            }

            // Bail out instead of polling forever when the job terminates
            // unsuccessfully (the original loop only ever exited on "completed").
            if (string.Equals(statusResponse.Status, "failed", StringComparison.Ordinal))
            {
                Console.WriteLine($"Crawl failed for job {response.JobId}.");
                return;
            }
        }

        // Default the output directory to the crawled site's host name.
        if (string.IsNullOrWhiteSpace(outputPath))
        {
            outputPath = new Uri(url).Host;
        }

        Directory.CreateDirectory(outputPath);

        var index = 0;
        foreach (var data in statusResponse.Data ?? [])
        {
            // Prefer a name derived from the page's source URL; fall back to a
            // sequential name when the metadata carries no URL.
            var name = string.IsNullOrWhiteSpace(data.Metadata?.SourceURL)
                ? $"output{++index}.md"
                : $"{ConvertUrlToFilename(data.Metadata.SourceURL)}.md";
            var subPath = Path.Combine(outputPath, name);

            var fileInfo = new FileInfo(subPath);
            await File.WriteAllTextAsync(fileInfo.FullName, data.Markdown).ConfigureAwait(false);
            Console.WriteLine($"Output file: {new Uri(fileInfo.FullName).AbsoluteUri}");
        }

        Console.WriteLine("Done.");
    }

    /// <summary>
    /// Converts a URL into a safe file name: strips a leading scheme and a
    /// leading "www.", then replaces characters that are invalid in file names
    /// with '_'.
    /// </summary>
    /// <param name="url">Source URL; must not be null.</param>
    /// <returns>A string usable as a file name on the current platform.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="url"/> is null.</exception>
    public static string ConvertUrlToFilename(string url)
    {
        ArgumentNullException.ThrowIfNull(url);

        // Strip the scheme only when it is an actual prefix; the original
        // Replace calls removed these substrings anywhere inside the URL.
        foreach (var prefix in new[] { "https://", "https:/", "http://", "http:/" })
        {
            if (url.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
            {
                url = url[prefix.Length..];
                break;
            }
        }

        // Likewise only drop "www." at the start, not mid-path.
        if (url.StartsWith("www.", StringComparison.OrdinalIgnoreCase))
        {
            url = url["www.".Length..];
        }

        // Replace invalid filename characters with '_'.
        foreach (char c in Path.GetInvalidFileNameChars())
        {
            url = url.Replace(c, '_');
        }

        return url;
    }
}
40 changes: 40 additions & 0 deletions src/libs/Firecrawl.Cli/Commands/MapCommand.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
using System.CommandLine;

namespace Firecrawl.Cli.Commands;

/// <summary>
/// CLI command that asks the Firecrawl API to map a website's URLs.
/// The handler currently resolves the API key, constructs the client, and
/// prints progress messages; the mapping call itself is not yet wired up.
/// </summary>
public class MapCommand : Command
{
    public MapCommand()
        : base(name: "map", description: "Attempts to output all website's urls in a few seconds.")
    {
        var urlArgument = new Argument<string>(
            name: "url",
            getDefaultValue: () => string.Empty,
            description: "Input url");
        var outputPathOption = new Option<string>(
            name: "outputPath",
            getDefaultValue: () => string.Empty,
            description: "Output path");

        AddArgument(urlArgument);
        AddOption(outputPathOption);

        this.SetHandler(HandleAsync, urlArgument, outputPathOption);
    }

    /// <summary>
    /// Handler for the map command.
    /// </summary>
    /// <param name="url">URL whose site should be mapped.</param>
    /// <param name="outputPath">Output location (currently unused by this stub).</param>
    private static async Task HandleAsync(string url, string outputPath)
    {
        Console.WriteLine("Initializing...");

        var apiKey = await Helpers.GetApiKey().ConfigureAwait(false);
        using var api = new FirecrawlApp(apiKey);

        Console.WriteLine($"Maps {url}...");

        Console.WriteLine("Done.");
    }
}
2 changes: 2 additions & 0 deletions src/libs/Firecrawl.Cli/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@
description: "CLI tool to use Firecrawl API");
rootCommand.AddCommand(new AuthCommand());
rootCommand.AddCommand(new ScrapeCommand());
rootCommand.AddCommand(new CrawlCommand());
rootCommand.AddCommand(new MapCommand());

return await rootCommand.InvokeAsync(args).ConfigureAwait(false);

0 comments on commit 5a62d9e

Please sign in to comment.