From 5a62d9e40cfa2d5e8801daf44ee84dbe56deb8de Mon Sep 17 00:00:00 2001
From: HavenDV
Date: Mon, 9 Sep 2024 01:24:13 +0400
Subject: [PATCH] cli: Added crawl command.

---
 .../Firecrawl.Cli/Commands/CrawlCommand.cs    | 125 ++++++++++++++++++
 src/libs/Firecrawl.Cli/Commands/MapCommand.cs |  40 ++++++
 src/libs/Firecrawl.Cli/Program.cs             |   2 +
 3 files changed, 167 insertions(+)
 create mode 100644 src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
 create mode 100644 src/libs/Firecrawl.Cli/Commands/MapCommand.cs

diff --git a/src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs b/src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
new file mode 100644
index 0000000..aee92cb
--- /dev/null
+++ b/src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
@@ -0,0 +1,125 @@
+using System.CommandLine;
+using System.Diagnostics.CodeAnalysis;
+
+namespace Firecrawl.Cli.Commands;
+
+public class CrawlCommand : Command
+{
+    public CrawlCommand() : base(name: "crawl", description: "Crawls a URL and saves all pages as markdown")
+    {
+        var url = new Argument<string>(
+            name: "url",
+            getDefaultValue: () => string.Empty,
+            description: "Input URL");
+        AddArgument(url);
+
+        var outputPath = new Option<string>(
+            name: "outputPath",
+            getDefaultValue: () => string.Empty,
+            description: "Output path");
+        AddOption(outputPath);
+
+        var limit = new Option<int>(
+            name: "limit",
+            getDefaultValue: () => 10,
+            description: "Limit of pages to crawl");
+        AddOption(limit);
+
+        var maxDepth = new Option<int?>(
+            name: "maxDepth",
+            getDefaultValue: () => null,
+            description: "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.");
+        AddOption(maxDepth);
+
+        this.SetHandler(
+            HandleAsync,
+            url,
+            outputPath,
+            limit,
+            maxDepth);
+    }
+
+    private static async Task HandleAsync(
+        string url,
+        string outputPath,
+        int limit,
+        int? maxDepth)
+    {
+        Console.WriteLine("Initializing...");
+
+        var apiKey = await Helpers.GetApiKey().ConfigureAwait(false);
+        using var api = new FirecrawlApp(apiKey);
+
+        Console.WriteLine($"Crawling {url}...");
+
+        var response = await api.Crawling.CrawlUrlsAsync(
+            url: url,
+            crawlerOptions: new CrawlUrlsRequestCrawlerOptions
+            {
+                MaxDepth = maxDepth,
+                Limit = limit,
+            },
+            pageOptions: new CrawlUrlsRequestPageOptions
+            {
+                OnlyMainContent = true,
+                WaitFor = 1000,
+            }).ConfigureAwait(false);
+
+        Console.WriteLine($"JobId: {response.JobId}");
+
+        GetCrawlStatusResponse? statusResponse = null;
+        while (true)
+        {
+            await Task.Delay(TimeSpan.FromSeconds(5)).ConfigureAwait(false);
+
+            statusResponse = await api.Crawl.GetCrawlStatusAsync(
+                jobId: response.JobId!).ConfigureAwait(false);
+            if (statusResponse.Status == "completed")
+            {
+                break;
+            }
+        }
+
+        if (string.IsNullOrWhiteSpace(outputPath))
+        {
+            outputPath = new Uri(url).Host;
+        }
+
+        Directory.CreateDirectory(outputPath);
+
+        var index = 0;
+        foreach (var data in statusResponse.Data ?? [])
+        {
+            var name = string.IsNullOrWhiteSpace(data.Metadata?.SourceURL)
+                ? $"output{++index}.md"
+                : $"{ConvertUrlToFilename(data.Metadata.SourceURL)}.md";
+            var subPath = Path.Combine(outputPath, name);
+
+            var fileInfo = new FileInfo(subPath);
+            await File.WriteAllTextAsync(fileInfo.FullName, data.Markdown).ConfigureAwait(false);
+            Console.WriteLine($"Output file: {new Uri(fileInfo.FullName).AbsoluteUri}");
+        }
+
+        Console.WriteLine("Done.");
+    }
+
+    public static string ConvertUrlToFilename(string url)
+    {
+        url = url ?? throw new ArgumentNullException(nameof(url));
+
+        url = url
+            .Replace("https://", string.Empty, StringComparison.OrdinalIgnoreCase)
+            .Replace("https:/", string.Empty, StringComparison.OrdinalIgnoreCase)
+            .Replace("http://", string.Empty, StringComparison.OrdinalIgnoreCase)
+            .Replace("http:/", string.Empty, StringComparison.OrdinalIgnoreCase)
+            .Replace("www.", string.Empty, StringComparison.OrdinalIgnoreCase);
+
+        // Replace invalid filename characters with '_'
+        foreach (char c in Path.GetInvalidFileNameChars())
+        {
+            url = url.Replace(c, '_');
+        }
+
+        return url;
+    }
+}
\ No newline at end of file
diff --git a/src/libs/Firecrawl.Cli/Commands/MapCommand.cs b/src/libs/Firecrawl.Cli/Commands/MapCommand.cs
new file mode 100644
index 0000000..1e487be
--- /dev/null
+++ b/src/libs/Firecrawl.Cli/Commands/MapCommand.cs
@@ -0,0 +1,40 @@
+using System.CommandLine;
+
+namespace Firecrawl.Cli.Commands;
+
+public class MapCommand : Command
+{
+    public MapCommand() : base(name: "map", description: "Attempts to output all of a website's URLs in a few seconds.")
+    {
+        var url = new Argument<string>(
+            name: "url",
+            getDefaultValue: () => string.Empty,
+            description: "Input URL");
+        AddArgument(url);
+
+        var outputPath = new Option<string>(
+            name: "outputPath",
+            getDefaultValue: () => string.Empty,
+            description: "Output path");
+        AddOption(outputPath);
+
+        this.SetHandler(
+            HandleAsync,
+            url,
+            outputPath);
+    }
+
+    private static async Task HandleAsync(
+        string url,
+        string outputPath)
+    {
+        Console.WriteLine("Initializing...");
+
+        var apiKey = await Helpers.GetApiKey().ConfigureAwait(false);
+        using var api = new FirecrawlApp(apiKey);
+
+        Console.WriteLine($"Mapping {url}...");
+
+        Console.WriteLine("Done.");
+    }
+}
\ No newline at end of file
diff --git a/src/libs/Firecrawl.Cli/Program.cs b/src/libs/Firecrawl.Cli/Program.cs
index c293437..f0d4203 100644
--- a/src/libs/Firecrawl.Cli/Program.cs
+++ b/src/libs/Firecrawl.Cli/Program.cs
@@ -5,5 +5,7 @@
     description: "CLI tool to use Firecrawl API");
 rootCommand.AddCommand(new AuthCommand());
 rootCommand.AddCommand(new ScrapeCommand());
+rootCommand.AddCommand(new CrawlCommand());
+rootCommand.AddCommand(new MapCommand());
 
 return await rootCommand.InvokeAsync(args).ConfigureAwait(false);
\ No newline at end of file