From 2f6db1842e93da400abc56dda2e5f3bbd9c69a77 Mon Sep 17 00:00:00 2001
From: HavenDV
Date: Mon, 9 Sep 2024 01:39:18 +0400
Subject: [PATCH] refactor: Added WaitJob helper.

---
 README.md                                   | 23 ++++++++++++++
 .../Firecrawl.Cli/Commands/CrawlCommand.cs  | 21 ++++---------
 src/libs/Firecrawl.Cli/Firecrawl.Cli.csproj |  2 +-
 src/libs/Firecrawl/CrawlClient.WaitJob.cs   | 30 +++++++++++++++++++
 src/tests/IntegrationTests/Tests.Crawl.cs   | 26 +++++-----------
 5 files changed, 67 insertions(+), 35 deletions(-)
 create mode 100644 src/libs/Firecrawl/CrawlClient.WaitJob.cs

diff --git a/README.md b/README.md
index 0dcecbb..e07df0b 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,31 @@
 using Firecrawl;
 
 using var api = new FirecrawlApp(apiKey);
 
+// Scrape
 var response = await api.Scraping.ScrapeAsync("https://docs.firecrawl.dev/features/scrape");
 string markdown = response.Data.Markdown;
+
+// Crawl
+var crawlResponse = await api.Crawling.CrawlUrlsAsync(
+    url: "https://docs.firecrawl.dev/",
+    crawlerOptions: new CrawlUrlsRequestCrawlerOptions
+    {
+        Limit = 3,
+    },
+    pageOptions: new CrawlUrlsRequestPageOptions
+    {
+        OnlyMainContent = true,
+    });
+
+var jobResponse = await api.Crawl.WaitJobAsync(
+    jobId: crawlResponse.JobId);
+
+foreach (var data in jobResponse.Data)
+{
+    Console.WriteLine($"URL: {data.Metadata.SourceURL}");
+    Console.WriteLine($"Markdown: {data.Markdown}");
+}
 ```
 
 ### CLI
@@ -28,6 +50,7 @@ string markdown = response.Data.Markdown;
 dotnet tool install -g Firecrawl.Cli
 firecrawl auth
 firecrawl scrape https://docs.firecrawl.dev/features/scrape // saves it to output.md
+firecrawl crawl https://docs.firecrawl.dev/features/scrape --limit 5 // saves all .md files to docs.firecrawl.dev folder
 ```
 
 ## Support
diff --git a/src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs b/src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
index aee92cb..3623522 100644
--- a/src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
+++ b/src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
@@ -1,5 +1,4 @@
 using System.CommandLine;
-using System.Diagnostics.CodeAnalysis;
 
 namespace Firecrawl.Cli.Commands;
 
@@ -21,7 +20,7 @@ public CrawlCommand() : base(name: "crawl", description: "Crawl a url and saves
 
         var limit = new Option<int>(
             name: "limit",
-            getDefaultValue: () => 10,
+            getDefaultValue: () => 5,
             description: "Limit of pages to crawl");
         AddOption(limit);
 
@@ -67,18 +66,8 @@ private static async Task HandleAsync(
 
         Console.WriteLine($"JobId: {response.JobId}");
 
-        GetCrawlStatusResponse? statusResponse = null;
-        while (true)
-        {
-            await Task.Delay(TimeSpan.FromSeconds(5)).ConfigureAwait(false);
-
-            statusResponse = await api.Crawl.GetCrawlStatusAsync(
-                jobId: response.JobId!).ConfigureAwait(false);
-            if (statusResponse.Status == "completed")
-            {
-                break;
-            }
-        }
+        var jobResponse = await api.Crawl.WaitJobAsync(
+            jobId: response.JobId!).ConfigureAwait(false);
 
         if (string.IsNullOrWhiteSpace(outputPath))
         {
@@ -88,7 +77,7 @@ private static async Task HandleAsync(
         Directory.CreateDirectory(outputPath);
 
         var index = 0;
-        foreach (var data in statusResponse.Data ?? [])
+        foreach (var data in jobResponse.Data ?? [])
         {
             var name = string.IsNullOrWhiteSpace(data.Metadata?.SourceURL) ?
                $"output{++index}.md"
@@ -115,7 +104,7 @@ public static string ConvertUrlToFilename(string url)
             .Replace("www.", string.Empty, StringComparison.OrdinalIgnoreCase);
 
         // Replace invalid filename characters with '_'
-        foreach (char c in Path.GetInvalidFileNameChars())
+        foreach (var c in Path.GetInvalidFileNameChars())
         {
             url = url.Replace(c, '_');
         }
diff --git a/src/libs/Firecrawl.Cli/Firecrawl.Cli.csproj b/src/libs/Firecrawl.Cli/Firecrawl.Cli.csproj
index f4cb780..d71c235 100644
--- a/src/libs/Firecrawl.Cli/Firecrawl.Cli.csproj
+++ b/src/libs/Firecrawl.Cli/Firecrawl.Cli.csproj
@@ -6,7 +6,7 @@
     enable
     enable
     false
-    <NoWarn>$(NoWarn);CA1724;CA1303</NoWarn>
+    <NoWarn>$(NoWarn);CA1724;CA1303;CA1054;CA1055</NoWarn>
diff --git a/src/libs/Firecrawl/CrawlClient.WaitJob.cs b/src/libs/Firecrawl/CrawlClient.WaitJob.cs
new file mode 100644
index 0000000..283675b
--- /dev/null
+++ b/src/libs/Firecrawl/CrawlClient.WaitJob.cs
@@ -0,0 +1,30 @@
+namespace Firecrawl;
+
+public partial class CrawlClient
+{
+    /// <summary>
+    /// Waits for a crawl job to complete or fail.
+    /// </summary>
+    /// <param name="jobId"></param>
+    /// <param name="cancellationToken">The token to cancel the operation with</param>
+    /// <returns></returns>
+    public async Task<GetCrawlStatusResponse> WaitJobAsync(
+        string jobId,
+        CancellationToken cancellationToken = default)
+    {
+        while (true)
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            await Task.Delay(TimeSpan.FromSeconds(1), cancellationToken).ConfigureAwait(false);
+
+            var statusResponse = await GetCrawlStatusAsync(
+                jobId: jobId,
+                cancellationToken: cancellationToken).ConfigureAwait(false);
+            if (statusResponse.Status is "completed" or "failed")
+            {
+                return statusResponse;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/tests/IntegrationTests/Tests.Crawl.cs b/src/tests/IntegrationTests/Tests.Crawl.cs
index 84d4b30..b4290a4 100644
--- a/src/tests/IntegrationTests/Tests.Crawl.cs
+++ b/src/tests/IntegrationTests/Tests.Crawl.cs
@@ -26,22 +26,12 @@ public async Task Crawl()
 
         response.JobId.Should().NotBeNullOrEmpty();
 
-        GetCrawlStatusResponse? statusResponse = null;
-        while (!cancellationToken.IsCancellationRequested)
-        {
-            await Task.Delay(TimeSpan.FromSeconds(5), cancellationToken);
-
-            statusResponse = await api.Crawl.GetCrawlStatusAsync(
-                jobId: response.JobId!,
-                cancellationToken: cancellationToken);
-            if (statusResponse.Status == "completed")
-            {
-                break;
-            }
-        }
+        var jobResponse = await api.Crawl.WaitJobAsync(
+            jobId: response.JobId!,
+            cancellationToken: cancellationToken);
 
         var index = 0;
-        foreach (var data in statusResponse?.Data ?? [])
+        foreach (var data in jobResponse.Data ?? [])
         {
             data.Html.Should().NotBeNullOrEmpty();
             data.Markdown.Should().NotBeNullOrEmpty();
@@ -51,9 +41,9 @@ public async Task Crawl()
             Console.WriteLine($"Output file: {new Uri(fileInfo.FullName).AbsoluteUri}");
         }
 
-        statusResponse.Should().NotBeNull();
-        statusResponse!.Status.Should().Be("completed");
-        statusResponse.Total.Should().Be(3);
-        statusResponse.Data.Should().NotBeNullOrEmpty();
+        jobResponse.Should().NotBeNull();
+        jobResponse.Status.Should().Be("completed");
+        jobResponse.Total.Should().Be(3);
+        jobResponse.Data.Should().NotBeNullOrEmpty();
     }
 }
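
For reference, a minimal caller-side sketch of how the new `WaitJobAsync` helper composes with a timeout, mirroring the README example above. The `FirecrawlApp`, `Crawling.CrawlUrlsAsync`, and `Crawl.WaitJobAsync` members are the ones shown in this patch; the environment-variable lookup and the two-minute timeout are illustrative assumptions, not part of the change.

```csharp
using Firecrawl;

// Sketch only: bound WaitJobAsync's one-second polling with a caller-side timeout.
// The API key lookup and the two-minute limit below are placeholder choices.
var apiKey = Environment.GetEnvironmentVariable("FIRECRAWL_API_KEY") ?? string.Empty;
using var api = new FirecrawlApp(apiKey);
using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2));

// Start a small crawl (same call shape as the README snippet in this patch).
var crawlResponse = await api.Crawling.CrawlUrlsAsync(
    url: "https://docs.firecrawl.dev/",
    crawlerOptions: new CrawlUrlsRequestCrawlerOptions
    {
        Limit = 3,
    },
    pageOptions: new CrawlUrlsRequestPageOptions
    {
        OnlyMainContent = true,
    });

// WaitJobAsync polls GetCrawlStatusAsync once per second and returns the final
// status once the job reports "completed" or "failed"; cancelling the token
// (here after two minutes) aborts the wait instead of polling forever.
var jobResponse = await api.Crawl.WaitJobAsync(
    jobId: crawlResponse.JobId!,
    cancellationToken: cts.Token);

Console.WriteLine($"Crawl finished with status '{jobResponse.Status}'.");
```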