From e68a34e6e07dfa1d1fa33bd4be42c501aa72b27f Mon Sep 17 00:00:00 2001 From: d4n Date: Thu, 8 Aug 2024 01:00:24 -0500 Subject: [PATCH] Bring back Bing OCR --- src/Apis/Bing/BingVisualSearch.cs | 45 ++++++++++++- src/Apis/Bing/IBingVisualSearch.cs | 8 +++ src/Entities/OcrEngine.cs | 5 ++ src/Modules/OcrModule.cs | 63 +++++++++++++++---- .../Apis/BingVisualSearchTests.cs | 22 +++++++ tests/Fergun.Tests/Modules/OcrModuleTests.cs | 34 +++++++++- tests/Fergun.Tests/Utils.cs | 4 ++ 7 files changed, 166 insertions(+), 15 deletions(-) diff --git a/src/Apis/Bing/BingVisualSearch.cs b/src/Apis/Bing/BingVisualSearch.cs index d7b0f43..8b5a6db 100644 --- a/src/Apis/Bing/BingVisualSearch.cs +++ b/src/Apis/Bing/BingVisualSearch.cs @@ -55,6 +55,49 @@ public BingVisualSearch(HttpClient httpClient) } } + /// + public async Task OcrAsync(string url, CancellationToken cancellationToken = default) + { + ObjectDisposedException.ThrowIf(_disposed, this); + cancellationToken.ThrowIfCancellationRequested(); + + using var request = BuildRequest(url, "OCR"); + using var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false); + + response.EnsureSuccessStatusCode(); + + await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false); + using var document = await JsonDocument.ParseAsync(stream, default, cancellationToken).ConfigureAwait(false); + + string? imageCategory = document + .RootElement + .GetPropertyOrDefault("imageQualityHints") + .FirstOrDefault() + .GetPropertyOrDefault("category") + .GetStringOrDefault(); + + if (imageCategory is not null && _imageCategories.TryGetValue(imageCategory, out string? message)) + { + throw new BingException(message, imageCategory); + } + + var textRegions = document + .RootElement + .GetProperty("tags"u8) + .FirstOrDefault(x => x.TryGetProperty("displayName"u8, out var displayName) && displayName.ValueEquals("##TextRecognition"u8)) + .GetPropertyOrDefault("actions") + .FirstOrDefault() + .GetPropertyOrDefault("data") + .GetPropertyOrDefault("regions") + .EnumerateArrayOrEmpty() + .Select(x => string.Join('\n', + x.GetPropertyOrDefault("lines") + .EnumerateArrayOrEmpty() + .Select(y => y.GetPropertyOrDefault("text").GetStringOrDefault()))); + + return string.Join("\n\n", textRegions); + } + /// public async Task> ReverseImageSearchAsync(string url, BingSafeSearchLevel safeSearch = BingSafeSearchLevel.Moderate, string? language = null, @@ -88,7 +131,7 @@ public async Task> ReverseImageSear .EnumerateArray() .Select(x => x.GetPropertyOrDefault("actions")) .SelectMany(x => x.EnumerateArrayOrEmpty()) - .FirstOrDefault(x => x.TryGetProperty("actionType", out var actionTye) && actionTye.ValueEquals("VisualSearch"u8)) + .FirstOrDefault(x => x.TryGetProperty("actionType"u8, out var actionType) && actionType.ValueEquals("VisualSearch"u8)) .GetPropertyOrDefault("data") .GetPropertyOrDefault("value") .EnumerateArrayOrEmpty() diff --git a/src/Apis/Bing/IBingVisualSearch.cs b/src/Apis/Bing/IBingVisualSearch.cs index 01dae9b..a369f55 100644 --- a/src/Apis/Bing/IBingVisualSearch.cs +++ b/src/Apis/Bing/IBingVisualSearch.cs @@ -9,6 +9,14 @@ namespace Fergun.Apis.Bing; /// public interface IBingVisualSearch { + /// + /// Performs OCR to the specified image URL. + /// + /// The URL of an image. + /// The cancellation token. + /// A representing the asynchronous OCR operation. The result contains the recognized text. + Task OcrAsync(string url, CancellationToken cancellationToken = default); + /// /// Performs reverse image search to the specified image URL. /// diff --git a/src/Entities/OcrEngine.cs b/src/Entities/OcrEngine.cs index 79a1d1c..73c6762 100644 --- a/src/Entities/OcrEngine.cs +++ b/src/Entities/OcrEngine.cs @@ -12,6 +12,11 @@ public enum OcrEngine /// Google, + /// + /// Bing. + /// + Bing, + /// /// Yandex. /// diff --git a/src/Modules/OcrModule.cs b/src/Modules/OcrModule.cs index 5701f5d..164e313 100644 --- a/src/Modules/OcrModule.cs +++ b/src/Modules/OcrModule.cs @@ -5,9 +5,12 @@ using System.Threading.Tasks; using Discord; using Discord.Interactions; +using Fergun.Apis.Bing; using Fergun.Apis.Google; using Fergun.Apis.Yandex; using Fergun.Extensions; +using Fergun.Interactive; +using Fergun.Interactive.Selection; using Fergun.Preconditions; using Humanizer; using Microsoft.Extensions.Logging; @@ -23,21 +26,40 @@ public class OcrModule : InteractionModuleBase private readonly ILogger _logger; private readonly IFergunLocalizer _localizer; private readonly SharedModule _shared; + private readonly InteractiveService _interactive; private readonly IGoogleLensClient _googleLens; + private readonly IBingVisualSearch _bingVisualSearch; private readonly IYandexImageSearch _yandexImageSearch; - public OcrModule(ILogger logger, IFergunLocalizer localizer, SharedModule shared, - IGoogleLensClient googleLens, IYandexImageSearch yandexImageSearch) + public OcrModule(ILogger logger, IFergunLocalizer localizer, SharedModule shared, InteractiveService interactive, + IGoogleLensClient googleLens, IBingVisualSearch bingVisualSearch, IYandexImageSearch yandexImageSearch) { _logger = logger; _localizer = localizer; _shared = shared; + _interactive = interactive; _googleLens = googleLens; + _bingVisualSearch = bingVisualSearch; _yandexImageSearch = yandexImageSearch; } public override void BeforeExecute(ICommandInfo command) => _localizer.CurrentCulture = CultureInfo.GetCultureInfo(Context.Interaction.GetLanguageCode()); + [SlashCommand("bing", "Performs OCR to an image using Bing Visual Search.")] + public async Task BingAsync([Summary(description: "The URL of an image.")] string? url = null, + [Summary(description: "An image file.")] IAttachment? file = null) + => await OcrAsync(OcrEngine.Bing, file?.Url ?? url, Context.Interaction); + + [SlashCommand("google", "Performs OCR to an image using Google Lens.")] + public async Task GoogleAsync([Summary(description: "The URL of an image.")] string? url = null, + [Summary(description: "An image file.")] IAttachment? file = null) + => await OcrAsync(OcrEngine.Google, file?.Url ?? url, Context.Interaction); + + [SlashCommand("yandex", "Performs OCR to an image using Yandex.")] + public async Task YandexAsync([Summary(description: "The URL of an image.")] string? url = null, + [Summary(description: "An image file.")] IAttachment? file = null) + => await OcrAsync(OcrEngine.Yandex, file?.Url ?? url, Context.Interaction); + [MessageCommand("OCR")] public async Task OcrAsync(IMessage message) { @@ -51,18 +73,28 @@ public async Task OcrAsync(IMessage message) return FergunResult.FromError(_localizer["NoImageUrlInMessage"], true); } - return await YandexAsync(url); - } + var page = new PageBuilder() + .WithTitle(_localizer["SelectOCREngine"]) + .WithColor(Color.Orange); - [SlashCommand("google", "Performs OCR to an image using Google Lens.")] - public async Task GoogleAsync([Summary(description: "The URL of an image.")] string? url = null, - [Summary(description: "An image file.")] IAttachment? file = null) - => await OcrAsync(OcrEngine.Google, file?.Url ?? url, Context.Interaction); + var selection = new SelectionBuilder() + .AddUser(Context.User) + .WithOptions(Enum.GetValues()) + .WithSelectionPage(page) + .Build(); - [SlashCommand("yandex", "Performs OCR to an image using Yandex.")] - public async Task YandexAsync([Summary(description: "The URL of an image.")] string? url = null, - [Summary(description: "An image file.")] IAttachment? file = null) - => await OcrAsync(OcrEngine.Yandex, file?.Url ?? url, Context.Interaction); + var result = await _interactive.SendSelectionAsync(selection, Context.Interaction, TimeSpan.FromMinutes(1), ephemeral: true); + + if (result.IsSuccess) + { + return await OcrAsync(result.Value, url, result.StopInteraction!, Context.Interaction, true); + } + + // Attempt to disable the components + _ = Context.Interaction.ModifyOriginalResponseAsync(x => x.Components = selection.GetOrAddComponents(true).Build()); + + return FergunResult.FromSilentError(); + } public async Task OcrAsync(OcrEngine ocrEngine, string? url, IDiscordInteraction interaction, IDiscordInteraction? originalInteraction = null, bool ephemeral = false) @@ -114,6 +146,7 @@ public async Task OcrAsync(OcrEngine ocrEngine, string? url, IDis text = ocrEngine switch { OcrEngine.Google => await _googleLens.OcrAsync(url), + OcrEngine.Bing => await _bingVisualSearch.OcrAsync(url), OcrEngine.Yandex => await _yandexImageSearch.OcrAsync(url), _ => throw new ArgumentException(_localizer["InvalidOCREngine"], nameof(ocrEngine)) }; @@ -123,6 +156,11 @@ public async Task OcrAsync(OcrEngine ocrEngine, string? url, IDis _logger.LogWarning(e, "Failed to perform Google Lens OCR to url {Url}", url); return FergunResult.FromError(_localizer["GoogleLensOCRError"], ephemeral, interaction); } + catch (BingException e) + { + _logger.LogWarning(e, "Failed to perform Bing OCR to url {Url}", url); + return FergunResult.FromError(e.ImageCategory is null ? e.Message : _localizer[$"Bing{e.ImageCategory}"], ephemeral, interaction); + } catch (YandexException e) { _logger.LogWarning(e, "Failed to perform Yandex OCR to url {Url}", url); @@ -142,6 +180,7 @@ public async Task OcrAsync(OcrEngine ocrEngine, string? url, IDis (var name, string iconUrl) = ocrEngine switch { OcrEngine.Google => (_localizer["GoogleLensOCR"], Constants.GoogleLensLogoUrl), + OcrEngine.Bing => (_localizer["BingVisualSearch"], Constants.BingIconUrl), OcrEngine.Yandex => (_localizer["YandexOCR"], Constants.YandexIconUrl), _ => throw new ArgumentException(_localizer["InvalidOCREngine"], nameof(ocrEngine)) }; diff --git a/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs b/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs index 076309d..c5542c8 100644 --- a/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs +++ b/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs @@ -12,6 +12,27 @@ public class BingVisualSearchTests { private readonly IBingVisualSearch _bingVisualSearch = new BingVisualSearch(); + [Theory] + [InlineData("https://upload.wikimedia.org/wikipedia/commons/0/01/Windows_fonts_most_used.jpg")] + [InlineData("https://upload.wikimedia.org/wikipedia/commons/5/57/Lorem_Ipsum_Helvetica.png")] + public async Task OcrAsync_Returns_Text(string url) + { + string text = await _bingVisualSearch.OcrAsync(url); + + Assert.NotNull(text); + Assert.NotEmpty(text); + } + + [Theory] + [InlineData("https://upload.wikimedia.org/wikipedia/commons/2/29/Suru_Bog_10000px.jpg")] // 10000px image + [InlineData("https://simpl.info/bigimage/bigImage.jpg")] // 91 MB file + public async Task OcrAsync_Throws_BingException_If_Image_Is_Invalid(string url) + { + var task = _bingVisualSearch.OcrAsync(url); + + await Assert.ThrowsAsync(() => task); + } + [Theory] [InlineData("https://r.bing.com/rp/ecXQMr9jqKMeHE3ADTBrSN_WNyA.jpg", BingSafeSearchLevel.Off, null)] [InlineData("https://r.bing.com/rp/vXuQ5-3dSnE08_cK26jVzOTxREk.jpg", BingSafeSearchLevel.Moderate, "en")] @@ -52,6 +73,7 @@ public async Task Disposed_BingVisualSearch_Usage_Throws_ObjectDisposedException (_bingVisualSearch as IDisposable)?.Dispose(); (_bingVisualSearch as IDisposable)?.Dispose(); + await Assert.ThrowsAsync(() => _bingVisualSearch.OcrAsync(It.IsAny())); await Assert.ThrowsAsync(() => _bingVisualSearch.ReverseImageSearchAsync(It.IsAny(), It.IsAny(), It.IsAny())); } diff --git a/tests/Fergun.Tests/Modules/OcrModuleTests.cs b/tests/Fergun.Tests/Modules/OcrModuleTests.cs index 8f452ce..52564bc 100644 --- a/tests/Fergun.Tests/Modules/OcrModuleTests.cs +++ b/tests/Fergun.Tests/Modules/OcrModuleTests.cs @@ -3,8 +3,11 @@ using System.Threading.Tasks; using Discord; using Discord.Interactions; +using Discord.WebSocket; +using Fergun.Apis.Bing; using Fergun.Apis.Google; using Fergun.Apis.Yandex; +using Fergun.Interactive; using Fergun.Modules; using GTranslate.Translators; using Microsoft.Extensions.Logging; @@ -18,8 +21,11 @@ public class OcrModuleTests private readonly Mock _contextMock = new(); private readonly Mock _interactionMock = new(); private readonly Mock _googleLensMock = new(); + private readonly Mock _bingVisualSearchMock = new(); private readonly Mock _yandexImageSearchMock = new(); private readonly Mock> _loggerMock = new(); + private readonly DiscordSocketClient _client = new(); + private readonly InteractiveConfig _interactiveConfig = new() { DeferStopSelectionInteractions = false }; private readonly IFergunLocalizer _ocrLocalizer = Utils.CreateMockedLocalizer(); private readonly Mock _moduleMock; private const string TextImageUrl = "https://example.com/image.png"; @@ -31,6 +37,9 @@ public OcrModuleTests() _googleLensMock.Setup(x => x.OcrAsync(It.Is(s => s == TextImageUrl), It.IsAny())).ReturnsAsync("test"); _googleLensMock.Setup(x => x.OcrAsync(It.Is(s => s == EmptyImageUrl), It.IsAny())).ReturnsAsync(string.Empty); _googleLensMock.Setup(x => x.OcrAsync(It.Is(s => s == InvalidImageUrl), It.IsAny())).ThrowsAsync(new GoogleLensException("Invalid image.")); + _bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is(s => s == TextImageUrl), It.IsAny())).ReturnsAsync("test"); + _bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is(s => s == EmptyImageUrl), It.IsAny())).ReturnsAsync(string.Empty); + _bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is(s => s == InvalidImageUrl), It.IsAny())).ThrowsAsync(new BingException("Invalid image.")); _yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is(s => s == TextImageUrl), It.IsAny())).ReturnsAsync("test"); _yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is(s => s == EmptyImageUrl), It.IsAny())).ReturnsAsync(string.Empty); _yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is(s => s == InvalidImageUrl), It.IsAny())).ThrowsAsync(new YandexException("Invalid image.")); @@ -39,7 +48,9 @@ public OcrModuleTests() var sharedLocalizer = Utils.CreateMockedLocalizer(); var shared = new SharedModule(sharedLogger, sharedLocalizer, Mock.Of(), new GoogleTranslator2()); - _moduleMock = new Mock(() => new OcrModule(_loggerMock.Object, _ocrLocalizer, shared, _googleLensMock.Object, _yandexImageSearchMock.Object)) { CallBase = true }; + var interactive = new InteractiveService(_client, _interactiveConfig); + _moduleMock = new Mock(() => new OcrModule(_loggerMock.Object, _ocrLocalizer, shared, interactive, + _googleLensMock.Object, _bingVisualSearchMock.Object, _yandexImageSearchMock.Object)) { CallBase = true }; _contextMock.SetupGet(x => x.Interaction).Returns(_interactionMock.Object); ((IInteractionModuleBase)_moduleMock.Object).SetContext(_contextMock.Object); } @@ -51,7 +62,7 @@ public void BeforeExecute_Sets_Language() _moduleMock.Object.BeforeExecute(It.IsAny()); Assert.Equal("en", _ocrLocalizer.CurrentCulture.TwoLetterISOLanguageName); } - + [Theory] [InlineData(TextImageUrl, true)] [InlineData(EmptyImageUrl, false)] @@ -71,6 +82,25 @@ public async Task GoogleAsync_Uses_GoogleLens(string url, bool success) It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny()), success ? Times.Once : Times.Never); } + [Theory] + [InlineData(TextImageUrl, true)] + [InlineData(EmptyImageUrl, false)] + public async Task BingAsync_Uses_BingVisualSearch(string url, bool success) + { + var module = _moduleMock.Object; + const bool isEphemeral = false; + + var result = await module.BingAsync(url); + Assert.Equal(success, result.IsSuccess); + + _interactionMock.Verify(x => x.DeferAsync(It.Is(b => b == isEphemeral), It.IsAny()), Times.Once); + + _bingVisualSearchMock.Verify(x => x.OcrAsync(It.Is(s => s == url), It.IsAny()), Times.Once); + + _interactionMock.Verify(x => x.FollowupAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.Is(b => b == isEphemeral), + It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny()), success ? Times.Once : Times.Never); + } + [Theory] [InlineData(TextImageUrl, true)] [InlineData(EmptyImageUrl, false)] diff --git a/tests/Fergun.Tests/Utils.cs b/tests/Fergun.Tests/Utils.cs index df1fc53..45c4e74 100644 --- a/tests/Fergun.Tests/Utils.cs +++ b/tests/Fergun.Tests/Utils.cs @@ -128,6 +128,10 @@ public static IBingVisualSearch CreateMockedBingVisualSearchApi(Faker? faker = n faker ??= new Faker(); var bingMock = new Mock(); + bingMock.Setup(x => x.OcrAsync(It.Is(s => s == string.Empty), It.IsAny())).ReturnsAsync(string.Empty); + bingMock.Setup(x => x.OcrAsync(It.Is(s => !string.IsNullOrEmpty(s)), It.IsAny())).ReturnsAsync(faker.Lorem.Sentence()); + bingMock.Setup(x => x.OcrAsync(It.Is(s => s == "https://example.com/error"), It.IsAny())).ThrowsAsync(new BingException("Error message.")); + bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => s == string.Empty), It.IsAny(), It.IsAny(), It.IsAny())).ReturnsAsync(Array.Empty); bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => !string.IsNullOrEmpty(s)), It.IsAny(), It.IsAny(), It.IsAny())).ReturnsAsync(() => faker.Make(50, () => CreateMockedBingReverseImageSearchResult(faker)).AsReadOnly()); bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => s == "https://example.com/error"), It.IsAny(), It.IsAny(), It.IsAny())).ThrowsAsync(new BingException("Error message."));