diff --git a/src/Apis/Bing/BingVisualSearch.cs b/src/Apis/Bing/BingVisualSearch.cs index d7b0f43..8b5a6db 100644 --- a/src/Apis/Bing/BingVisualSearch.cs +++ b/src/Apis/Bing/BingVisualSearch.cs @@ -55,6 +55,49 @@ public BingVisualSearch(HttpClient httpClient) } } + /// <inheritdoc/> + public async Task<string> OcrAsync(string url, CancellationToken cancellationToken = default) + { + ObjectDisposedException.ThrowIf(_disposed, this); + cancellationToken.ThrowIfCancellationRequested(); + + using var request = BuildRequest(url, "OCR"); + using var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false); + + response.EnsureSuccessStatusCode(); + + await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false); + using var document = await JsonDocument.ParseAsync(stream, default, cancellationToken).ConfigureAwait(false); + + string? imageCategory = document + .RootElement + .GetPropertyOrDefault("imageQualityHints") + .FirstOrDefault() + .GetPropertyOrDefault("category") + .GetStringOrDefault(); + + if (imageCategory is not null && _imageCategories.TryGetValue(imageCategory, out string? message)) + { + throw new BingException(message, imageCategory); + } + + var textRegions = document + .RootElement + .GetProperty("tags"u8) + .FirstOrDefault(x => x.TryGetProperty("displayName"u8, out var displayName) && displayName.ValueEquals("##TextRecognition"u8)) + .GetPropertyOrDefault("actions") + .FirstOrDefault() + .GetPropertyOrDefault("data") + .GetPropertyOrDefault("regions") + .EnumerateArrayOrEmpty() + .Select(x => string.Join('\n', + x.GetPropertyOrDefault("lines") + .EnumerateArrayOrEmpty() + .Select(y => y.GetPropertyOrDefault("text").GetStringOrDefault()))); + + return string.Join("\n\n", textRegions); + } + /// <inheritdoc/> public async Task<IReadOnlyList<IBingReverseImageSearchResult>> ReverseImageSearchAsync(string url, BingSafeSearchLevel safeSearch = BingSafeSearchLevel.Moderate, string? language = null, @@ -88,7 +131,7 @@ public async Task<IReadOnlyList<IBingReverseImageSearchResult>> ReverseImageSear .EnumerateArray() .Select(x => x.GetPropertyOrDefault("actions")) .SelectMany(x => x.EnumerateArrayOrEmpty()) - .FirstOrDefault(x => x.TryGetProperty("actionType", out var actionTye) && actionTye.ValueEquals("VisualSearch"u8)) + .FirstOrDefault(x => x.TryGetProperty("actionType"u8, out var actionType) && actionType.ValueEquals("VisualSearch"u8)) .GetPropertyOrDefault("data") .GetPropertyOrDefault("value") .EnumerateArrayOrEmpty() diff --git a/src/Apis/Bing/IBingVisualSearch.cs b/src/Apis/Bing/IBingVisualSearch.cs index 01dae9b..a369f55 100644 --- a/src/Apis/Bing/IBingVisualSearch.cs +++ b/src/Apis/Bing/IBingVisualSearch.cs @@ -9,6 +9,14 @@ namespace Fergun.Apis.Bing; /// </summary> public interface IBingVisualSearch { + /// <summary> + /// Performs OCR to the specified image URL. + /// </summary> + /// <param name="url">The URL of an image.</param> + /// <param name="cancellationToken">The cancellation token.</param> + /// <returns>A <see cref="Task"/> representing the asynchronous OCR operation. The result contains the recognized text.</returns> + Task<string> OcrAsync(string url, CancellationToken cancellationToken = default); + /// <summary> /// Performs reverse image search to the specified image URL. /// </summary> diff --git a/src/Entities/OcrEngine.cs b/src/Entities/OcrEngine.cs index 79a1d1c..73c6762 100644 --- a/src/Entities/OcrEngine.cs +++ b/src/Entities/OcrEngine.cs @@ -12,6 +12,11 @@ public enum OcrEngine /// </summary> Google, + /// <summary> + /// Bing. + /// </summary> + Bing, + /// <summary> /// Yandex. /// </summary> diff --git a/src/Modules/OcrModule.cs b/src/Modules/OcrModule.cs index 5701f5d..164e313 100644 --- a/src/Modules/OcrModule.cs +++ b/src/Modules/OcrModule.cs @@ -5,9 +5,12 @@ using System.Threading.Tasks; using Discord; using Discord.Interactions; +using Fergun.Apis.Bing; using Fergun.Apis.Google; using Fergun.Apis.Yandex; using Fergun.Extensions; +using Fergun.Interactive; +using Fergun.Interactive.Selection; using Fergun.Preconditions; using Humanizer; using Microsoft.Extensions.Logging; @@ -23,21 +26,40 @@ public class OcrModule : InteractionModuleBase private readonly ILogger<OcrModule> _logger; private readonly IFergunLocalizer<OcrModule> _localizer; private readonly SharedModule _shared; + private readonly InteractiveService _interactive; private readonly IGoogleLensClient _googleLens; + private readonly IBingVisualSearch _bingVisualSearch; private readonly IYandexImageSearch _yandexImageSearch; - public OcrModule(ILogger<OcrModule> logger, IFergunLocalizer<OcrModule> localizer, SharedModule shared, - IGoogleLensClient googleLens, IYandexImageSearch yandexImageSearch) + public OcrModule(ILogger<OcrModule> logger, IFergunLocalizer<OcrModule> localizer, SharedModule shared, InteractiveService interactive, + IGoogleLensClient googleLens, IBingVisualSearch bingVisualSearch, IYandexImageSearch yandexImageSearch) { _logger = logger; _localizer = localizer; _shared = shared; + _interactive = interactive; _googleLens = googleLens; + _bingVisualSearch = bingVisualSearch; _yandexImageSearch = yandexImageSearch; } public override void BeforeExecute(ICommandInfo command) => _localizer.CurrentCulture = CultureInfo.GetCultureInfo(Context.Interaction.GetLanguageCode()); + [SlashCommand("bing", "Performs OCR to an image using Bing Visual Search.")] + public async Task<RuntimeResult> BingAsync([Summary(description: "The URL of an image.")] string? url = null, + [Summary(description: "An image file.")] IAttachment? file = null) + => await OcrAsync(OcrEngine.Bing, file?.Url ?? url, Context.Interaction); + + [SlashCommand("google", "Performs OCR to an image using Google Lens.")] + public async Task<RuntimeResult> GoogleAsync([Summary(description: "The URL of an image.")] string? url = null, + [Summary(description: "An image file.")] IAttachment? file = null) + => await OcrAsync(OcrEngine.Google, file?.Url ?? url, Context.Interaction); + + [SlashCommand("yandex", "Performs OCR to an image using Yandex.")] + public async Task<RuntimeResult> YandexAsync([Summary(description: "The URL of an image.")] string? url = null, + [Summary(description: "An image file.")] IAttachment? file = null) + => await OcrAsync(OcrEngine.Yandex, file?.Url ?? url, Context.Interaction); + [MessageCommand("OCR")] public async Task<RuntimeResult> OcrAsync(IMessage message) { @@ -51,18 +73,28 @@ public async Task<RuntimeResult> OcrAsync(IMessage message) return FergunResult.FromError(_localizer["NoImageUrlInMessage"], true); } - return await YandexAsync(url); - } + var page = new PageBuilder() + .WithTitle(_localizer["SelectOCREngine"]) + .WithColor(Color.Orange); - [SlashCommand("google", "Performs OCR to an image using Google Lens.")] - public async Task<RuntimeResult> GoogleAsync([Summary(description: "The URL of an image.")] string? url = null, - [Summary(description: "An image file.")] IAttachment? file = null) - => await OcrAsync(OcrEngine.Google, file?.Url ?? url, Context.Interaction); + var selection = new SelectionBuilder<OcrEngine>() + .AddUser(Context.User) + .WithOptions(Enum.GetValues<OcrEngine>()) + .WithSelectionPage(page) + .Build(); - [SlashCommand("yandex", "Performs OCR to an image using Yandex.")] - public async Task<RuntimeResult> YandexAsync([Summary(description: "The URL of an image.")] string? url = null, - [Summary(description: "An image file.")] IAttachment? file = null) - => await OcrAsync(OcrEngine.Yandex, file?.Url ?? url, Context.Interaction); + var result = await _interactive.SendSelectionAsync(selection, Context.Interaction, TimeSpan.FromMinutes(1), ephemeral: true); + + if (result.IsSuccess) + { + return await OcrAsync(result.Value, url, result.StopInteraction!, Context.Interaction, true); + } + + // Attempt to disable the components + _ = Context.Interaction.ModifyOriginalResponseAsync(x => x.Components = selection.GetOrAddComponents(true).Build()); + + return FergunResult.FromSilentError(); + } public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDiscordInteraction interaction, IDiscordInteraction? originalInteraction = null, bool ephemeral = false) @@ -114,6 +146,7 @@ public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDis text = ocrEngine switch { OcrEngine.Google => await _googleLens.OcrAsync(url), + OcrEngine.Bing => await _bingVisualSearch.OcrAsync(url), OcrEngine.Yandex => await _yandexImageSearch.OcrAsync(url), _ => throw new ArgumentException(_localizer["InvalidOCREngine"], nameof(ocrEngine)) }; @@ -123,6 +156,11 @@ public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDis _logger.LogWarning(e, "Failed to perform Google Lens OCR to url {Url}", url); return FergunResult.FromError(_localizer["GoogleLensOCRError"], ephemeral, interaction); } + catch (BingException e) + { + _logger.LogWarning(e, "Failed to perform Bing OCR to url {Url}", url); + return FergunResult.FromError(e.ImageCategory is null ? e.Message : _localizer[$"Bing{e.ImageCategory}"], ephemeral, interaction); + } catch (YandexException e) { _logger.LogWarning(e, "Failed to perform Yandex OCR to url {Url}", url); @@ -142,6 +180,7 @@ public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDis (var name, string iconUrl) = ocrEngine switch { OcrEngine.Google => (_localizer["GoogleLensOCR"], Constants.GoogleLensLogoUrl), + OcrEngine.Bing => (_localizer["BingVisualSearch"], Constants.BingIconUrl), OcrEngine.Yandex => (_localizer["YandexOCR"], Constants.YandexIconUrl), _ => throw new ArgumentException(_localizer["InvalidOCREngine"], nameof(ocrEngine)) }; diff --git a/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs b/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs index 076309d..c5542c8 100644 --- a/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs +++ b/tests/Fergun.Tests/Apis/BingVisualSearchTests.cs @@ -12,6 +12,27 @@ public class BingVisualSearchTests { private readonly IBingVisualSearch _bingVisualSearch = new BingVisualSearch(); + [Theory] + [InlineData("https://upload.wikimedia.org/wikipedia/commons/0/01/Windows_fonts_most_used.jpg")] + [InlineData("https://upload.wikimedia.org/wikipedia/commons/5/57/Lorem_Ipsum_Helvetica.png")] + public async Task OcrAsync_Returns_Text(string url) + { + string text = await _bingVisualSearch.OcrAsync(url); + + Assert.NotNull(text); + Assert.NotEmpty(text); + } + + [Theory] + [InlineData("https://upload.wikimedia.org/wikipedia/commons/2/29/Suru_Bog_10000px.jpg")] // 10000px image + [InlineData("https://simpl.info/bigimage/bigImage.jpg")] // 91 MB file + public async Task OcrAsync_Throws_BingException_If_Image_Is_Invalid(string url) + { + var task = _bingVisualSearch.OcrAsync(url); + + await Assert.ThrowsAsync<BingException>(() => task); + } + [Theory] [InlineData("https://r.bing.com/rp/ecXQMr9jqKMeHE3ADTBrSN_WNyA.jpg", BingSafeSearchLevel.Off, null)] [InlineData("https://r.bing.com/rp/vXuQ5-3dSnE08_cK26jVzOTxREk.jpg", BingSafeSearchLevel.Moderate, "en")] @@ -52,6 +73,7 @@ public async Task Disposed_BingVisualSearch_Usage_Throws_ObjectDisposedException (_bingVisualSearch as IDisposable)?.Dispose(); (_bingVisualSearch as IDisposable)?.Dispose(); + await Assert.ThrowsAsync<ObjectDisposedException>(() => _bingVisualSearch.OcrAsync(It.IsAny<string>())); await Assert.ThrowsAsync<ObjectDisposedException>(() => _bingVisualSearch.ReverseImageSearchAsync(It.IsAny<string>(), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string?>())); } diff --git a/tests/Fergun.Tests/Modules/OcrModuleTests.cs b/tests/Fergun.Tests/Modules/OcrModuleTests.cs index 8f452ce..52564bc 100644 --- a/tests/Fergun.Tests/Modules/OcrModuleTests.cs +++ b/tests/Fergun.Tests/Modules/OcrModuleTests.cs @@ -3,8 +3,11 @@ using System.Threading.Tasks; using Discord; using Discord.Interactions; +using Discord.WebSocket; +using Fergun.Apis.Bing; using Fergun.Apis.Google; using Fergun.Apis.Yandex; +using Fergun.Interactive; using Fergun.Modules; using GTranslate.Translators; using Microsoft.Extensions.Logging; @@ -18,8 +21,11 @@ public class OcrModuleTests private readonly Mock<IInteractionContext> _contextMock = new(); private readonly Mock<IDiscordInteraction> _interactionMock = new(); private readonly Mock<IGoogleLensClient> _googleLensMock = new(); + private readonly Mock<IBingVisualSearch> _bingVisualSearchMock = new(); private readonly Mock<IYandexImageSearch> _yandexImageSearchMock = new(); private readonly Mock<ILogger<OcrModule>> _loggerMock = new(); + private readonly DiscordSocketClient _client = new(); + private readonly InteractiveConfig _interactiveConfig = new() { DeferStopSelectionInteractions = false }; private readonly IFergunLocalizer<OcrModule> _ocrLocalizer = Utils.CreateMockedLocalizer<OcrModule>(); private readonly Mock<OcrModule> _moduleMock; private const string TextImageUrl = "https://example.com/image.png"; @@ -31,6 +37,9 @@ public OcrModuleTests() _googleLensMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == TextImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync("test"); _googleLensMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == EmptyImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty); _googleLensMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == InvalidImageUrl), It.IsAny<CancellationToken>())).ThrowsAsync(new GoogleLensException("Invalid image.")); + _bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == TextImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync("test"); + _bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == EmptyImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty); + _bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == InvalidImageUrl), It.IsAny<CancellationToken>())).ThrowsAsync(new BingException("Invalid image.")); _yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == TextImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync("test"); _yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == EmptyImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty); _yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == InvalidImageUrl), It.IsAny<CancellationToken>())).ThrowsAsync(new YandexException("Invalid image.")); @@ -39,7 +48,9 @@ public OcrModuleTests() var sharedLocalizer = Utils.CreateMockedLocalizer<SharedResource>(); var shared = new SharedModule(sharedLogger, sharedLocalizer, Mock.Of<IFergunTranslator>(), new GoogleTranslator2()); - _moduleMock = new Mock<OcrModule>(() => new OcrModule(_loggerMock.Object, _ocrLocalizer, shared, _googleLensMock.Object, _yandexImageSearchMock.Object)) { CallBase = true }; + var interactive = new InteractiveService(_client, _interactiveConfig); + _moduleMock = new Mock<OcrModule>(() => new OcrModule(_loggerMock.Object, _ocrLocalizer, shared, interactive, + _googleLensMock.Object, _bingVisualSearchMock.Object, _yandexImageSearchMock.Object)) { CallBase = true }; _contextMock.SetupGet(x => x.Interaction).Returns(_interactionMock.Object); ((IInteractionModuleBase)_moduleMock.Object).SetContext(_contextMock.Object); } @@ -51,7 +62,7 @@ public void BeforeExecute_Sets_Language() _moduleMock.Object.BeforeExecute(It.IsAny<ICommandInfo>()); Assert.Equal("en", _ocrLocalizer.CurrentCulture.TwoLetterISOLanguageName); } - + [Theory] [InlineData(TextImageUrl, true)] [InlineData(EmptyImageUrl, false)] @@ -71,6 +82,25 @@ public async Task GoogleAsync_Uses_GoogleLens(string url, bool success) It.IsAny<AllowedMentions>(), It.IsAny<MessageComponent>(), It.IsAny<Embed>(), It.IsAny<RequestOptions>(), It.IsAny<PollProperties>()), success ? Times.Once : Times.Never); } + [Theory] + [InlineData(TextImageUrl, true)] + [InlineData(EmptyImageUrl, false)] + public async Task BingAsync_Uses_BingVisualSearch(string url, bool success) + { + var module = _moduleMock.Object; + const bool isEphemeral = false; + + var result = await module.BingAsync(url); + Assert.Equal(success, result.IsSuccess); + + _interactionMock.Verify(x => x.DeferAsync(It.Is<bool>(b => b == isEphemeral), It.IsAny<RequestOptions>()), Times.Once); + + _bingVisualSearchMock.Verify(x => x.OcrAsync(It.Is<string>(s => s == url), It.IsAny<CancellationToken>()), Times.Once); + + _interactionMock.Verify(x => x.FollowupAsync(It.IsAny<string>(), It.IsAny<Embed[]>(), It.IsAny<bool>(), It.Is<bool>(b => b == isEphemeral), + It.IsAny<AllowedMentions>(), It.IsAny<MessageComponent>(), It.IsAny<Embed>(), It.IsAny<RequestOptions>(), It.IsAny<PollProperties>()), success ? Times.Once : Times.Never); + } + [Theory] [InlineData(TextImageUrl, true)] [InlineData(EmptyImageUrl, false)] diff --git a/tests/Fergun.Tests/Utils.cs b/tests/Fergun.Tests/Utils.cs index df1fc53..45c4e74 100644 --- a/tests/Fergun.Tests/Utils.cs +++ b/tests/Fergun.Tests/Utils.cs @@ -128,6 +128,10 @@ public static IBingVisualSearch CreateMockedBingVisualSearchApi(Faker? faker = n faker ??= new Faker(); var bingMock = new Mock<IBingVisualSearch>(); + bingMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == string.Empty), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty); + bingMock.Setup(x => x.OcrAsync(It.Is<string>(s => !string.IsNullOrEmpty(s)), It.IsAny<CancellationToken>())).ReturnsAsync(faker.Lorem.Sentence()); + bingMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == "https://example.com/error"), It.IsAny<CancellationToken>())).ThrowsAsync(new BingException("Error message.")); + bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => s == string.Empty), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string>(), It.IsAny<CancellationToken>())).ReturnsAsync(Array.Empty<IBingReverseImageSearchResult>); bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => !string.IsNullOrEmpty(s)), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string>(), It.IsAny<CancellationToken>())).ReturnsAsync(() => faker.Make(50, () => CreateMockedBingReverseImageSearchResult(faker)).AsReadOnly()); bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => s == "https://example.com/error"), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string>(), It.IsAny<CancellationToken>())).ThrowsAsync(new BingException("Error message."));