Skip to content

Commit

Permalink
Bring back Bing OCR
Browse files Browse the repository at this point in the history
  • Loading branch information
d4n3436 committed Aug 8, 2024
1 parent 5243f85 commit e68a34e
Show file tree
Hide file tree
Showing 7 changed files with 166 additions and 15 deletions.
45 changes: 44 additions & 1 deletion src/Apis/Bing/BingVisualSearch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,49 @@ public BingVisualSearch(HttpClient httpClient)
}
}

/// <inheritdoc/>
public async Task<string> OcrAsync(string url, CancellationToken cancellationToken = default)
{
    // Fail fast when the client is already disposed or the caller has already cancelled.
    ObjectDisposedException.ThrowIf(_disposed, this);
    cancellationToken.ThrowIfCancellationRequested();

    using var request = BuildRequest(url, "OCR");
    using var response = await _httpClient
        .SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken)
        .ConfigureAwait(false);

    response.EnsureSuccessStatusCode();

    await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false);
    using var document = await JsonDocument.ParseAsync(stream, default, cancellationToken).ConfigureAwait(false);
    var root = document.RootElement;

    // The first entry of "imageQualityHints" (if any) carries the category reported for the image.
    var firstQualityHint = root
        .GetPropertyOrDefault("imageQualityHints")
        .FirstOrDefault();

    string? category = firstQualityHint
        .GetPropertyOrDefault("category")
        .GetStringOrDefault();

    // Categories registered in _imageCategories are surfaced as a BingException with their mapped message.
    if (category is not null && _imageCategories.TryGetValue(category, out string? errorMessage))
    {
        throw new BingException(errorMessage, category);
    }

    // Locate the tag whose display name is "##TextRecognition"; its first action holds the text regions.
    var textRecognitionTag = root
        .GetProperty("tags"u8)
        .FirstOrDefault(tag => tag.TryGetProperty("displayName"u8, out var tagName) && tagName.ValueEquals("##TextRecognition"u8));

    var regions = textRecognitionTag
        .GetPropertyOrDefault("actions")
        .FirstOrDefault()
        .GetPropertyOrDefault("data")
        .GetPropertyOrDefault("regions")
        .EnumerateArrayOrEmpty();

    // Lines inside a region are separated by a single newline.
    var regionTexts = regions
        .Select(region => string.Join('\n',
            region.GetPropertyOrDefault("lines")
                .EnumerateArrayOrEmpty()
                .Select(line => line.GetPropertyOrDefault("text").GetStringOrDefault())));

    // Regions are separated from each other by a blank line.
    return string.Join("\n\n", regionTexts);
}

/// <inheritdoc/>
public async Task<IReadOnlyList<IBingReverseImageSearchResult>> ReverseImageSearchAsync(string url,
BingSafeSearchLevel safeSearch = BingSafeSearchLevel.Moderate, string? language = null,
Expand Down Expand Up @@ -88,7 +131,7 @@ public async Task<IReadOnlyList<IBingReverseImageSearchResult>> ReverseImageSear
.EnumerateArray()
.Select(x => x.GetPropertyOrDefault("actions"))
.SelectMany(x => x.EnumerateArrayOrEmpty())
.FirstOrDefault(x => x.TryGetProperty("actionType", out var actionTye) && actionTye.ValueEquals("VisualSearch"u8))
.FirstOrDefault(x => x.TryGetProperty("actionType"u8, out var actionType) && actionType.ValueEquals("VisualSearch"u8))
.GetPropertyOrDefault("data")
.GetPropertyOrDefault("value")
.EnumerateArrayOrEmpty()
Expand Down
8 changes: 8 additions & 0 deletions src/Apis/Bing/IBingVisualSearch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ namespace Fergun.Apis.Bing;
/// </summary>
public interface IBingVisualSearch
{
/// <summary>
/// Performs optical character recognition (OCR) on the image at the specified URL.
/// </summary>
/// <param name="url">The URL of an image.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <returns>A <see cref="Task"/> representing the asynchronous OCR operation. The result contains the recognized text.</returns>
Task<string> OcrAsync(string url, CancellationToken cancellationToken = default);

/// <summary>
/// Performs reverse image search to the specified image URL.
/// </summary>
Expand Down
5 changes: 5 additions & 0 deletions src/Entities/OcrEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ public enum OcrEngine
/// </summary>
Google,

/// <summary>
/// Bing.
/// </summary>
Bing,

/// <summary>
/// Yandex.
/// </summary>
Expand Down
63 changes: 51 additions & 12 deletions src/Modules/OcrModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
using System.Threading.Tasks;
using Discord;
using Discord.Interactions;
using Fergun.Apis.Bing;
using Fergun.Apis.Google;
using Fergun.Apis.Yandex;
using Fergun.Extensions;
using Fergun.Interactive;
using Fergun.Interactive.Selection;
using Fergun.Preconditions;
using Humanizer;
using Microsoft.Extensions.Logging;
Expand All @@ -23,21 +26,40 @@ public class OcrModule : InteractionModuleBase
private readonly ILogger<OcrModule> _logger;
private readonly IFergunLocalizer<OcrModule> _localizer;
private readonly SharedModule _shared;
private readonly InteractiveService _interactive;
private readonly IGoogleLensClient _googleLens;
private readonly IBingVisualSearch _bingVisualSearch;
private readonly IYandexImageSearch _yandexImageSearch;

public OcrModule(ILogger<OcrModule> logger, IFergunLocalizer<OcrModule> localizer, SharedModule shared,
IGoogleLensClient googleLens, IYandexImageSearch yandexImageSearch)
public OcrModule(ILogger<OcrModule> logger, IFergunLocalizer<OcrModule> localizer, SharedModule shared, InteractiveService interactive,
    IGoogleLensClient googleLens, IBingVisualSearch bingVisualSearch, IYandexImageSearch yandexImageSearch)
{
    // Logging, localization, shared helpers and the interactive service.
    (_logger, _localizer, _shared, _interactive) = (logger, localizer, shared, interactive);

    // One client per supported OCR engine.
    (_googleLens, _bingVisualSearch, _yandexImageSearch) = (googleLens, bingVisualSearch, yandexImageSearch);
}

public override void BeforeExecute(ICommandInfo command)
{
    // Switch the localizer to the culture that matches the language code of the current interaction.
    var languageCode = Context.Interaction.GetLanguageCode();
    _localizer.CurrentCulture = CultureInfo.GetCultureInfo(languageCode);
}

[SlashCommand("bing", "Performs OCR to an image using Bing Visual Search.")]
public async Task<RuntimeResult> BingAsync(
    [Summary(description: "The URL of an image.")] string? url = null,
    [Summary(description: "An image file.")] IAttachment? file = null)
{
    // The URL of an uploaded attachment takes precedence over the URL argument.
    string? imageUrl = file?.Url ?? url;

    return await OcrAsync(OcrEngine.Bing, imageUrl, Context.Interaction);
}

[SlashCommand("google", "Performs OCR to an image using Google Lens.")]
public async Task<RuntimeResult> GoogleAsync(
    [Summary(description: "The URL of an image.")] string? url = null,
    [Summary(description: "An image file.")] IAttachment? file = null)
{
    // The URL of an uploaded attachment takes precedence over the URL argument.
    string? imageUrl = file?.Url ?? url;

    return await OcrAsync(OcrEngine.Google, imageUrl, Context.Interaction);
}

[SlashCommand("yandex", "Performs OCR to an image using Yandex.")]
public async Task<RuntimeResult> YandexAsync(
    [Summary(description: "The URL of an image.")] string? url = null,
    [Summary(description: "An image file.")] IAttachment? file = null)
{
    // The URL of an uploaded attachment takes precedence over the URL argument.
    string? imageUrl = file?.Url ?? url;

    return await OcrAsync(OcrEngine.Yandex, imageUrl, Context.Interaction);
}

[MessageCommand("OCR")]
public async Task<RuntimeResult> OcrAsync(IMessage message)
{
Expand All @@ -51,18 +73,28 @@ public async Task<RuntimeResult> OcrAsync(IMessage message)
return FergunResult.FromError(_localizer["NoImageUrlInMessage"], true);
}

return await YandexAsync(url);
}
var page = new PageBuilder()
.WithTitle(_localizer["SelectOCREngine"])
.WithColor(Color.Orange);

[SlashCommand("google", "Performs OCR to an image using Google Lens.")]
public async Task<RuntimeResult> GoogleAsync([Summary(description: "The URL of an image.")] string? url = null,
[Summary(description: "An image file.")] IAttachment? file = null)
=> await OcrAsync(OcrEngine.Google, file?.Url ?? url, Context.Interaction);
var selection = new SelectionBuilder<OcrEngine>()
.AddUser(Context.User)
.WithOptions(Enum.GetValues<OcrEngine>())
.WithSelectionPage(page)
.Build();

[SlashCommand("yandex", "Performs OCR to an image using Yandex.")]
public async Task<RuntimeResult> YandexAsync([Summary(description: "The URL of an image.")] string? url = null,
[Summary(description: "An image file.")] IAttachment? file = null)
=> await OcrAsync(OcrEngine.Yandex, file?.Url ?? url, Context.Interaction);
var result = await _interactive.SendSelectionAsync(selection, Context.Interaction, TimeSpan.FromMinutes(1), ephemeral: true);

if (result.IsSuccess)
{
return await OcrAsync(result.Value, url, result.StopInteraction!, Context.Interaction, true);
}

// Attempt to disable the components
_ = Context.Interaction.ModifyOriginalResponseAsync(x => x.Components = selection.GetOrAddComponents(true).Build());

return FergunResult.FromSilentError();
}

public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDiscordInteraction interaction,
IDiscordInteraction? originalInteraction = null, bool ephemeral = false)
Expand Down Expand Up @@ -114,6 +146,7 @@ public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDis
text = ocrEngine switch
{
OcrEngine.Google => await _googleLens.OcrAsync(url),
OcrEngine.Bing => await _bingVisualSearch.OcrAsync(url),
OcrEngine.Yandex => await _yandexImageSearch.OcrAsync(url),
_ => throw new ArgumentException(_localizer["InvalidOCREngine"], nameof(ocrEngine))
};
Expand All @@ -123,6 +156,11 @@ public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDis
_logger.LogWarning(e, "Failed to perform Google Lens OCR to url {Url}", url);
return FergunResult.FromError(_localizer["GoogleLensOCRError"], ephemeral, interaction);
}
catch (BingException e)
{
_logger.LogWarning(e, "Failed to perform Bing OCR to url {Url}", url);
return FergunResult.FromError(e.ImageCategory is null ? e.Message : _localizer[$"Bing{e.ImageCategory}"], ephemeral, interaction);
}
catch (YandexException e)
{
_logger.LogWarning(e, "Failed to perform Yandex OCR to url {Url}", url);
Expand All @@ -142,6 +180,7 @@ public async Task<RuntimeResult> OcrAsync(OcrEngine ocrEngine, string? url, IDis
(var name, string iconUrl) = ocrEngine switch
{
OcrEngine.Google => (_localizer["GoogleLensOCR"], Constants.GoogleLensLogoUrl),
OcrEngine.Bing => (_localizer["BingVisualSearch"], Constants.BingIconUrl),
OcrEngine.Yandex => (_localizer["YandexOCR"], Constants.YandexIconUrl),
_ => throw new ArgumentException(_localizer["InvalidOCREngine"], nameof(ocrEngine))
};
Expand Down
22 changes: 22 additions & 0 deletions tests/Fergun.Tests/Apis/BingVisualSearchTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,27 @@ public class BingVisualSearchTests
{
private readonly IBingVisualSearch _bingVisualSearch = new BingVisualSearch();

[Theory]
[InlineData("https://upload.wikimedia.org/wikipedia/commons/0/01/Windows_fonts_most_used.jpg")]
[InlineData("https://upload.wikimedia.org/wikipedia/commons/5/57/Lorem_Ipsum_Helvetica.png")]
public async Task OcrAsync_Returns_Text(string url)
{
    // Act: run OCR against an image that contains text.
    string recognizedText = await _bingVisualSearch.OcrAsync(url);

    // Assert: some text must have been recognized.
    Assert.NotNull(recognizedText);
    Assert.NotEmpty(recognizedText);
}

[Theory]
[InlineData("https://upload.wikimedia.org/wikipedia/commons/2/29/Suru_Bog_10000px.jpg")] // 10000px image
[InlineData("https://simpl.info/bigimage/bigImage.jpg")] // 91 MB file
public async Task OcrAsync_Throws_BingException_If_Image_Is_Invalid(string url)
{
    // Start the operation inside the delegate so that Assert.ThrowsAsync observes the exception
    // whether it is thrown synchronously or surfaces through the returned task. This also matches
    // the delegate style used by the other ThrowsAsync assertions in this test class.
    await Assert.ThrowsAsync<BingException>(() => _bingVisualSearch.OcrAsync(url));
}

[Theory]
[InlineData("https://r.bing.com/rp/ecXQMr9jqKMeHE3ADTBrSN_WNyA.jpg", BingSafeSearchLevel.Off, null)]
[InlineData("https://r.bing.com/rp/vXuQ5-3dSnE08_cK26jVzOTxREk.jpg", BingSafeSearchLevel.Moderate, "en")]
Expand Down Expand Up @@ -52,6 +73,7 @@ public async Task Disposed_BingVisualSearch_Usage_Throws_ObjectDisposedException
(_bingVisualSearch as IDisposable)?.Dispose();
(_bingVisualSearch as IDisposable)?.Dispose();

await Assert.ThrowsAsync<ObjectDisposedException>(() => _bingVisualSearch.OcrAsync(It.IsAny<string>()));
await Assert.ThrowsAsync<ObjectDisposedException>(() => _bingVisualSearch.ReverseImageSearchAsync(It.IsAny<string>(), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string?>()));
}

Expand Down
34 changes: 32 additions & 2 deletions tests/Fergun.Tests/Modules/OcrModuleTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
using System.Threading.Tasks;
using Discord;
using Discord.Interactions;
using Discord.WebSocket;
using Fergun.Apis.Bing;
using Fergun.Apis.Google;
using Fergun.Apis.Yandex;
using Fergun.Interactive;
using Fergun.Modules;
using GTranslate.Translators;
using Microsoft.Extensions.Logging;
Expand All @@ -18,8 +21,11 @@ public class OcrModuleTests
private readonly Mock<IInteractionContext> _contextMock = new();
private readonly Mock<IDiscordInteraction> _interactionMock = new();
private readonly Mock<IGoogleLensClient> _googleLensMock = new();
private readonly Mock<IBingVisualSearch> _bingVisualSearchMock = new();
private readonly Mock<IYandexImageSearch> _yandexImageSearchMock = new();
private readonly Mock<ILogger<OcrModule>> _loggerMock = new();
private readonly DiscordSocketClient _client = new();
private readonly InteractiveConfig _interactiveConfig = new() { DeferStopSelectionInteractions = false };
private readonly IFergunLocalizer<OcrModule> _ocrLocalizer = Utils.CreateMockedLocalizer<OcrModule>();
private readonly Mock<OcrModule> _moduleMock;
private const string TextImageUrl = "https://example.com/image.png";
Expand All @@ -31,6 +37,9 @@ public OcrModuleTests()
_googleLensMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == TextImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync("test");
_googleLensMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == EmptyImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty);
_googleLensMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == InvalidImageUrl), It.IsAny<CancellationToken>())).ThrowsAsync(new GoogleLensException("Invalid image."));
_bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == TextImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync("test");
_bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == EmptyImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty);
_bingVisualSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == InvalidImageUrl), It.IsAny<CancellationToken>())).ThrowsAsync(new BingException("Invalid image."));
_yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == TextImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync("test");
_yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == EmptyImageUrl), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty);
_yandexImageSearchMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == InvalidImageUrl), It.IsAny<CancellationToken>())).ThrowsAsync(new YandexException("Invalid image."));
Expand All @@ -39,7 +48,9 @@ public OcrModuleTests()
var sharedLocalizer = Utils.CreateMockedLocalizer<SharedResource>();
var shared = new SharedModule(sharedLogger, sharedLocalizer, Mock.Of<IFergunTranslator>(), new GoogleTranslator2());

_moduleMock = new Mock<OcrModule>(() => new OcrModule(_loggerMock.Object, _ocrLocalizer, shared, _googleLensMock.Object, _yandexImageSearchMock.Object)) { CallBase = true };
var interactive = new InteractiveService(_client, _interactiveConfig);
_moduleMock = new Mock<OcrModule>(() => new OcrModule(_loggerMock.Object, _ocrLocalizer, shared, interactive,
_googleLensMock.Object, _bingVisualSearchMock.Object, _yandexImageSearchMock.Object)) { CallBase = true };
_contextMock.SetupGet(x => x.Interaction).Returns(_interactionMock.Object);
((IInteractionModuleBase)_moduleMock.Object).SetContext(_contextMock.Object);
}
Expand All @@ -51,7 +62,7 @@ public void BeforeExecute_Sets_Language()
_moduleMock.Object.BeforeExecute(It.IsAny<ICommandInfo>());
Assert.Equal("en", _ocrLocalizer.CurrentCulture.TwoLetterISOLanguageName);
}

[Theory]
[InlineData(TextImageUrl, true)]
[InlineData(EmptyImageUrl, false)]
Expand All @@ -71,6 +82,25 @@ public async Task GoogleAsync_Uses_GoogleLens(string url, bool success)
It.IsAny<AllowedMentions>(), It.IsAny<MessageComponent>(), It.IsAny<Embed>(), It.IsAny<RequestOptions>(), It.IsAny<PollProperties>()), success ? Times.Once : Times.Never);
}

[Theory]
[InlineData(TextImageUrl, true)]
[InlineData(EmptyImageUrl, false)]
public async Task BingAsync_Uses_BingVisualSearch(string url, bool success)
{
    // Arrange
    const bool isEphemeral = false;
    var expectedFollowups = success ? Times.Once() : Times.Never();

    // Act
    var result = await _moduleMock.Object.BingAsync(url);

    // Assert: the outcome matches the expectation for the given image URL.
    Assert.Equal(success, result.IsSuccess);

    // The interaction is deferred exactly once, non-ephemerally.
    _interactionMock.Verify(
        interaction => interaction.DeferAsync(It.Is<bool>(b => b == isEphemeral), It.IsAny<RequestOptions>()),
        Times.Once());

    // The Bing client is queried exactly once with the given URL.
    _bingVisualSearchMock.Verify(
        bing => bing.OcrAsync(It.Is<string>(s => s == url), It.IsAny<CancellationToken>()),
        Times.Once());

    // A follow-up message is sent only when OCR succeeded.
    _interactionMock.Verify(
        interaction => interaction.FollowupAsync(It.IsAny<string>(), It.IsAny<Embed[]>(), It.IsAny<bool>(), It.Is<bool>(b => b == isEphemeral),
            It.IsAny<AllowedMentions>(), It.IsAny<MessageComponent>(), It.IsAny<Embed>(), It.IsAny<RequestOptions>(), It.IsAny<PollProperties>()),
        expectedFollowups);
}

[Theory]
[InlineData(TextImageUrl, true)]
[InlineData(EmptyImageUrl, false)]
Expand Down
4 changes: 4 additions & 0 deletions tests/Fergun.Tests/Utils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ public static IBingVisualSearch CreateMockedBingVisualSearchApi(Faker? faker = n
faker ??= new Faker();
var bingMock = new Mock<IBingVisualSearch>();

bingMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == string.Empty), It.IsAny<CancellationToken>())).ReturnsAsync(string.Empty);
bingMock.Setup(x => x.OcrAsync(It.Is<string>(s => !string.IsNullOrEmpty(s)), It.IsAny<CancellationToken>())).ReturnsAsync(faker.Lorem.Sentence());
bingMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == "https://example.com/error"), It.IsAny<CancellationToken>())).ThrowsAsync(new BingException("Error message."));

bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => s == string.Empty), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string>(), It.IsAny<CancellationToken>())).ReturnsAsync(Array.Empty<IBingReverseImageSearchResult>);
bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => !string.IsNullOrEmpty(s)), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string>(), It.IsAny<CancellationToken>())).ReturnsAsync(() => faker.Make(50, () => CreateMockedBingReverseImageSearchResult(faker)).AsReadOnly());
bingMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => s == "https://example.com/error"), It.IsAny<BingSafeSearchLevel>(), It.IsAny<string>(), It.IsAny<CancellationToken>())).ThrowsAsync(new BingException("Error message."));
Expand Down

0 comments on commit e68a34e

Please sign in to comment.