From b9e8524afc42107900cdde733605151e65196557 Mon Sep 17 00:00:00 2001 From: d4n3436 Date: Fri, 1 Dec 2023 14:23:54 -0500 Subject: [PATCH] Fix Yandex reverse image search --- src/Apis/Yandex/HtmlEncodingConverter.cs | 20 ++++ src/Apis/Yandex/IYandexImageSearch.cs | 6 +- src/Apis/Yandex/YandexImageSearch.cs | 65 ++++------- .../Yandex/YandexReverseImageSearchResult.cs | 51 +++++++-- src/Modules/ImageModule.cs | 9 +- .../Apis/YandexImageSearchTests.cs | 102 +++--------------- tests/Fergun.Tests/Utils.cs | 4 +- 7 files changed, 102 insertions(+), 155 deletions(-) create mode 100644 src/Apis/Yandex/HtmlEncodingConverter.cs diff --git a/src/Apis/Yandex/HtmlEncodingConverter.cs b/src/Apis/Yandex/HtmlEncodingConverter.cs new file mode 100644 index 0000000..eaf11c5 --- /dev/null +++ b/src/Apis/Yandex/HtmlEncodingConverter.cs @@ -0,0 +1,20 @@ +using System; +using System.Net; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Fergun.Apis.Yandex; + +/// +/// Represents a converter that decodes HTML-encoded strings. +/// +public class HtmlEncodingConverter : JsonConverter +{ + /// + public override string? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + => WebUtility.HtmlDecode(reader.GetString()); + + /// + public override void Write(Utf8JsonWriter writer, string value, JsonSerializerOptions options) + => throw new NotSupportedException(); +} \ No newline at end of file diff --git a/src/Apis/Yandex/IYandexImageSearch.cs b/src/Apis/Yandex/IYandexImageSearch.cs index 8ae7097..9ee983a 100644 --- a/src/Apis/Yandex/IYandexImageSearch.cs +++ b/src/Apis/Yandex/IYandexImageSearch.cs @@ -14,7 +14,7 @@ public interface IYandexImageSearch /// /// The URL of an image. /// The cancellation token. - /// A representing the asynchronous OCR operation. The result contains the recognized text. + /// A representing the asynchronous OCR operation. The result contains the recognized text. Task OcrAsync(string url, CancellationToken cancellationToken = default); /// @@ -23,7 +23,7 @@ public interface IYandexImageSearch /// The URL of an image. /// The search filter mode. /// The cancellation token. - /// A representing the asynchronous search operation. The result contains an of search results. - Task> ReverseImageSearchAsync(string url, + /// A representing the asynchronous search operation. The result contains a read-only list of search results. + Task> ReverseImageSearchAsync(string url, YandexSearchFilterMode mode = YandexSearchFilterMode.Moderate, CancellationToken cancellationToken = default); } \ No newline at end of file diff --git a/src/Apis/Yandex/YandexImageSearch.cs b/src/Apis/Yandex/YandexImageSearch.cs index ecb3d20..d3da7d9 100644 --- a/src/Apis/Yandex/YandexImageSearch.cs +++ b/src/Apis/Yandex/YandexImageSearch.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Net; using System.Net.Http; using System.Text.Json; using System.Threading; @@ -112,7 +111,7 @@ public YandexImageSearch(HttpClient httpClient) } /// - public async Task> ReverseImageSearchAsync(string url, + public async Task> ReverseImageSearchAsync(string url, YandexSearchFilterMode mode = YandexSearchFilterMode.Moderate, CancellationToken cancellationToken = default) { EnsureNotDisposed(); @@ -155,17 +154,26 @@ public async Task> ReverseImageSear .RootElement .GetProperty("blocks")[0] .GetProperty("html") - .GetString() ?? string.Empty; + .GetString()!; var htmlDocument = await _parser.ParseDocumentAsync(html, cancellationToken).ConfigureAwait(false); - var rawItems = htmlDocument - .GetElementsByClassName("serp-list") - .FirstOrDefault()? - .GetElementsByClassName("serp-item") - .Select(x => x.GetAttribute("data-bem")) ?? Enumerable.Empty(); - - return EnumerateResults(rawItems); + string json = htmlDocument + .GetElementsByClassName("cbir-similar-page").First() + .GetElementsByClassName("cbir-similar-page__content").First() + .GetElementsByClassName("Root").First() + .GetAttribute("data-state")!; + + using var data = JsonDocument.Parse(json); + + return data.RootElement + .GetProperty("initialState") + .GetProperty("serpList") + .GetProperty("items") + .GetProperty("entities") + .EnumerateObject() + .Select(x => x.Value.GetProperty("viewerData").Deserialize()!) + .ToArray(); } /// @@ -180,43 +188,6 @@ public void Dispose() _disposed = true; } - private static IEnumerable EnumerateResults(IEnumerable rawItems) - { - foreach (string? rawItem in rawItems) - { - if (string.IsNullOrEmpty(rawItem)) - continue; - - JsonDocument document; - try - { - document = JsonDocument.Parse(rawItem); - } - catch - { - continue; - } - - var item = document.RootElement.GetPropertyOrDefault("serp-item"); - var snippet = item.GetPropertyOrDefault("snippet"); - - string? url = item - .GetPropertyOrDefault("img_href") - .GetStringOrDefault(); - - string? sourceUrl = snippet.GetPropertyOrDefault("url").GetStringOrDefault(); - string? title = snippet.GetPropertyOrDefault("title").GetStringOrDefault(); - string? text = snippet.GetPropertyOrDefault("text").GetStringOrDefault(); - - if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(sourceUrl) || string.IsNullOrEmpty(text)) - { - continue; - } - - yield return new YandexReverseImageSearchResult(url, sourceUrl, WebUtility.HtmlDecode(title), WebUtility.HtmlDecode(text)); - } - } - private void EnsureNotDisposed() { if (_disposed) diff --git a/src/Apis/Yandex/YandexReverseImageSearchResult.cs b/src/Apis/Yandex/YandexReverseImageSearchResult.cs index 1bb89d3..701f63a 100644 --- a/src/Apis/Yandex/YandexReverseImageSearchResult.cs +++ b/src/Apis/Yandex/YandexReverseImageSearchResult.cs @@ -1,4 +1,6 @@ -namespace Fergun.Apis.Yandex; +using System.Text.Json.Serialization; + +namespace Fergun.Apis.Yandex; /// /// Represents a Yandex reverse image search result. @@ -9,29 +11,56 @@ public class YandexReverseImageSearchResult : IYandexReverseImageSearchResult /// Initializes a new instance of the class. /// /// A URL pointing to the image. - /// A URL pointing to the webpage hosting the image. - /// The title of the image result. - /// The description of the image result. - internal YandexReverseImageSearchResult(string url, string sourceUrl, string? title, string text) + /// Snippet data. + public YandexReverseImageSearchResult(string url, YandexSnippetData snippet) { Url = url; - SourceUrl = sourceUrl; - Title = title; - Text = text; + Snippet = snippet; } /// + [JsonPropertyName("img_href")] public string Url { get; } + [JsonPropertyName("snippet")] + public YandexSnippetData Snippet { get; } + /// - public string SourceUrl { get; } + public string SourceUrl => Snippet.SourceUrl; /// - public string? Title { get; } + public string? Title => Snippet.Title; /// - public string Text { get; } + public string Text => Snippet.Text; /// public override string ToString() => $"{nameof(Title)} = {Title ?? "(None)"}, {nameof(Text)} = {Text}"; + + public class YandexSnippetData + { + /// + /// Initializes a new instance of the class. + /// + /// A URL pointing to the webpage hosting the image. + /// The title of the image result. + /// The description of the image result. + public YandexSnippetData(string sourceUrl, string? title, string text) + { + SourceUrl = sourceUrl; + Title = title; + Text = text; + } + + [JsonPropertyName("url")] + public string SourceUrl { get; } + + [JsonPropertyName("title")] + [JsonConverter(typeof(HtmlEncodingConverter))] + public string? Title { get; } + + [JsonPropertyName("text")] + [JsonConverter(typeof(HtmlEncodingConverter))] + public string Text { get; } + } } \ No newline at end of file diff --git a/src/Modules/ImageModule.cs b/src/Modules/ImageModule.cs index ff4125d..517cac0 100644 --- a/src/Modules/ImageModule.cs +++ b/src/Modules/ImageModule.cs @@ -247,12 +247,11 @@ public virtual async Task YandexAsync(string url, bool multiImage bool isNsfw = Context.Channel.IsNsfw(); - IYandexReverseImageSearchResult[] results; + IReadOnlyList results; try { - results = (await _yandexImageSearch.ReverseImageSearchAsync(url, isNsfw ? YandexSearchFilterMode.None : YandexSearchFilterMode.Family)) - .ToArray(); + results = await _yandexImageSearch.ReverseImageSearchAsync(url, isNsfw ? YandexSearchFilterMode.None : YandexSearchFilterMode.Family); } catch (YandexException e) { @@ -260,13 +259,13 @@ public virtual async Task YandexAsync(string url, bool multiImage return FergunResult.FromError(e.Message, ephemeral, interaction); } - if (results.Length == 0) + if (results.Count == 0) { return FergunResult.FromError(_localizer["NoResults"], ephemeral, interaction); } int count = multiImages ? 4 : 1; - int maxIndex = (int)Math.Ceiling((double)results.Length / count) - 1; + int maxIndex = (int)Math.Ceiling((double)results.Count / count) - 1; var paginator = new LazyPaginatorBuilder() .WithPageFactory(GeneratePage) diff --git a/tests/Fergun.Tests/Apis/YandexImageSearchTests.cs b/tests/Fergun.Tests/Apis/YandexImageSearchTests.cs index 475b626..764b57f 100644 --- a/tests/Fergun.Tests/Apis/YandexImageSearchTests.cs +++ b/tests/Fergun.Tests/Apis/YandexImageSearchTests.cs @@ -1,13 +1,9 @@ using System; -using System.Linq; using System.Net; using System.Net.Http; using System.Text.Json; using System.Threading; using System.Threading.Tasks; -using AngleSharp; -using AngleSharp.Dom; -using AngleSharp.Html.Dom; using Fergun.Apis.Yandex; using Moq; using Moq.Protected; @@ -76,7 +72,7 @@ public async Task OcrAsync_Throws_YandexException_If_Captcha_Is_Present() [InlineData("https://upload.wikimedia.org/wikipedia/commons/0/0e/Landscape-2454891_960_720.jpg", YandexSearchFilterMode.Family)] public async Task ReverseImageSearchAsync_Returns_Results(string url, YandexSearchFilterMode mode) { - var results = (await _yandexImageSearch.ReverseImageSearchAsync(url, mode)).ToArray(); + var results = await _yandexImageSearch.ReverseImageSearchAsync(url, mode); Assert.NotNull(results); Assert.NotEmpty(results); @@ -104,88 +100,6 @@ public async Task ReverseImageSearchAsync_Throws_YandexException_If_Captcha_Is_P await Assert.ThrowsAsync(() => task); } - [Fact] - public async Task ReverseImageSearchAsync_Ignores_Invalid_Results() - { - string[] rawResults = - { - string.Empty, - "{[", - @" -{ - ""serp-item"": - { - ""img_href"": null - } -}", - @" -{ - ""serp-item"": - { - ""img_href"": ""https://example.com/image.png"", - ""snippet"": - { - ""url"": null, - ""text"": ""sample text"" - } - } -}", - @" -{ - ""serp-item"": - { - ""img_href"": ""https://example.com/image.png"", - ""snippet"": - { - ""url"": ""https://example.com"", - ""text"": null - } - } -}" - }; - - var context = BrowsingContext.New(); - var document = await context.OpenNewAsync(); - var serpList = document.CreateElement(); - serpList.ClassName = "serp-list"; - - serpList.Append(rawResults.Select(x => - { - var item = document.CreateElement(); - item.ClassName = "serp-item"; - - item.SetAttribute("data-bem", x); - return (INode)item; - }).ToArray()); - - string html = serpList.ToHtml(); - - string json = $@" -{{ - ""blocks"": - [ - {{ - ""html"": ""{{{JsonEncodedText.Encode(html)}}}"" - }} - ] -}}"; - - var messageHandlerMock = new Mock(); - - messageHandlerMock - .Protected() - .As() - .SetupSequence(x => x.SendAsync(It.IsAny(), It.IsAny())) - .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.OK) { Content = new StringContent(json) }); - - var yandexImageSearch = new YandexImageSearch(new HttpClient(messageHandlerMock.Object)); - - var results = (await yandexImageSearch.ReverseImageSearchAsync("https://example.com/image.png")).ToArray(); - - Assert.NotNull(results); - Assert.Empty(results); - } - [Fact] public async Task Disposed_YandexImageSearch_Usage_Throws_ObjectDisposedException() { @@ -213,4 +127,18 @@ public void YandexException_Has_Expected_Values() Assert.Equal("Custom message 2", exception3.Message); Assert.Same(innerException, exception3.InnerException); } + + [Theory] + [InlineData("\"{title:"a"}\"", "{title:\"a\"}")] + [InlineData("\"D&D\"", "D&D")] + public void HtmlEncodingConverter_Returns_Expected_Values(string encodedString, string decodedString) + { + var options = new JsonSerializerOptions(); + options.Converters.Add(new HtmlEncodingConverter()); + + string deserialized = JsonSerializer.Deserialize(encodedString, options)!; + + Assert.Equal(decodedString, deserialized); + Assert.Throws(() => JsonSerializer.Serialize(decodedString, options)); + } } \ No newline at end of file diff --git a/tests/Fergun.Tests/Utils.cs b/tests/Fergun.Tests/Utils.cs index 6a12d32..791af15 100644 --- a/tests/Fergun.Tests/Utils.cs +++ b/tests/Fergun.Tests/Utils.cs @@ -156,8 +156,8 @@ public static IYandexImageSearch CreateMockedYandexImageSearchApi(Faker? faker = yandexMock.Setup(x => x.OcrAsync(It.Is(s => !string.IsNullOrEmpty(s)), It.IsAny())).ReturnsAsync(faker.Lorem.Sentence()); yandexMock.Setup(x => x.OcrAsync(It.Is(s => s == "https://example.com/error"), It.IsAny())).ThrowsAsync(new YandexException("Error message.")); - yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => s == string.Empty), It.IsAny(), It.IsAny())).ReturnsAsync(Enumerable.Empty); - yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => !string.IsNullOrEmpty(s)), It.IsAny(), It.IsAny())).ReturnsAsync(() => faker.MakeLazy(50, () => CreateMockedYandexReverseImageSearchResult(faker))); + yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => s == string.Empty), It.IsAny(), It.IsAny())).ReturnsAsync(Array.Empty); + yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => !string.IsNullOrEmpty(s)), It.IsAny(), It.IsAny())).ReturnsAsync(() => faker.MakeLazy(50, () => CreateMockedYandexReverseImageSearchResult(faker)).ToList()); yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is(s => s == "https://example.com/error"), It.IsAny(), It.IsAny())).ThrowsAsync(new YandexException("Error message.")); return yandexMock.Object;