Skip to content

Commit

Permalink
Fix Yandex reverse image search
Browse files Browse the repository at this point in the history
  • Loading branch information
d4n3436 committed Dec 1, 2023
1 parent 2272c26 commit b9e8524
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 155 deletions.
20 changes: 20 additions & 0 deletions src/Apis/Yandex/HtmlEncodingConverter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
using System;
using System.Net;
using System.Text.Json;
using System.Text.Json.Serialization;

namespace Fergun.Apis.Yandex;

/// <summary>
/// A read-only <see cref="JsonConverter{T}"/> for strings that passes every string it reads
/// through <see cref="WebUtility.HtmlDecode(string)"/>.
/// </summary>
public class HtmlEncodingConverter : JsonConverter<string>
{
    /// <summary>
    /// Reads the current JSON token as a string and returns it with HTML entities decoded.
    /// </summary>
    /// <param name="reader">The reader positioned on the token to convert.</param>
    /// <param name="typeToConvert">The type being converted (unused).</param>
    /// <param name="options">The serializer options (unused).</param>
    /// <returns>The HTML-decoded string.</returns>
    public override string? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? encoded = reader.GetString();
        return WebUtility.HtmlDecode(encoded);
    }

    /// <summary>
    /// Not supported: this converter only handles deserialization.
    /// </summary>
    /// <param name="writer">The writer (unused).</param>
    /// <param name="value">The value to write (unused).</param>
    /// <param name="options">The serializer options (unused).</param>
    /// <exception cref="NotSupportedException">Always thrown.</exception>
    public override void Write(Utf8JsonWriter writer, string value, JsonSerializerOptions options)
    {
        throw new NotSupportedException();
    }
}
6 changes: 3 additions & 3 deletions src/Apis/Yandex/IYandexImageSearch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public interface IYandexImageSearch
/// </summary>
/// <param name="url">The URL of an image.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <returns>A <see cref="Task"/> representing the asynchronous OCR operation. The result contains the recognized text.</returns>
/// <returns>A <see cref="Task{TResult}"/> representing the asynchronous OCR operation. The result contains the recognized text.</returns>
Task<string?> OcrAsync(string url, CancellationToken cancellationToken = default);

/// <summary>
Expand All @@ -23,7 +23,7 @@ public interface IYandexImageSearch
/// <param name="url">The URL of an image.</param>
/// <param name="mode">The search filter mode.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <returns>A <see cref="Task{TResult}"/> representing the asynchronous search operation. The result contains an <see cref="IEnumerable{T}"/> of search results.</returns>
Task<IEnumerable<IYandexReverseImageSearchResult>> ReverseImageSearchAsync(string url,
/// <returns>A <see cref="Task{TResult}"/> representing the asynchronous search operation. The result contains a read-only list of search results.</returns>
Task<IReadOnlyList<IYandexReverseImageSearchResult>> ReverseImageSearchAsync(string url,
YandexSearchFilterMode mode = YandexSearchFilterMode.Moderate, CancellationToken cancellationToken = default);
}
65 changes: 18 additions & 47 deletions src/Apis/Yandex/YandexImageSearch.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text.Json;
using System.Threading;
Expand Down Expand Up @@ -112,7 +111,7 @@ public YandexImageSearch(HttpClient httpClient)
}

/// <inheritdoc/>
public async Task<IEnumerable<IYandexReverseImageSearchResult>> ReverseImageSearchAsync(string url,
public async Task<IReadOnlyList<IYandexReverseImageSearchResult>> ReverseImageSearchAsync(string url,
YandexSearchFilterMode mode = YandexSearchFilterMode.Moderate, CancellationToken cancellationToken = default)
{
EnsureNotDisposed();
Expand Down Expand Up @@ -155,17 +154,26 @@ public async Task<IEnumerable<IYandexReverseImageSearchResult>> ReverseImageSear
.RootElement
.GetProperty("blocks")[0]
.GetProperty("html")
.GetString() ?? string.Empty;
.GetString()!;

var htmlDocument = await _parser.ParseDocumentAsync(html, cancellationToken).ConfigureAwait(false);

var rawItems = htmlDocument
.GetElementsByClassName("serp-list")
.FirstOrDefault()?
.GetElementsByClassName("serp-item")
.Select(x => x.GetAttribute("data-bem")) ?? Enumerable.Empty<string?>();

return EnumerateResults(rawItems);
string json = htmlDocument
.GetElementsByClassName("cbir-similar-page").First()
.GetElementsByClassName("cbir-similar-page__content").First()
.GetElementsByClassName("Root").First()
.GetAttribute("data-state")!;

using var data = JsonDocument.Parse(json);

return data.RootElement
.GetProperty("initialState")
.GetProperty("serpList")
.GetProperty("items")
.GetProperty("entities")
.EnumerateObject()
.Select(x => x.Value.GetProperty("viewerData").Deserialize<YandexReverseImageSearchResult>()!)
.ToArray();
}

/// <inheritdoc/>
Expand All @@ -180,43 +188,6 @@ public void Dispose()
_disposed = true;
}

/// <summary>
/// Lazily converts raw JSON payloads into reverse image search results, skipping entries that cannot be used.
/// </summary>
/// <param name="rawItems">The raw JSON strings to convert. Individual entries may be <see langword="null"/> or empty.</param>
/// <returns>
/// A deferred sequence containing one result for every payload that parses as JSON and has a non-empty
/// image URL, source URL and text.
/// </returns>
private static IEnumerable<YandexReverseImageSearchResult> EnumerateResults(IEnumerable<string?> rawItems)
{
    foreach (string? rawItem in rawItems)
    {
        if (string.IsNullOrEmpty(rawItem))
        {
            continue;
        }

        // A yield cannot appear inside a try block that has a catch clause,
        // so parsing is isolated here and disposal is handled by the using statement below.
        JsonDocument document;
        try
        {
            document = JsonDocument.Parse(rawItem);
        }
        catch (JsonException)
        {
            // Malformed JSON is expected for some entries; only that failure is skipped.
            continue;
        }

        // JsonDocument is disposable; dispose it once the strings have been copied out,
        // including when the consumer stops enumerating early.
        using (document)
        {
            var item = document.RootElement.GetPropertyOrDefault("serp-item");
            var snippet = item.GetPropertyOrDefault("snippet");

            string? url = item.GetPropertyOrDefault("img_href").GetStringOrDefault();
            string? sourceUrl = snippet.GetPropertyOrDefault("url").GetStringOrDefault();
            string? title = snippet.GetPropertyOrDefault("title").GetStringOrDefault();
            string? text = snippet.GetPropertyOrDefault("text").GetStringOrDefault();

            // The title is optional; the other three values are required for a usable result.
            if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(sourceUrl) || string.IsNullOrEmpty(text))
            {
                continue;
            }

            yield return new YandexReverseImageSearchResult(url, sourceUrl, WebUtility.HtmlDecode(title), WebUtility.HtmlDecode(text));
        }
    }
}

private void EnsureNotDisposed()
{
if (_disposed)
Expand Down
51 changes: 40 additions & 11 deletions src/Apis/Yandex/YandexReverseImageSearchResult.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
namespace Fergun.Apis.Yandex;
using System.Text.Json.Serialization;

namespace Fergun.Apis.Yandex;

/// <summary>
/// Represents a Yandex reverse image search result.
Expand All @@ -9,29 +11,56 @@ public class YandexReverseImageSearchResult : IYandexReverseImageSearchResult
/// Initializes a new instance of the <see cref="YandexReverseImageSearchResult"/> class.
/// </summary>
/// <param name="url">A URL pointing to the image.</param>
/// <param name="sourceUrl">A URL pointing to the webpage hosting the image.</param>
/// <param name="title">The title of the image result.</param>
/// <param name="text">The description of the image result.</param>
internal YandexReverseImageSearchResult(string url, string sourceUrl, string? title, string text)
/// <param name="snippet">Snippet data.</param>
public YandexReverseImageSearchResult(string url, YandexSnippetData snippet)
{
Url = url;
SourceUrl = sourceUrl;
Title = title;
Text = text;
Snippet = snippet;
}

/// <inheritdoc/>
[JsonPropertyName("img_href")]
public string Url { get; }

[JsonPropertyName("snippet")]
public YandexSnippetData Snippet { get; }

/// <inheritdoc/>
public string SourceUrl { get; }
public string SourceUrl => Snippet.SourceUrl;

/// <inheritdoc/>
public string? Title { get; }
public string? Title => Snippet.Title;

/// <inheritdoc/>
public string Text { get; }
public string Text => Snippet.Text;

/// <inheritdoc/>
public override string ToString() => $"{nameof(Title)} = {Title ?? "(None)"}, {nameof(Text)} = {Text}";

/// <summary>
/// Holds the source URL, title and text of a search result, mapped from the JSON
/// properties <c>url</c>, <c>title</c> and <c>text</c>.
/// </summary>
public class YandexSnippetData
{
    /// <summary>
    /// Initializes a new instance of the <see cref="YandexSnippetData"/> class.
    /// </summary>
    /// <param name="sourceUrl">A URL pointing to the webpage hosting the image.</param>
    /// <param name="title">The title of the image result.</param>
    /// <param name="text">The description of the image result.</param>
    public YandexSnippetData(string sourceUrl, string? title, string text)
        => (SourceUrl, Title, Text) = (sourceUrl, title, text);

    /// <summary>
    /// Gets a URL pointing to the webpage hosting the image.
    /// </summary>
    [JsonPropertyName("url")]
    public string SourceUrl { get; }

    /// <summary>
    /// Gets the title of the image result, read through <see cref="HtmlEncodingConverter"/>.
    /// </summary>
    [JsonPropertyName("title")]
    [JsonConverter(typeof(HtmlEncodingConverter))]
    public string? Title { get; }

    /// <summary>
    /// Gets the description of the image result, read through <see cref="HtmlEncodingConverter"/>.
    /// </summary>
    [JsonPropertyName("text")]
    [JsonConverter(typeof(HtmlEncodingConverter))]
    public string Text { get; }
}
}
9 changes: 4 additions & 5 deletions src/Modules/ImageModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -247,26 +247,25 @@ public virtual async Task<RuntimeResult> YandexAsync(string url, bool multiImage

bool isNsfw = Context.Channel.IsNsfw();

IYandexReverseImageSearchResult[] results;
IReadOnlyList<IYandexReverseImageSearchResult> results;

try
{
results = (await _yandexImageSearch.ReverseImageSearchAsync(url, isNsfw ? YandexSearchFilterMode.None : YandexSearchFilterMode.Family))
.ToArray();
results = await _yandexImageSearch.ReverseImageSearchAsync(url, isNsfw ? YandexSearchFilterMode.None : YandexSearchFilterMode.Family);
}
catch (YandexException e)
{
_logger.LogWarning(e, "Failed to perform reverse image search to url {Url}", url);
return FergunResult.FromError(e.Message, ephemeral, interaction);
}

if (results.Length == 0)
if (results.Count == 0)
{
return FergunResult.FromError(_localizer["NoResults"], ephemeral, interaction);
}

int count = multiImages ? 4 : 1;
int maxIndex = (int)Math.Ceiling((double)results.Length / count) - 1;
int maxIndex = (int)Math.Ceiling((double)results.Count / count) - 1;

var paginator = new LazyPaginatorBuilder()
.WithPageFactory(GeneratePage)
Expand Down
102 changes: 15 additions & 87 deletions tests/Fergun.Tests/Apis/YandexImageSearchTests.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
using System;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Fergun.Apis.Yandex;
using Moq;
using Moq.Protected;
Expand Down Expand Up @@ -76,7 +72,7 @@ public async Task OcrAsync_Throws_YandexException_If_Captcha_Is_Present()
[InlineData("https://upload.wikimedia.org/wikipedia/commons/0/0e/Landscape-2454891_960_720.jpg", YandexSearchFilterMode.Family)]
public async Task ReverseImageSearchAsync_Returns_Results(string url, YandexSearchFilterMode mode)
{
var results = (await _yandexImageSearch.ReverseImageSearchAsync(url, mode)).ToArray();
var results = await _yandexImageSearch.ReverseImageSearchAsync(url, mode);

Assert.NotNull(results);
Assert.NotEmpty(results);
Expand Down Expand Up @@ -104,88 +100,6 @@ public async Task ReverseImageSearchAsync_Throws_YandexException_If_Captcha_Is_P
await Assert.ThrowsAsync<YandexException>(() => task);
}

// Verifies that ReverseImageSearchAsync returns an empty result set when every
// "serp-item" element in the mocked response carries an unusable data-bem payload.
[Fact]
public async Task ReverseImageSearchAsync_Ignores_Invalid_Results()
{
// Five payloads, in order: an empty string, malformed JSON, a null img_href,
// a null snippet url, and a null snippet text.
string[] rawResults =
{
string.Empty,
"{[",
@"
{
""serp-item"":
{
""img_href"": null
}
}",
@"
{
""serp-item"":
{
""img_href"": ""https://example.com/image.png"",
""snippet"":
{
""url"": null,
""text"": ""sample text""
}
}
}",
@"
{
""serp-item"":
{
""img_href"": ""https://example.com/image.png"",
""snippet"":
{
""url"": ""https://example.com"",
""text"": null
}
}
}"
};

// Build a "serp-list" div with one "serp-item" child per payload,
// each payload stored in the child's data-bem attribute.
var context = BrowsingContext.New();
var document = await context.OpenNewAsync();
var serpList = document.CreateElement<IHtmlDivElement>();
serpList.ClassName = "serp-list";

serpList.Append(rawResults.Select(x =>
{
var item = document.CreateElement<IHtmlDivElement>();
item.ClassName = "serp-item";

item.SetAttribute("data-bem", x);
return (INode)item;
}).ToArray());

string html = serpList.ToHtml();

// Wrap the markup in a JSON body with the shape blocks[0].html.
// NOTE(review): the triple braces emit a literal '{' and '}' around the encoded HTML — confirm this is intended.
string json = $@"
{{
""blocks"":
[
{{
""html"": ""{{{JsonEncodedText.Encode(html)}}}""
}}
]
}}";

// Mock the HTTP handler so SendAsync returns a single 200 OK response containing the JSON above.
var messageHandlerMock = new Mock<HttpMessageHandler>();

messageHandlerMock
.Protected()
.As<HttpClient>()
.SetupSequence(x => x.SendAsync(It.IsAny<HttpRequestMessage>(), It.IsAny<CancellationToken>()))
.ReturnsAsync(new HttpResponseMessage(HttpStatusCode.OK) { Content = new StringContent(json) });

var yandexImageSearch = new YandexImageSearch(new HttpClient(messageHandlerMock.Object));

// ToArray forces enumeration of the returned sequence before asserting.
var results = (await yandexImageSearch.ReverseImageSearchAsync("https://example.com/image.png")).ToArray();

// None of the payloads should produce a result.
Assert.NotNull(results);
Assert.Empty(results);
}

[Fact]
public async Task Disposed_YandexImageSearch_Usage_Throws_ObjectDisposedException()
{
Expand Down Expand Up @@ -213,4 +127,18 @@ public void YandexException_Has_Expected_Values()
Assert.Equal("Custom message 2", exception3.Message);
Assert.Same(innerException, exception3.InnerException);
}

// Checks that HtmlEncodingConverter decodes HTML entities when deserializing a JSON string
// and that serializing through it throws NotSupportedException.
[Theory]
[InlineData("\"{title:&quot;a&quot;}\"", "{title:\"a\"}")]
[InlineData("\"D&amp;D\"", "D&D")]
public void HtmlEncodingConverter_Returns_Expected_Values(string encodedString, string decodedString)
{
    // Arrange: register the converter for all strings handled by these options.
    var serializerOptions = new JsonSerializerOptions
    {
        Converters = { new HtmlEncodingConverter() }
    };

    // Act
    string actual = JsonSerializer.Deserialize<string>(encodedString, serializerOptions)!;

    // Assert
    Assert.Equal(decodedString, actual);
    Assert.Throws<NotSupportedException>(() => JsonSerializer.Serialize(decodedString, serializerOptions));
}
}
4 changes: 2 additions & 2 deletions tests/Fergun.Tests/Utils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ public static IYandexImageSearch CreateMockedYandexImageSearchApi(Faker? faker =
yandexMock.Setup(x => x.OcrAsync(It.Is<string>(s => !string.IsNullOrEmpty(s)), It.IsAny<CancellationToken>())).ReturnsAsync(faker.Lorem.Sentence());
yandexMock.Setup(x => x.OcrAsync(It.Is<string>(s => s == "https://example.com/error"), It.IsAny<CancellationToken>())).ThrowsAsync(new YandexException("Error message."));

yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => s == string.Empty), It.IsAny<YandexSearchFilterMode>(), It.IsAny<CancellationToken>())).ReturnsAsync(Enumerable.Empty<IYandexReverseImageSearchResult>);
yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => !string.IsNullOrEmpty(s)), It.IsAny<YandexSearchFilterMode>(), It.IsAny<CancellationToken>())).ReturnsAsync(() => faker.MakeLazy(50, () => CreateMockedYandexReverseImageSearchResult(faker)));
yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => s == string.Empty), It.IsAny<YandexSearchFilterMode>(), It.IsAny<CancellationToken>())).ReturnsAsync(Array.Empty<IYandexReverseImageSearchResult>);
yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => !string.IsNullOrEmpty(s)), It.IsAny<YandexSearchFilterMode>(), It.IsAny<CancellationToken>())).ReturnsAsync(() => faker.MakeLazy(50, () => CreateMockedYandexReverseImageSearchResult(faker)).ToList());
yandexMock.Setup(x => x.ReverseImageSearchAsync(It.Is<string>(s => s == "https://example.com/error"), It.IsAny<YandexSearchFilterMode>(), It.IsAny<CancellationToken>())).ThrowsAsync(new YandexException("Error message."));

return yandexMock.Object;
Expand Down

0 comments on commit b9e8524

Please sign in to comment.