Skip to content

Commit

Permalink
use custom collation for sorting LcmCrdt data (#1291)
Browse files Browse the repository at this point in the history
* define custom collations per writing system based on wsId and use for sorting

* use a span-based comparison overload for custom collation to avoid allocating strings

* convert headword to lowercase when sorting in mongo

---------

Co-authored-by: Tim Haasdyk <[email protected]>
  • Loading branch information
hahn-kev and myieye authored Nov 29, 2024
1 parent 94ac2a7 commit 62e2c19
Show file tree
Hide file tree
Showing 9 changed files with 293 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
using FwDataMiniLcmBridge.Tests.Fixtures;

namespace FwDataMiniLcmBridge.Tests.MiniLcmTests;

/// <summary>
/// Runs the shared <see cref="SortingTestsBase"/> tests against an API obtained from the
/// <see cref="ProjectLoaderFixture"/>.
/// </summary>
[Collection(ProjectLoaderFixture.Name)]
public class SortingTests(ProjectLoaderFixture fixture) : SortingTestsBase
{
    /// <summary>
    /// Asks the fixture for the "sorting-test" project API (passing "en" for both remaining arguments)
    /// and returns it as an already completed task.
    /// </summary>
    protected override Task<IMiniLcmApi> NewApi()
    {
        IMiniLcmApi projectApi = fixture.NewProjectApi("sorting-test", "en", "en");
        return Task.FromResult(projectApi);
    }
}
19 changes: 19 additions & 0 deletions backend/FwLite/LcmCrdt.Tests/MiniLcmTests/SortingTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace LcmCrdt.Tests.MiniLcmTests;

/// <summary>
/// Runs the shared <see cref="SortingTestsBase"/> tests against the API exposed by a
/// <see cref="MiniLcmApiFixture"/> owned by this test class.
/// </summary>
public class SortingTests : SortingTestsBase
{
    private readonly MiniLcmApiFixture _fixture = new();

    /// <summary>Initializes the fixture and then hands out its API.</summary>
    protected override async Task<IMiniLcmApi> NewApi()
    {
        await _fixture.InitializeAsync();
        return _fixture.Api;
    }

    /// <summary>Runs the base class cleanup first, then disposes the fixture this class created.</summary>
    public override async Task DisposeAsync()
    {
        await base.DisposeAsync();
        await _fixture.DisposeAsync();
    }
}
4 changes: 2 additions & 2 deletions backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -233,15 +233,15 @@ private async IAsyncEnumerable<Entry> GetEntries(
queryable = queryable.WhereExemplar(ws.Value, options.Exemplar.Value);
}

var sortWs = (await GetWritingSystem(options.Order.WritingSystem, WritingSystemType.Vernacular))?.WsId;
var sortWs = (await GetWritingSystem(options.Order.WritingSystem, WritingSystemType.Vernacular));
if (sortWs is null)
throw new NullReferenceException($"sort writing system {options.Order.WritingSystem} not found");
queryable = queryable
.LoadWith(e => e.Senses).ThenLoad(s => s.ExampleSentences)
.LoadWith(e => e.ComplexForms)
.LoadWith(e => e.Components)
.AsQueryable()
.OrderBy(e => e.Headword(sortWs.Value))
.OrderBy(e => e.Headword(sortWs.WsId).CollateUnicode(sortWs))
.ThenBy(e => e.Id)
.Skip(options.Offset)
.Take(options.Count);
Expand Down
152 changes: 152 additions & 0 deletions backend/FwLite/LcmCrdt/Data/SetupCollationInterceptor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
using System.Data;
using System.Data.Common;
using System.Globalization;
using System.Text;
using Microsoft.Data.Sqlite;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Diagnostics;
using Microsoft.Extensions.Caching.Memory;
using Microsoft.Extensions.Logging;

namespace LcmCrdt.Data;

/// <summary>
/// EF Core interceptor that registers custom SQLite collations on a connection:
/// one collation per writing system (named by <see cref="SqlSortingExtensions.CollationName"/>)
/// plus the general purpose <see cref="SqlSortingExtensions.CollateUnicodeNoCase"/> collation.
/// Collations are registered when a connection is opened and again for writing systems that are
/// added or modified in a save.
/// </summary>
public class SetupCollationInterceptor(IMemoryCache cache, ILogger<SetupCollationInterceptor> logger) : IDbConnectionInterceptor, ISaveChangesInterceptor
{
    /// <summary>
    /// Largest number of UTF-16 chars per operand that the collation callback decodes into a stack buffer.
    /// Longer operands are decoded into a heap array so that the stack usage stays bounded
    /// regardless of the size of the values stored in the database.
    /// </summary>
    private const int MaxStackallocChars = 256;

    /// <summary>
    /// Returns the writing systems of the database behind <paramref name="connection"/>,
    /// cached per connection string with a 30 minute sliding expiration.
    /// </summary>
    /// <returns>The writing systems, or an empty array when they could not be queried.</returns>
    private WritingSystem[] GetWritingSystems(LcmCrdtDbContext dbContext, DbConnection connection)
    {
        //the cache entry is removed by UpdateCollationsOnSave when a writing system is added or modified in a save
        return cache.GetOrCreate(CacheKey(connection),
            entry =>
            {
                entry.SlidingExpiration = TimeSpan.FromMinutes(30);
                try
                {
                    return dbContext.WritingSystems.ToArray();
                }
                catch (SqliteException e)
                {
                    //best effort: the connection is still usable without the writing system collations
                    //NOTE(review): the empty result is cached like a successful one, confirm that is intended
                    logger.LogDebug(e, "Failed to load writing systems, no writing system collations will be set up");
                    return [];
                }
            }) ?? [];
    }

    /// <summary>Builds the cache key for the writing systems of a connection, based on its connection string.</summary>
    private static string CacheKey(DbConnection connection)
    {
        return $"writingSystems|{connection.ConnectionString}";
    }

    /// <summary>Drops the cached writing systems for <paramref name="connection"/> so they get queried again.</summary>
    private void InvalidateWritingSystemsCache(DbConnection connection)
    {
        cache.Remove(CacheKey(connection));
    }

    /// <summary>
    /// Registers the writing system collations and the general purpose collation on the freshly opened connection.
    /// </summary>
    /// <exception cref="InvalidOperationException">The event data carries no DbContext.</exception>
    public void ConnectionOpened(DbConnection connection, ConnectionEndEventData eventData)
    {
        var context = (LcmCrdtDbContext?)eventData.Context;
        if (context is null) throw new InvalidOperationException("context is null");
        var sqliteConnection = (SqliteConnection)connection;
        SetupCollations(sqliteConnection, GetWritingSystems(context, connection));

        //setup general use collation
        sqliteConnection.CreateCollation(SqlSortingExtensions.CollateUnicodeNoCase,
            CultureInfo.CurrentCulture.CompareInfo,
            (compareInfo, x, y) => compareInfo.Compare(x, y, CompareOptions.IgnoreCase));
    }

    /// <summary>Async variant, the work is done synchronously by <see cref="ConnectionOpened"/>.</summary>
    public Task ConnectionOpenedAsync(DbConnection connection,
        ConnectionEndEventData eventData,
        CancellationToken cancellationToken = default)
    {
        ConnectionOpened(connection, eventData);
        return Task.CompletedTask;
    }

    /// <summary>Registers collations for changed writing systems before the save, the result is passed through unchanged.</summary>
    public InterceptionResult<int> SavingChanges(DbContextEventData eventData, InterceptionResult<int> result)
    {
        UpdateCollationsOnSave(eventData.Context);
        return result;
    }

    /// <summary>Async variant of <see cref="SavingChanges"/>, the work itself is done synchronously.</summary>
    public ValueTask<InterceptionResult<int>> SavingChangesAsync(DbContextEventData eventData,
        InterceptionResult<int> result,
        CancellationToken cancellationToken = default)
    {
        UpdateCollationsOnSave(eventData.Context);
        return ValueTask.FromResult(result);
    }

    /// <summary>
    /// Registers a collation for every tracked writing system that is added or modified
    /// and invalidates the writing system cache when there was at least one.
    /// </summary>
    private void UpdateCollationsOnSave(DbContext? dbContext)
    {
        if (dbContext is null) return;
        var connection = (SqliteConnection)dbContext.Database.GetDbConnection();
        bool updateWs = false;
        foreach (var entityEntry in dbContext.ChangeTracker.Entries<WritingSystem>())
        {
            if (entityEntry.State is EntityState.Added or EntityState.Modified)
            {
                var writingSystem = entityEntry.Entity;
                SetupCollation(connection, writingSystem);
                updateWs = true;
            }
        }

        if (updateWs)
        {
            InvalidateWritingSystemsCache(connection);
        }
    }

    /// <summary>Registers one collation per writing system on the connection.</summary>
    private void SetupCollations(SqliteConnection connection, WritingSystem[] writingSystems)
    {
        foreach (var writingSystem in writingSystems)
        {
            SetupCollation(connection, writingSystem);
        }
    }

    /// <summary>
    /// Registers a case insensitive collation for the writing system, comparing with the culture
    /// created from its WsId code. Falls back to the invariant culture when that culture can not be created.
    /// </summary>
    private void SetupCollation(SqliteConnection connection, WritingSystem writingSystem)
    {
        CompareInfo compareInfo;
        try
        {
            //todo use ICU/SLDR instead
            compareInfo = CultureInfo.CreateSpecificCulture(writingSystem.WsId.Code).CompareInfo;
        }
        catch (Exception e)
        {
            logger.LogError(e, "Failed to create compare info for '{WritingSystemId}'", writingSystem.WsId);
            compareInfo = CultureInfo.InvariantCulture.CompareInfo;
        }

        //todo use custom comparison based on the writing system
        CreateSpanCollation(connection, SqlSortingExtensions.CollationName(writingSystem),
            compareInfo,
            static (compareInfo, x, y) => compareInfo.Compare(x, y, CompareOptions.IgnoreCase));
    }

    //this is a premature optimization, but it avoids creating strings for each comparison and instead uses spans which avoids allocations
    //if the new comparison function does not support spans then we can use SqliteConnection.CreateCollation instead which works with strings
    /// <summary>
    /// Registers a collation named <paramref name="name"/> whose operands are decoded from UTF-8 into char spans
    /// and handed to <paramref name="compare"/> together with <paramref name="state"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">The connection is not open.</exception>
    private void CreateSpanCollation<T>(SqliteConnection connection,
        string name, T state,
        Func<T, ReadOnlySpan<char>, ReadOnlySpan<char>, int> compare)
    {
        if (connection.State != ConnectionState.Open)
            throw new InvalidOperationException("Unable to create custom collation Connection must be open.");
        var rc = SQLitePCL.raw.sqlite3__create_collation_utf8(connection.Handle,
            name,
            Tuple.Create(state, compare),
            static (s, x, y) =>
            {
                var (state, compare) = (Tuple<T, Func<T, ReadOnlySpan<char>, ReadOnlySpan<char>, int>>) s;
                var xLength = Encoding.UTF8.GetCharCount(x);
                var yLength = Encoding.UTF8.GetCharCount(y);
                //the operand size comes from the data being compared, so only small operands may use the stack,
                //an unbounded stackalloc could overflow the stack on a long value
                Span<char> xSpan = xLength <= MaxStackallocChars ? stackalloc char[xLength] : new char[xLength];
                Span<char> ySpan = yLength <= MaxStackallocChars ? stackalloc char[yLength] : new char[yLength];
                Encoding.UTF8.GetChars(x, xSpan);
                Encoding.UTF8.GetChars(y, ySpan);

                return compare(state, xSpan, ySpan);
            });
        SqliteException.ThrowExceptionForRC(rc, connection.Handle);
    }
}
30 changes: 30 additions & 0 deletions backend/FwLite/LcmCrdt/Data/SqlSortingExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System.Data.SQLite;
using System.Linq.Expressions;
using LinqToDB;
using SIL.WritingSystems;

namespace LcmCrdt.Data;

/// <summary>
/// Names of the custom collations and the LinqToDB helper used to apply a writing system collation in a query.
/// </summary>
public static class SqlSortingExtensions
{
    /// <summary>Name of the general purpose collation.</summary>
    public const string CollateUnicodeNoCase = "NOCASE_UNICODE";

    /// <summary>
    /// Query-only marker: LinqToDB substitutes the expression returned by <see cref="CollateUnicodeExpression"/>.
    /// Calling it directly always throws.
    /// </summary>
    /// <exception cref="InvalidOperationException">Always, when invoked outside of a LinqToDB query.</exception>
    //could optionally just return the value here, but it would work differently than sql
    [ExpressionMethod(nameof(CollateUnicodeExpression))]
    internal static string CollateUnicode(this string value, WritingSystem ws) =>
        throw new InvalidOperationException("CollateUnicode is a LinqToDB only API.");

    /// <summary>Expression that collates the value with the collation named after the writing system.</summary>
    //todo maybe in the future we use a custom collation based on the writing system
    private static Expression<Func<string, WritingSystem, string>> CollateUnicodeExpression() =>
        (s, ws) => s.Collate(CollationName(ws));

    /// <summary>Builds the collation name for a writing system from its WsId.</summary>
    //don't use ':' in the name, it won't work
    internal static string CollationName(WritingSystem ws) => $"NOCASE_WS_{ws.WsId}";
}
14 changes: 12 additions & 2 deletions backend/FwLite/LcmCrdt/LcmCrdtDbContext.cs
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
using System.Text.Json;
using System.Data.Common;
using System.Text.Json;
using LcmCrdt.Data;
using Microsoft.Data.Sqlite;
using SIL.Harmony;
using SIL.Harmony.Db;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Diagnostics;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using Microsoft.Extensions.Options;

namespace LcmCrdt;

public class LcmCrdtDbContext(DbContextOptions<LcmCrdtDbContext> dbContextOptions, IOptions<CrdtConfig> options): DbContext(dbContextOptions), ICrdtDbContext
public class LcmCrdtDbContext(DbContextOptions<LcmCrdtDbContext> dbContextOptions, IOptions<CrdtConfig> options, SetupCollationInterceptor setupCollationInterceptor)
: DbContext(dbContextOptions), ICrdtDbContext
{
public DbSet<ProjectData> ProjectData => Set<ProjectData>();
public IQueryable<WritingSystem> WritingSystems => Set<WritingSystem>().AsNoTracking();
protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
{
optionsBuilder.AddInterceptors(setupCollationInterceptor);
}

protected override void OnModelCreating(ModelBuilder modelBuilder)
{
Expand Down
2 changes: 2 additions & 0 deletions backend/FwLite/LcmCrdt/LcmCrdtKernel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using SIL.Harmony.Changes;
using LcmCrdt.Changes;
using LcmCrdt.Changes.Entries;
using LcmCrdt.Data;
using LcmCrdt.Objects;
using LcmCrdt.RemoteSync;
using LinqToDB;
Expand All @@ -28,6 +29,7 @@ public static IServiceCollection AddLcmCrdtClient(this IServiceCollection servic
{
LinqToDBForEFTools.Initialize();
services.AddMemoryCache();
services.AddSingleton<SetupCollationInterceptor>();
services.AddDbContext<LcmCrdtDbContext>(ConfigureDbOptions);
services.AddOptions<LcmCrdtConfig>().BindConfiguration("LcmCrdt");

Expand Down
62 changes: 62 additions & 0 deletions backend/FwLite/MiniLcm.Tests/SortingTestsBase.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
namespace MiniLcm.Tests;

/// <summary>
/// Shared sorting tests: entries are created in random order and must come back from
/// <c>Api.GetEntries()</c> in the order given by the test data.
/// </summary>
public abstract class SortingTestsBase : MiniLcmTestBase
{
    // ReSharper disable InconsistentNaming
    const string Ru_A= "\u0410";
    const string Ru_a = "\u0430";
    const string Ru_Б= "\u0411";
    const string Ru_б = "\u0431";
    const string Ru_В= "\u0412";
    const string Ru_в = "\u0432";
    // ReSharper restore InconsistentNaming

    /// <summary>
    /// After the base initialization, creates an "en" analysis and an "en-US" vernacular writing system.
    /// </summary>
    public override async Task InitializeAsync()
    {
        await base.InitializeAsync();
        await Api.CreateWritingSystem(WritingSystemType.Analysis,
            EnglishWritingSystem(WritingSystemType.Analysis, "en"));
        await Api.CreateWritingSystem(WritingSystemType.Vernacular,
            EnglishWritingSystem(WritingSystemType.Vernacular, "en-US"));
    }

    // Builds an "English" writing system with a fresh id for the given type and WsId.
    private static WritingSystem EnglishWritingSystem(WritingSystemType type, string wsId)
    {
        return new WritingSystem()
        {
            Id = Guid.NewGuid(),
            Type = type,
            WsId = wsId,
            Name = "English",
            Abbreviation = "En",
            Font = "Arial",
            Exemplars = []
        };
    }

    // Creates an entry whose "en" lexeme form is the given headword.
    private Task CreateEntry(string headword) =>
        Api.CreateEntry(new() { LexemeForm = { { "en", headword } }, });

    /// <summary>
    /// <paramref name="headwords"/> is a comma separated list in the expected sort order,
    /// the entries are inserted shuffled so the insertion order can not produce the expected result by accident.
    /// </summary>
    [Theory]
    [InlineData("aa,ab,ac")]
    [InlineData("aa,Ab,ac")]
    [InlineData($"{Ru_a}{Ru_a},{Ru_a}{Ru_б},{Ru_a}{Ru_в}")]
    [InlineData($"{Ru_a}{Ru_a},{Ru_A}{Ru_б},{Ru_a}{Ru_в}")]
    public async Task EntriesAreSorted(string headwords)
    {
        string[] expectedOrder = headwords.Split(',');
        var shuffled = expectedOrder.OrderBy(_ => Random.Shared.Next());
        foreach (var word in shuffled)
        {
            await CreateEntry(word);
        }

        var actualOrder = await Api.GetEntries()
            .Select(entry => entry.Headword())
            .ToArrayAsync();
        actualOrder.Should().Equal(expectedOrder);
    }
}
4 changes: 2 additions & 2 deletions backend/LfClassicData/LfClassicMiniLcmApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ private async IAsyncEnumerable<Entry> Query(QueryOptions? options = null, string
new BsonDocument("$ne", new BsonArray { new BsonDocument("$trim", new BsonDocument("input", $"$citationForm.{sortWs}.value")), "" }),
})
},
{ "then", $"$citationForm.{sortWs}.value" },
{ "then", new BsonDocument("$toLower", $"$citationForm.{sortWs}.value") },
{ "else", new BsonDocument("$cond", new BsonDocument
{
{ "if", new BsonDocument("$and", new BsonArray
Expand All @@ -216,7 +216,7 @@ private async IAsyncEnumerable<Entry> Query(QueryOptions? options = null, string
new BsonDocument("$ne", new BsonArray { new BsonDocument("$trim", new BsonDocument("input", $"$lexeme.{sortWs}.value")), "" }),
})
},
{ "then", $"$lexeme.{sortWs}.value" },
{ "then", new BsonDocument("$toLower", $"$lexeme.{sortWs}.value") },
{ "else", "" }
})
}
Expand Down

0 comments on commit 62e2c19

Please sign in to comment.