Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use custom collation for sorting LcmCrdt data #1291

Merged
merged 5 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
using FwDataMiniLcmBridge.Tests.Fixtures;

namespace FwDataMiniLcmBridge.Tests.MiniLcmTests;

/// <summary>
/// Runs the shared sorting tests against an API obtained from <see cref="ProjectLoaderFixture"/>.
/// </summary>
[Collection(ProjectLoaderFixture.Name)]
public class SortingTests(ProjectLoaderFixture fixture) : SortingTestsBase
{
    /// <summary>
    /// Asks the fixture for the "sorting-test" project API and returns it as an already-completed task.
    /// </summary>
    protected override Task<IMiniLcmApi> NewApi()
    {
        IMiniLcmApi projectApi = fixture.NewProjectApi("sorting-test", "en", "en");
        return Task.FromResult(projectApi);
    }
}
19 changes: 19 additions & 0 deletions backend/FwLite/LcmCrdt.Tests/MiniLcmTests/SortingTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace LcmCrdt.Tests.MiniLcmTests;

/// <summary>
/// Runs the shared sorting tests against the API exposed by a <see cref="MiniLcmApiFixture"/>
/// that is created and disposed together with this test instance.
/// </summary>
public class SortingTests : SortingTestsBase
{
    private readonly MiniLcmApiFixture _fixture = new();

    /// <summary>Initializes the fixture, then hands back its API.</summary>
    protected override async Task<IMiniLcmApi> NewApi()
    {
        await _fixture.InitializeAsync();
        return _fixture.Api;
    }

    /// <summary>Disposes the base test state first and the fixture afterwards.</summary>
    public override async Task DisposeAsync()
    {
        await base.DisposeAsync();
        await _fixture.DisposeAsync();
    }
}
4 changes: 2 additions & 2 deletions backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

public class CrdtMiniLcmApi(DataModel dataModel, CurrentProjectService projectService, LcmCrdtDbContext dbContext) : IMiniLcmApi
{
private Guid ClientId { get; } = projectService.ProjectData.ClientId;

Check warning on line 18 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FwHeadless / publish-fw-headless

Parameter 'dbContext' is unread.

Check warning on line 18 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FW Lite and run tests

Parameter 'dbContext' is unread.

Check warning on line 18 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Publish FW Lite app for Linux

Parameter 'dbContext' is unread.

Check warning on line 18 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Publish FW Lite app for Linux

Parameter 'dbContext' is unread.

Check warning on line 18 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Publish FW Lite app for Mac

Parameter 'dbContext' is unread.

Check warning on line 18 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Publish FW Lite app for Mac

Parameter 'dbContext' is unread.

Check warning on line 18 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Publish FW Lite app for Windows

Parameter 'dbContext' is unread.
public ProjectData ProjectData => projectService.ProjectData;

private IQueryable<Entry> Entries => dataModel.QueryLatest<Entry>().AsTracking(false);
Expand Down Expand Up @@ -230,15 +230,15 @@
queryable = queryable.WhereExemplar(ws.Value, options.Exemplar.Value);
}

var sortWs = (await GetWritingSystem(options.Order.WritingSystem, WritingSystemType.Vernacular))?.WsId;
var sortWs = (await GetWritingSystem(options.Order.WritingSystem, WritingSystemType.Vernacular));
if (sortWs is null)
throw new NullReferenceException($"sort writing system {options.Order.WritingSystem} not found");
queryable = queryable
.LoadWith(e => e.Senses).ThenLoad(s => s.ExampleSentences)
.LoadWith(e => e.ComplexForms)
.LoadWith(e => e.Components)
.AsQueryable()
.OrderBy(e => e.Headword(sortWs.Value))
.OrderBy(e => e.Headword(sortWs.WsId).CollateUnicode(sortWs))
.ThenBy(e => e.Id)
.Skip(options.Offset)
.Take(options.Count);
Expand Down Expand Up @@ -337,7 +337,7 @@
async IAsyncEnumerable<AddEntryComponentChange> ToComplexFormComponents(IList<ComplexFormComponent> complexFormComponents)
{
foreach (var complexFormComponent in complexFormComponents)
{

Check warning on line 340 in backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Publish FW Lite app for Mac

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
if (complexFormComponent.ComponentEntryId == default) complexFormComponent.ComponentEntryId = entry.Id;
if (complexFormComponent.ComplexFormEntryId == default) complexFormComponent.ComplexFormEntryId = entry.Id;
if (complexFormComponent.ComponentEntryId == complexFormComponent.ComplexFormEntryId)
Expand Down
152 changes: 152 additions & 0 deletions backend/FwLite/LcmCrdt/Data/SetupCollationInterceptor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
using System.Data;
using System.Data.Common;
using System.Globalization;
using System.Text;
using Microsoft.Data.Sqlite;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Diagnostics;
using Microsoft.Extensions.Caching.Memory;
using Microsoft.Extensions.Logging;

namespace LcmCrdt.Data;

public class SetupCollationInterceptor(IMemoryCache cache, ILogger<SetupCollationInterceptor> logger) : IDbConnectionInterceptor, ISaveChangesInterceptor
{
/// <summary>
/// Returns the writing systems stored in the database, cached per connection string
/// (see <c>CacheKey</c>) with a 30 minute sliding expiration.
/// </summary>
/// <param name="dbContext">Context used to query the writing systems on a cache miss.</param>
/// <param name="connection">Connection whose connection string identifies the cache entry.</param>
/// <returns>
/// The cached or freshly loaded writing systems; an empty array when the query fails with a
/// <see cref="SqliteException"/> or the cache yields <c>null</c>.
/// </returns>
private WritingSystem[] GetWritingSystems(LcmCrdtDbContext dbContext, DbConnection connection)
{
    //todo this needs to be invalidated when the writing systems change
    return cache.GetOrCreate(CacheKey(connection),
        entry =>
        {
            entry.SlidingExpiration = TimeSpan.FromMinutes(30);
            try
            {
                return dbContext.WritingSystems.ToArray();
            }
            catch (SqliteException)
            {
                // Best effort: the exception details are intentionally not used (the unused
                // variable was producing a compiler warning). Presumably this happens when the
                // table does not exist yet - TODO confirm. Note the empty result is cached too.
                return [];
            }
        }) ?? [];
}

/// <summary>
/// Builds the memory-cache key for the writing systems of <paramref name="connection"/>:
/// a fixed prefix followed by the connection string.
/// </summary>
private static string CacheKey(DbConnection connection) =>
    "writingSystems|" + connection.ConnectionString;

/// <summary>
/// Drops the cached writing systems that belong to <paramref name="connection"/>.
/// </summary>
private void InvalidateWritingSystemsCache(DbConnection connection)
{
    var key = CacheKey(connection);
    cache.Remove(key);
}

/// <summary>
/// Registers the collations on a freshly opened connection: one per known writing system,
/// plus a general-use collation named <see cref="SqlSortingExtensions.CollateUnicodeNoCase"/>
/// that compares with the current culture, ignoring case.
/// </summary>
/// <exception cref="InvalidOperationException">The event data carries no context.</exception>
public void ConnectionOpened(DbConnection connection, ConnectionEndEventData eventData)
{
    if ((LcmCrdtDbContext?)eventData.Context is not { } dbContext)
    {
        throw new InvalidOperationException("context is null");
    }

    var sqlite = (SqliteConnection)connection;
    var writingSystems = GetWritingSystems(dbContext, connection);
    SetupCollations(sqlite, writingSystems);

    //setup general use collation
    sqlite.CreateCollation(
        SqlSortingExtensions.CollateUnicodeNoCase,
        CultureInfo.CurrentCulture.CompareInfo,
        (comparer, left, right) => comparer.Compare(left, right, CompareOptions.IgnoreCase));
}

/// <summary>
/// Async counterpart of <see cref="ConnectionOpened"/>. The work is done synchronously and a
/// completed task is returned, so exceptions surface directly from this call rather than
/// through the returned task. <paramref name="cancellationToken"/> is not observed.
/// </summary>
public Task ConnectionOpenedAsync(
    DbConnection connection,
    ConnectionEndEventData eventData,
    CancellationToken cancellationToken = default)
{
    ConnectionOpened(connection, eventData);

    return Task.CompletedTask;
}

/// <summary>
/// Refreshes collations for writing systems that are about to be saved, then lets the save
/// continue with the unmodified <paramref name="result"/>.
/// </summary>
public InterceptionResult<int> SavingChanges(DbContextEventData eventData, InterceptionResult<int> result)
{
    var savingContext = eventData.Context;
    UpdateCollationsOnSave(savingContext);

    return result;
}

/// <summary>
/// Async counterpart of <see cref="SavingChanges"/>. The collation refresh runs synchronously
/// and the unmodified <paramref name="result"/> is returned in a completed
/// <see cref="ValueTask{TResult}"/>. <paramref name="cancellationToken"/> is not observed.
/// </summary>
public ValueTask<InterceptionResult<int>> SavingChangesAsync(
    DbContextEventData eventData,
    InterceptionResult<int> result,
    CancellationToken cancellationToken = default)
{
    var savingContext = eventData.Context;
    UpdateCollationsOnSave(savingContext);

    return new ValueTask<InterceptionResult<int>>(result);
}

/// <summary>
/// Registers a collation for every tracked <see cref="WritingSystem"/> that is added or
/// modified, and drops the cached writing systems when at least one was found.
/// Does nothing when <paramref name="dbContext"/> is <c>null</c>.
/// </summary>
private void UpdateCollationsOnSave(DbContext? dbContext)
{
    if (dbContext is null)
    {
        return;
    }

    var sqlite = (SqliteConnection)dbContext.Database.GetDbConnection();
    var pendingWritingSystems = dbContext.ChangeTracker
        .Entries<WritingSystem>()
        .Where(tracked => tracked.State is EntityState.Added or EntityState.Modified);

    // NOTE(review): the collation setup throws when the connection is not open - confirm that
    // saves touching writing systems always run on an already opened connection.
    var anyChanged = false;
    foreach (var tracked in pendingWritingSystems)
    {
        SetupCollation(sqlite, tracked.Entity);
        anyChanged = true;
    }

    if (!anyChanged)
    {
        return;
    }

    InvalidateWritingSystemsCache(sqlite);
}

/// <summary>
/// Registers one collation per entry of <paramref name="writingSystems"/> on
/// <paramref name="connection"/>, in array order.
/// </summary>
private void SetupCollations(SqliteConnection connection, WritingSystem[] writingSystems)
{
    for (var index = 0; index < writingSystems.Length; index++)
    {
        SetupCollation(connection, writingSystems[index]);
    }
}

/// <summary>
/// Registers the collation named by <see cref="SqlSortingExtensions.CollationName"/> for
/// <paramref name="writingSystem"/>. Strings are compared case-insensitively with the
/// culture created from the writing system code; when that culture cannot be created the
/// failure is logged and the invariant culture is used instead.
/// </summary>
private void SetupCollation(SqliteConnection connection, WritingSystem writingSystem)
{
    var comparer = ResolveCompareInfo();

    //todo use custom comparison based on the writing system
    CreateSpanCollation(
        connection,
        SqlSortingExtensions.CollationName(writingSystem),
        comparer,
        static (info, left, right) => info.Compare(left, right, CompareOptions.IgnoreCase));

    // Picks the culture-specific comparer, falling back to the invariant one on any failure.
    CompareInfo ResolveCompareInfo()
    {
        try
        {
            //todo use ICU/SLDR instead
            return CultureInfo.CreateSpecificCulture(writingSystem.WsId.Code).CompareInfo;
        }
        catch (Exception e)
        {
            logger.LogError(e, "Failed to create compare info for '{WritingSystemId}'", writingSystem.WsId);
            return CultureInfo.InvariantCulture.CompareInfo;
        }
    }
}

//this is a premature optimization, but it avoids creating strings for each comparison and instead uses spans which avoids allocations
//if the new comparison function does not support spans then we can use SqliteConnection.CreateCollation instead which works with strings
/// <summary>
/// Registers a collation called <paramref name="name"/> whose callback decodes both UTF-8
/// operands into character spans and passes them, together with <paramref name="state"/>,
/// to <paramref name="compare"/>.
/// </summary>
/// <param name="connection">An open SQLite connection to register the collation on.</param>
/// <param name="name">Name of the collation.</param>
/// <param name="state">State object handed back to <paramref name="compare"/> on every call.</param>
/// <param name="compare">Comparison over the decoded operands; its result is returned to SQLite.</param>
/// <exception cref="InvalidOperationException">The connection is not open.</exception>
private void CreateSpanCollation<T>(SqliteConnection connection,
    string name, T state,
    Func<T, ReadOnlySpan<char>, ReadOnlySpan<char>, int> compare)
{
    if (connection.State != ConnectionState.Open)
        throw new InvalidOperationException("Unable to create custom collation Connection must be open.");
    var rc = SQLitePCL.raw.sqlite3__create_collation_utf8(connection.Handle,
        name,
        Tuple.Create(state, compare),
        static (s, x, y) =>
        {
            // Operand length is controlled by database content, so only small buffers may live
            // on the stack; larger ones are rented to avoid overflowing the stack.
            const int maxStackChars = 256;

            var (state, compare) = (Tuple<T, Func<T, ReadOnlySpan<char>, ReadOnlySpan<char>, int>>) s;
            var xLength = Encoding.UTF8.GetCharCount(x);
            var yLength = Encoding.UTF8.GetCharCount(y);

            char[]? xRented = xLength > maxStackChars ? System.Buffers.ArrayPool<char>.Shared.Rent(xLength) : null;
            char[]? yRented = yLength > maxStackChars ? System.Buffers.ArrayPool<char>.Shared.Rent(yLength) : null;

            // Rented arrays may be longer than requested, so they are trimmed to the exact length.
            Span<char> xSpan = xRented is null ? stackalloc char[xLength] : xRented.AsSpan(0, xLength);
            Span<char> ySpan = yRented is null ? stackalloc char[yLength] : yRented.AsSpan(0, yLength);
            try
            {
                Encoding.UTF8.GetChars(x, xSpan);
                Encoding.UTF8.GetChars(y, ySpan);

                return compare(state, xSpan, ySpan);
            }
            finally
            {
                if (xRented is not null) System.Buffers.ArrayPool<char>.Shared.Return(xRented);
                if (yRented is not null) System.Buffers.ArrayPool<char>.Shared.Return(yRented);
            }
        });
    SqliteException.ThrowExceptionForRC(rc, connection.Handle);
}
}
30 changes: 30 additions & 0 deletions backend/FwLite/LcmCrdt/Data/SqlSortingExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System.Data.SQLite;
using System.Linq.Expressions;
using LinqToDB;
using SIL.WritingSystems;

namespace LcmCrdt.Data;

/// <summary>
/// Collation names and query helpers used to sort text with the custom SQLite collations.
/// </summary>
public static class SqlSortingExtensions
{
    /// <summary>Name of the general-use, case-insensitive collation.</summary>
    public const string CollateUnicodeNoCase = "NOCASE_UNICODE";

    /// <summary>
    /// Query-only marker: inside a LinqToDB query it is replaced by
    /// <see cref="CollateUnicodeExpression"/>; invoking it directly always throws.
    /// </summary>
    /// <exception cref="InvalidOperationException">Always, when called outside of a query.</exception>
    [ExpressionMethod(nameof(CollateUnicodeExpression))]
    internal static string CollateUnicode(this string value, WritingSystem ws) =>
        //could optionally just return the value here, but it would work differently than sql
        throw new InvalidOperationException("CollateUnicode is a LinqToDB only API.");

    /// <summary>
    /// Expression substituted for <see cref="CollateUnicode"/>: applies the collation named by
    /// <see cref="CollationName"/> for the given writing system.
    /// </summary>
    private static Expression<Func<string, WritingSystem, string>> CollateUnicodeExpression()
    {
        //todo maybe in the future we use a custom collation based on the writing system
        Expression<Func<string, WritingSystem, string>> collate =
            (s, ws) => s.Collate(CollationName(ws));
        return collate;
    }

    /// <summary>Returns the collation name for <paramref name="ws"/>, derived from its id.</summary>
    internal static string CollationName(WritingSystem ws) =>
        //don't use ':' in the name, it won't work
        $"NOCASE_WS_{ws.WsId}";
}
14 changes: 12 additions & 2 deletions backend/FwLite/LcmCrdt/LcmCrdtDbContext.cs
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
using System.Text.Json;
using System.Data.Common;
using System.Text.Json;
using LcmCrdt.Data;
using Microsoft.Data.Sqlite;
using SIL.Harmony;
using SIL.Harmony.Db;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Diagnostics;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using Microsoft.Extensions.Options;

namespace LcmCrdt;

public class LcmCrdtDbContext(DbContextOptions<LcmCrdtDbContext> dbContextOptions, IOptions<CrdtConfig> options): DbContext(dbContextOptions), ICrdtDbContext
public class LcmCrdtDbContext(DbContextOptions<LcmCrdtDbContext> dbContextOptions, IOptions<CrdtConfig> options, SetupCollationInterceptor setupCollationInterceptor)
: DbContext(dbContextOptions), ICrdtDbContext
{
public DbSet<ProjectData> ProjectData => Set<ProjectData>();
public IQueryable<WritingSystem> WritingSystems => Set<WritingSystem>().AsNoTracking();
protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
{
optionsBuilder.AddInterceptors(setupCollationInterceptor);
}

protected override void OnModelCreating(ModelBuilder modelBuilder)
{
Expand Down
2 changes: 2 additions & 0 deletions backend/FwLite/LcmCrdt/LcmCrdtKernel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using SIL.Harmony.Changes;
using LcmCrdt.Changes;
using LcmCrdt.Changes.Entries;
using LcmCrdt.Data;
using LcmCrdt.Objects;
using LcmCrdt.RemoteSync;
using LinqToDB;
Expand All @@ -27,6 +28,7 @@ public static IServiceCollection AddLcmCrdtClient(this IServiceCollection servic
{
LinqToDBForEFTools.Initialize();
services.AddMemoryCache();
services.AddSingleton<SetupCollationInterceptor>();
services.AddDbContext<LcmCrdtDbContext>(ConfigureDbOptions);
services.AddOptions<LcmCrdtConfig>().BindConfiguration("LcmCrdt");

Expand Down
62 changes: 62 additions & 0 deletions backend/FwLite/MiniLcm.Tests/SortingTestsBase.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
namespace MiniLcm.Tests;

/// <summary>
/// Shared tests checking that entries come back from the API sorted by headword,
/// regardless of the order they were created in.
/// </summary>
public abstract class SortingTestsBase : MiniLcmTestBase
{
    /// <summary>
    /// Runs the base initialization, then creates an analysis writing system ("en") followed by
    /// a vernacular one ("en-US").
    /// </summary>
    public override async Task InitializeAsync()
    {
        await base.InitializeAsync();
        await Api.CreateWritingSystem(WritingSystemType.Analysis,
            EnglishWritingSystem(WritingSystemType.Analysis, "en"));
        await Api.CreateWritingSystem(WritingSystemType.Vernacular,
            EnglishWritingSystem(WritingSystemType.Vernacular, "en-US"));
    }

    // Builds the English writing system used by these tests; only the type and id vary.
    private static WritingSystem EnglishWritingSystem(WritingSystemType type, string wsId) => new()
    {
        Id = Guid.NewGuid(),
        Type = type,
        WsId = wsId,
        Name = "English",
        Abbreviation = "En",
        Font = "Arial",
        Exemplars = []
    };

    // Creates an entry whose lexeme form for "en" is the given headword.
    // NOTE(review): the vernacular writing system created above is "en-US" while entries are
    // keyed by "en" - confirm the fixtures provide an "en" vernacular writing system.
    private Task CreateEntry(string headword) =>
        Api.CreateEntry(new() { LexemeForm = { { "en", headword } }, });


    // ReSharper disable InconsistentNaming
    const string Ru_A= "\u0410";
    const string Ru_a = "\u0430";
    const string Ru_Б= "\u0411";
    const string Ru_б = "\u0431";
    const string Ru_В= "\u0412";
    const string Ru_в = "\u0432";
    // ReSharper restore InconsistentNaming

    /// <summary>
    /// Creates the comma separated <paramref name="headwords"/> in random order and expects the
    /// API to return them in exactly the order they are listed in.
    /// </summary>
    [Theory]
    [InlineData("aa,ab,ac")]
    [InlineData("aa,Ab,ac")]
    [InlineData($"{Ru_a}{Ru_a},{Ru_a}{Ru_б},{Ru_a}{Ru_в}")]
    [InlineData($"{Ru_a}{Ru_a},{Ru_A}{Ru_б},{Ru_a}{Ru_в}")]
    public async Task EntriesAreSorted(string headwords)
    {
        var expectedOrder = headwords.Split(',');
        var shuffled = expectedOrder.OrderBy(_ => Random.Shared.Next());
        foreach (var word in shuffled)
        {
            await CreateEntry(word);
        }

        var actualOrder = await Api.GetEntries().Select(entry => entry.Headword()).ToArrayAsync();
        actualOrder.Should().Equal(expectedOrder);
    }
}
4 changes: 2 additions & 2 deletions backend/LfClassicData/LfClassicMiniLcmApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ private async IAsyncEnumerable<Entry> Query(QueryOptions? options = null, string
new BsonDocument("$ne", new BsonArray { new BsonDocument("$trim", new BsonDocument("input", $"$citationForm.{sortWs}.value")), "" }),
})
},
{ "then", $"$citationForm.{sortWs}.value" },
{ "then", new BsonDocument("$toLower", $"$citationForm.{sortWs}.value") },
{ "else", new BsonDocument("$cond", new BsonDocument
{
{ "if", new BsonDocument("$and", new BsonArray
Expand All @@ -216,7 +216,7 @@ private async IAsyncEnumerable<Entry> Query(QueryOptions? options = null, string
new BsonDocument("$ne", new BsonArray { new BsonDocument("$trim", new BsonDocument("input", $"$lexeme.{sortWs}.value")), "" }),
})
},
{ "then", $"$lexeme.{sortWs}.value" },
{ "then", new BsonDocument("$toLower", $"$lexeme.{sortWs}.value") },
{ "else", "" }
})
}
Expand Down
Loading