Skip to content

Commit

Permalink
Add llm_pgettext translation to translate via LLM
Browse files Browse the repository at this point in the history
  • Loading branch information
anoek committed Sep 6, 2024
1 parent 70b7ade commit f23c347
Show file tree
Hide file tree
Showing 10 changed files with 254 additions and 10 deletions.
1 change: 1 addition & 0 deletions .vscode/cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
"nums",
"offtopic",
"omnisearch",
"openai",
"opengotha",
"pgettext",
"playouts",
Expand Down
7 changes: 6 additions & 1 deletion Makefile.production
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ dev-put:
dev-del:
curl -X DELETE $(BETA_INDEX_HEADERS) http://localhost:1080/_index

beta-put: sync-translations audit-translations jsonify-po-files push-cdn upload-sentry-source-maps
beta-put: sync-translations create-unified-pot audit-translations jsonify-po-files push-cdn upload-sentry-source-maps
OGS_VERSION_HASH=$(OGS_VERSION_HASH) VENDOR_HASH=$(VENDOR_HASH) npm run minify-index --silent | curl -X PUT $(BETA_INDEX_HEADERS) -d @- https://beta.online-go.com/_index
make beta-notify-slack

Expand Down Expand Up @@ -82,6 +82,11 @@ sync-translations:
cd i18n/build/; ssh pootle@pootle "bash -c 'cd online-go.com/i18n/build; tar jcf - *'" | tar jxf -
cd i18n/locale/; ssh pootle@pootle "bash -c 'cd online-go.com/i18n/locale; tar jcf - *'" | tar jxf -

create-unified-pot:
# This script runs both on the pootle server and here. The pootle server does most of
# the work; this invocation extracts the strings that need to be translated by LLMs.
cd i18n; node create-unified-pot.js llm-translation-extraction

audit-translations:
cd i18n; node audit-translations.js

Expand Down
16 changes: 16 additions & 0 deletions i18n/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,20 @@ build/countries.json: cldr/common
mkdir -p build
./gen-country-lists.py

dev-run-translation-scripts:
cp ../dist/ogs.js build/ogs.strings.js
cp ../dist/ogs.js.map build/ogs.strings.js.map
sed -i 's/ogs.js.map/ogs.strings.js.map/' build/ogs.strings.js
python translation-code-replace-for-parsing.py
# The create-unified-pot script is typically run on the translation server
node create-unified-pot.js llm-translation-extraction
node audit-translations.js
node jsonify-po-files.js

audit:
node audit-translations.js

po:
node jsonify-po-files.js

-include Makefile.production
67 changes: 66 additions & 1 deletion i18n/audit-translations.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const deepl = require("deepl-node");
const fs = require("fs");
const PO = require("pofile");
const GoogleTranslate = require("@google-cloud/translate").v3.TranslationServiceClient;
const OpenAI = require("openai");

let keys = fs.existsSync("./keys.json") ? JSON.parse(fs.readFileSync("./keys.json")) : null;

Expand All @@ -17,6 +18,9 @@ const googleTranslate = keys
})
: null;

const openai = keys ? new OpenAI({ apiKey: keys.openai_api_key }) : null;
const OPENAI_MODEL = "gpt-4o";

let limit = 1;

/* We use emoji as placeholders for our auto-translations because the
Expand Down Expand Up @@ -166,7 +170,16 @@ async function main() {
JSON.stringify(translations_missing, null, 4),
);

if (deepl_translator && googleTranslate) {
// LLM translations
let llm_translations_needed = JSON.parse(fs.readFileSync("./build/llm-keys.json", "utf-8"));
for (let key in llm_translations_needed) {
for (let lang in languages) {
llm_translate(key, llm_translations_needed[key], lang, languages[lang]);
}
}

// Auto translate missing strs with deepl or google depending on language support
if (deepl_translator && googleTranslate && openai) {
if (Object.keys(vandalized_languages).length > 0) {
console.error(
`Critical error: ${
Expand Down Expand Up @@ -387,6 +400,58 @@ function detect_profanity(lang, msg) {
return false;
}

//function llm_translate(entry: {msgctxt: string, msgid: string}) {
let LLM_CACHE = undefined;
async function llm_translate(key, entry, lang, language) {
if (language === "Debug") {
return "[" + entry.msgid + "]";
}

if (!LLM_CACHE) {
if (fs.existsSync("./llm-keys-cache.json")) {
LLM_CACHE = JSON.parse(fs.readFileSync("./llm-keys-cache.json", "utf-8"));
} else {
LLM_CACHE = {};
}
}

if (!(lang in LLM_CACHE)) {
LLM_CACHE[lang] = {};
}

if (key in LLM_CACHE[lang]) {
return LLM_CACHE[lang][key];
}

let completion = await openai.chat.completions.create({
messages: [
{
role: "system",
content:
"You are translating user interface strings from English to " +
language +
". Only include the translation in your response.",
},
{
role: "system",
content: "The context provided for the string is: " + entry.msgctxt ?? "",
},
{
role: "system",
content: "Translate the provided string from English to " + language,
},
{ role: "user", content: entry.msgid },
],
model: OPENAI_MODEL,
});

let translation = completion.choices[0].message.content;
LLM_CACHE[lang][key] = translation;
fs.writeFileSync("./llm-keys-cache.json", JSON.stringify(LLM_CACHE, null, 4));
console.log("LLM translation", entry.msgid, " -> ", translation);
return translation;
}

main()
.then(() => console.log("Done"))
.catch((err) => console.error(err));
52 changes: 49 additions & 3 deletions i18n/create-unified-pot.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
/*
This script is executed in two places.
The first is on our translation server in a cron job to recompute translation
strings and insert them into the pootle server for translation.
The second is during the deployment process prior to audit-translations, which
performs the automatic translation of strings.
*/

"use strict";

const fs = require("fs");
const XGettext = require("xgettext-js");
const SourceMapConsumer = require("source-map").SourceMapConsumer;
const PO = require("pofile");

// The run mode is taken from the first command line argument and falls back to
// "full" when none is supplied. Anything other than the two known modes aborts.
const MODE = process.argv[2] || "full";

switch (MODE) {
    case "full":
    case "llm-translation-extraction":
        break;
    default:
        console.error('Invalid mode, expecting "full" or "llm-translation-extraction"');
        process.exit(1);
}

console.log("Running in mode:", MODE);

main();

function pseudo_translate(str) {
Expand Down Expand Up @@ -46,6 +65,7 @@ function main() {

function prep(match) {
let ret = {
llm: false,
line: match.line,
column: match.column,
source: sourcemap.originalPositionFor({
Expand Down Expand Up @@ -78,6 +98,17 @@ function main() {
return ret;
}

function llm_ctxt(match) {
let ret = prep(match);
ret.llm = true;
ret.msgctxt = match.arguments[0].value;
ret.msgid = match.arguments[1].value;
if (match.arguments.length > 2) {
ret.msgid_plural = match.arguments[2].value;
}
return ret;
}

let source = data;
let parser = new XGettext({
keywords: {
Expand All @@ -97,6 +128,7 @@ function main() {
}
let po_items = {};
let ui_only_keys = {};
let llm_keys = {};
for (let item of po.items) {
let key = item.msgctxt ? item.msgctxt + "\x04" : "";
key += item.msgid;
Expand All @@ -109,6 +141,7 @@ function main() {

for (let m of parser.getMatches(source)) {
if (m.msgid == "") {
console.log("Skipping blank translation");
console.log(m);
continue;
}
Expand All @@ -124,6 +157,14 @@ function main() {
}

ui_only_keys[key] = 1;
if (m.llm) {
llm_keys[key] = {
msgctxt: m.msgctxt,
msgid: m.msgid,
msgid_plural: m.msgid_plural,
};
continue;
}

if (!(key in po_items)) {
po_items[key] = new PO.Item();
Expand All @@ -145,9 +186,14 @@ function main() {
}
}

fs.writeFile("build/llm-keys.json", JSON.stringify(llm_keys), () =>
console.log("build/llm-keys-ui-keys.json written"),
);
fs.writeFileSync("build/llm-keys.json", JSON.stringify(llm_keys, undefined, 4));
console.log("build/llm-keys-ui-keys.json written");

if (MODE === "llm-translation-extraction") {
console.log("llm-translation-extraction mode complete, exiting");
process.exit(0);
}

fs.writeFile("build/ogs-ui-keys.json", JSON.stringify(ui_only_keys), () =>
console.log("build/ogs-ui-keys.json written"),
);
Expand Down
16 changes: 16 additions & 0 deletions i18n/jsonify-po-files.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ async function main() {
const autotranslations = fs.existsSync("./autotranslations.json")
? JSON.parse(await fs.promises.readFile("./autotranslations.json", "utf-8"))
: {};
const llm_cache = fs.existsSync("./llm-keys-cache.json")
? JSON.parse(await fs.promises.readFile("./llm-keys-cache.json", "utf-8"))
: {};
const llm_needed = fs.existsSync("./build/llm-keys.json")
? JSON.parse(await fs.promises.readFile("./build/llm-keys.json", "utf-8"))
: {};

for (let lang in languages) {
console.log(`Processing ${lang}`);
Expand Down Expand Up @@ -57,6 +63,16 @@ async function main() {
}
}

if (lang in llm_cache) {
for (let key in llm_needed) {
if (key in llm_cache[lang]) {
result[key] = [llm_cache[lang][key]];
} else {
console.error(`Missing LLM translation for ${key}`);
}
}
}

let json = JSON.stringify(result, undefined, 1);

let country_map = {};
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
"jest-chain": "^1.1.6",
"jest-environment-jsdom": "^29.7.0",
"lint-staged": "^15.2.2",
"openai": "^4.57.3",
"pofile": "^1.1.0",
"postcss": "^8.4.16",
"postcss-inline-svg": "^5.0.0",
Expand Down
14 changes: 14 additions & 0 deletions src/lib/translate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,20 @@ export function pgettext(context: string, msgid: string) {
return debug_wrap(msgid);
}

/**
 * Like pgettext(), but these strings will be automatically translated instead of passed along to our volunteers.
 *
 * The context is fed into the LLM system as general instructions and context.
 * The msgid is the message that needs to be translated.
 *
 * Catalog keys follow the gettext convention: `context + "\x04" + msgid` when a
 * context is given, and the bare `msgid` when the context is empty.
 *
 * @param context Instructions/context for the translation; may be "".
 * @param msgid   The English string to translate.
 * @returns The translation from the catalog, or the debug-wrapped msgid when
 *          the catalog has no entry for this string.
 */
export function llm_pgettext(context: string, msgid: string): string {
    // The separator must be omitted for an empty context, otherwise the key
    // would not match the one the string was stored under.
    const key = context ? context + "\x04" + msgid : msgid;
    if (key in catalog) {
        return catalog[key][0];
    }
    return debug_wrap(msgid);
}

/**
* Like pgettext() but for plural forms.
*/
Expand Down
7 changes: 4 additions & 3 deletions src/views/HelpFlows/GameLogHelp.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import React from "react";

import { HelpFlow, HelpItem } from "react-dynamic-help";

import { _, pgettext } from "translate";
import { llm_pgettext } from "translate";

/**
* A help flow intended for moderators and CMs wondering about mysterious auto score entries
Expand All @@ -31,11 +31,12 @@ export function GameLogHelp(): JSX.Element {
id="game-log-help"
showInitially={true}
debug={true}
description={pgettext("Name of a dynamic help flow", "Game Log Help")}
description={llm_pgettext("Name of a dynamic help flow", "Game Log Help")}
>
<HelpItem target="autoscore-game-log-entry" position={"bottom-center"}>
<div>
{_(
{llm_pgettext(
"",
"These come from the user's browser during autoscoring. Two of these from each user, at the beginning of the scoring phase, and if the users presses 'auto-score'",
)}
</div>
Expand Down
Loading

0 comments on commit f23c347

Please sign in to comment.