Skip to content

Commit

Permalink
discover, test, document wordnet node relationships
Browse files Browse the repository at this point in the history
  • Loading branch information
king8fisher committed Apr 21, 2024
1 parent 09a2859 commit a4e9341
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 71 deletions.
4 changes: 2 additions & 2 deletions deno.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"tasks": {
"check": "deno fmt && deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx",
"test": "deno test --allow-read=. --allow-write=. --allow-net"
},
"imports": {
"~/": "./",
"$std/": "https://deno.land/[email protected]/",
"@dbushell/xml-streamify": "jsr:@dbushell/xml-streamify@^0.4.0",
"zod": "https://deno.land/x/[email protected]/mod.ts"
},
"exclude": ["./bun/**"]
}
}
2 changes: 1 addition & 1 deletion parse_node_helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ export function SynsetNode(node: Node): Synset {
id: attr(node, "id"),
ili: attr(node, "ili"),
lexfile: attr(node, "lexfile"),
members: attr(node, "members"),
members: attr(node, "members").split(" "),
dcSource: attr(node, "dc:source"),
partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
definitions: children(node, "Definition", (v) => DefinitionNode(v)),
Expand Down
63 changes: 2 additions & 61 deletions parse_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ import {
assertEquals,
assertGreater,
} from "$std/assert/mod.ts";
import { existsSync } from "$std/fs/exists.ts";
import { Node, parse } from "@dbushell/xml-streamify";
import { Node } from "@dbushell/xml-streamify";
import {
DefinitionNode,
ExampleNode,
Expand All @@ -21,54 +20,9 @@ import {
SynsetRelationNode,
SyntacticBehaviorNode,
} from "~/parse_node_helpers.ts";
import { testFileParser, version } from "./parse_wordnet.ts";
import { partsOfSpeechList } from "./wordnet_types.ts";

const version = "2023";
const fileName = `english-wordnet-${version}.xml`;
const localFileName = `./data/${fileName}`;

const testFilePath = async () => {
const path = await Deno.realPath(localFileName);
return path;
};

const testFileExists = async () => {
if (existsSync(localFileName)) {
const path = await Deno.realPath(localFileName);
const stat = await Deno.stat(path);
return stat.isFile;
}
return false;
};

const fetchTestFile = async () => {
const src = await fetch(
`https://en-word.net/static/${fileName}.gz`,
);
const dest = await Deno.open(localFileName, {
create: true,
write: true,
});
if (src.body == null) return;
await src.body
.pipeThrough(new DecompressionStream("gzip"))
.pipeTo(dest.writable);
};

const testFileParser = async () => {
if (!await testFileExists()) {
console.log("unzipping");
await fetchTestFile();
}
const p = await testFilePath();

const parser = parse(`file:///${p.replace("\\", "/")}`, {
ignoreDeclaration: false,
silent: false,
});
return parser;
};

Deno.test("quotes", async () => {
const parser = await testFileParser();

Expand Down Expand Up @@ -316,16 +270,3 @@ Deno.test("validate wordnet xml", async () => {
`${((performance.now() - start) / 1000).toFixed(2)}s`,
);
});

// deno-lint-ignore no-explicit-any
export function measureExecutionTime<T extends (...args: unknown[]) => any>(
func: T,
): (...args: Parameters<T>) => { result: ReturnType<T>; time: number } {
return (...args: Parameters<T>): { result: ReturnType<T>; time: number } => {
const start = performance.now();
const result = func(...args);
const end = performance.now();
const time = end - start;
return { result, time };
};
}
60 changes: 60 additions & 0 deletions parse_wordnet.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import { existsSync } from "$std/fs/exists.ts";
import { Node, parse } from "@dbushell/xml-streamify";
import { LexiconNode } from "~/parse_node_helpers.ts";

// WordNet release year; used to build both the download URL and the cache path.
export const version = "2023";
// Upstream file name of the English WordNet XML dump for that release.
export const fileName = `english-wordnet-${version}.xml`;
// Local on-disk cache location (relative to the repo root).
export const localFileName = `./data/${fileName}`;

/** Resolves the cached WordNet XML file name to an absolute filesystem path. */
const testFilePath = (): Promise<string> => Deno.realPath(localFileName);

/**
 * Reports whether the WordNet XML test file is already cached locally.
 *
 * Fix: a single async `Deno.stat` replaces the original
 * existsSync → realPath → stat sequence — no blocking sync I/O inside an
 * async function, and no check-then-use race between the existence probe
 * and the stat call.
 */
const testFileExists = async () => {
  try {
    const stat = await Deno.stat(localFileName);
    return stat.isFile;
  } catch {
    // A missing (or inaccessible) file simply means "not downloaded yet".
    return false;
  }
};

/**
 * Downloads the gzipped WordNet XML from en-word.net and decompresses it
 * into `localFileName`.
 *
 * Fixes over the original: the response is validated (HTTP status + body)
 * BEFORE the destination file is opened, so a failed download no longer
 * leaks an open file handle from the early return, no longer leaves behind
 * an empty cache file, and no longer gunzips an HTML error page into it.
 *
 * @throws Error when the download fails or has no body.
 */
const fetchTestFile = async () => {
  const src = await fetch(
    `https://en-word.net/static/${fileName}.gz`,
  );
  if (!src.ok || src.body == null) {
    // Cancel any body so the underlying connection can be released.
    await src.body?.cancel();
    throw new Error(`failed to download ${fileName}.gz: HTTP ${src.status}`);
  }
  const dest = await Deno.open(localFileName, {
    create: true,
    write: true,
  });
  // pipeTo closes dest.writable on both completion and stream error.
  await src.body
    .pipeThrough(new DecompressionStream("gzip"))
    .pipeTo(dest.writable);
};

/**
 * Ensures the WordNet XML test file exists locally (downloading it on first
 * use) and returns a streaming XML parser over it.
 *
 * Fixes: `replace("\\", "/")` only replaced the FIRST backslash, producing
 * an invalid `file://` URL for any Windows path with more than one
 * separator — `replaceAll` normalizes every one. The progress log also said
 * "unzipping" although the slow step it precedes is the download.
 */
export const testFileParser = async () => {
  if (!await testFileExists()) {
    console.log("downloading");
    await fetchTestFile();
  }
  const p = await testFilePath();

  const parser = parse(`file:///${p.replaceAll("\\", "/")}`, {
    ignoreDeclaration: false,
    silent: false,
  });
  return parser;
};

/**
 * Streams nodes from the parser and materializes the first <Lexicon>
 * element encountered; resolves to undefined when the stream ends without
 * finding one.
 */
export const parseLexicon = async (
  parser: AsyncGenerator<Node, void | Node, void>,
) => {
  for await (const candidate of parser) {
    if (candidate.type === "Lexicon") {
      return LexiconNode(candidate);
    }
  }
  return undefined;
};
166 changes: 166 additions & 0 deletions parse_wordnet_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import { assert } from "$std/assert/assert.ts";
import { parseLexicon, testFileParser } from "~/parse_wordnet.ts";
import {
Definition,
Example,
Form,
ILIDefinition,
Lemma,
LexicalEntry,
Pronunciation,
Sense,
SenseRelation,
Synset,
SynsetRelation,
} from "~/wordnet_types.ts";

// A Map used purely as a set of string ids (every value is undefined);
// only `.set(k)` / `.has(k)` are ever used on it.
type IdRegistry = Map<string, void>;
// Same set-like Map, but for ids *referenced from* other nodes.
type RefRegistry = Map<string, void>;

// All id namespaces declared by the lexicon itself.
type IdsPack = {
  synsetIds: IdRegistry;
  senseIds: IdRegistry;
  lexicalEntryIds: IdRegistry;
  syntacticBehaviorsIds: IdRegistry;
};

// All reference namespaces collected while walking the lexicon.
type RefsPack = {
  senseSynsetRefs: RefRegistry;
  senseSubCatRefs: RefRegistry;
  senseRelationTargetRefs: RefRegistry;
  synsetMembersRefs: RefRegistry;
  synsetRelationTargetRefs: RefRegistry;
};

// Walks the entire parsed lexicon, registering every declared id and every
// outgoing reference, then asserts that references only land in the expected
// id namespaces (e.g. a Sense's `synset` attribute must name a Synset id).
// NOTE(review): the bare property accesses below (`lexicon.id;` etc.) are
// intentional no-ops — they enumerate/exercise the parsed shape via the
// compiler without asserting on the values.
Deno.test("wordnet node relationships", async () => {
  const parser = await testFileParser();
  const lexicon = await parseLexicon(parser);
  assert(lexicon != undefined);

  // Ids declared by the lexicon — one registry per node kind.
  const synsetIds: IdRegistry = new Map();
  const senseIds: IdRegistry = new Map();
  const lexicalEntryIds: IdRegistry = new Map();
  const syntacticBehaviorsIds: IdRegistry = new Map();

  // References observed while walking — one registry per attribute kind.
  const senseSynsetRefs: RefRegistry = new Map();
  const senseSubCatRefs: RefRegistry = new Map();
  const senseRelationTargetRefs: RefRegistry = new Map();
  const synsetMembersRefs: RefRegistry = new Map();
  const synsetRelationTargetRefs: RefRegistry = new Map();

  lexicon.id;
  lexicon.label;
  lexicon.language;
  lexicon.email;
  lexicon.license;
  lexicon.version;
  lexicon.citation;
  lexicon.url;
  lexicon.lexicalEntries.forEach(
    (le: LexicalEntry) => {
      lexicalEntryIds.set(le.id);
      le.lemmas.forEach((l: Lemma) => {
        l.writtenForm; //
        l.partOfSpeech; //
        l.pronunciations.forEach((p: Pronunciation) => {
          p.variety;
          p.inner;
        });
      });
      le.senses.forEach((s: Sense) => {
        senseIds.set(s.id);
        senseSynsetRefs.set(s.synset);
        if (s.subCat) senseSubCatRefs.set(s.subCat);
        s.adjPosition; //
        s.senseRelations.forEach((sr: SenseRelation) => {
          sr.relType;
          sr.dcType;
          senseRelationTargetRefs.set(sr.target);
        });
      });
      le.forms.forEach((f: Form) => {
        f.writtenForm;
      });
    },
  );
  lexicon.synsets.forEach((s: Synset) => {
    synsetIds.set(s.id);
    s.ili;
    // `members` is unwrapped by the parser from a space-separated string
    // into an array of LexicalEntry ids; each one must resolve below.
    s.members.forEach((m) => {
      synsetMembersRefs.set(m);
    });
    s.partOfSpeech;
    s.lexfile;
    s.dcSource;
    s.definitions.forEach((d: Definition) => {
      d.inner;
    });
    s.examples.forEach((e: Example) => {
      e.inner;
      e.dcSource;
    });
    s.iliDefinitions.forEach((i: ILIDefinition) => {
      i.inner;
    });
    s.synsetRelations.forEach((s: SynsetRelation) => {
      s.relType;
      synsetRelationTargetRefs.set(s.target);
    });
  });
  lexicon.syntacticBehaviors.forEach((s) => syntacticBehaviorsIds.set(s.id));

  // Every (refs registry → ids registry) pair with at least one shared key
  // must appear in this allow-list; any other relationship fails the test.
  assertAllowedRelationships(
    {
      synsetIds,
      senseIds,
      lexicalEntryIds,
      syntacticBehaviorsIds,
    } satisfies IdsPack,
    {
      senseSynsetRefs,
      senseSubCatRefs,
      senseRelationTargetRefs,
      synsetMembersRefs,
      synsetRelationTargetRefs,
    } satisfies RefsPack,
    new Map([
      ["senseSubCatRefs > syntacticBehaviorsIds", undefined],
      ["senseRelationTargetRefs > senseIds", undefined],
      ["synsetMembersRefs > lexicalEntryIds", undefined],
      ["synsetRelationTargetRefs > synsetIds", undefined],
      ["senseSynsetRefs > synsetIds", undefined],
    ]),
  );
});

/**
 * Fails the test when any observed refs→ids relationship is not in the
 * allow-list; `allowed` keys use the same "refKey > idKey" string format
 * that collectRelationships produces.
 */
const assertAllowedRelationships = (
  idsPack: IdsPack,
  refsPack: RefsPack,
  allowed: Map<string, void>,
) => {
  for (const relation of collectRelationships(idsPack, refsPack).keys()) {
    assert(allowed.has(relation), "Disallowed relation: " + relation);
  }
};

/**
 * Computes which (refs registry → ids registry) pairs share at least one
 * key, returning every hit as a "refKey > idKey" string in a set-like Map.
 */
const collectRelationships = (idsPack: IdsPack, refsPack: RefsPack) => {
  const relations: Map<string, void> = new Map();
  for (const [refKey, refs] of Object.entries(refsPack)) {
    for (const [idKey, ids] of Object.entries(idsPack)) {
      for (const ref of refs.keys()) {
        if (ids.has(ref)) {
          relations.set(refKey + " > " + idKey, undefined);
        }
      }
    }
  }
  return relations;
  /*
  Relationships expected in the data set:
  senseSynsetRefs > synsetIds
  senseSubCatRefs > syntacticBehaviorsIds
  senseRelationTargetRefs > senseIds
  synsetMembersRefs > lexicalEntryIds
  synsetRelationTargetRefs > synsetIds
  */
};
16 changes: 9 additions & 7 deletions wordnet_types.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { z } from "zod";

export const Id = z.string();
export const LexicalEntryId = z.string();
export const SynsetId = z.string();
export const SenseId = z.string();
export const SyntacticBehaviorId = z.string();

/**
Expand Down Expand Up @@ -95,7 +97,7 @@ export const Lemma = z.object({
export const SenseRelation = z.object({
relType: SenseRelationRelType,
dcType: z.string().optional(), // TODO: This is only when relType is "other"
target: z.string(), // TODO Where this leads to
target: SenseId,
}).strict();

export const AdjPosition = z.union([
Expand All @@ -105,7 +107,7 @@ export const AdjPosition = z.union([
]);

export const Sense = z.object({
id: Id,
id: SenseId,
synset: SynsetId,
subCat: SyntacticBehaviorId.optional(),
adjPosition: AdjPosition.optional(),
Expand All @@ -117,7 +119,7 @@ export const Form = z.object({
}).strict();

export const LexicalEntry = z.object({
id: Id,
id: LexicalEntryId,
lemmas: z.array(Lemma).length(1),
senses: z.array(Sense).min(1),
forms: z.array(Form).min(0),
Expand All @@ -138,13 +140,13 @@ export const ILIDefinition = z.object({

export const SynsetRelation = z.object({
relType: SynsetRelationRelType,
target: z.string(), // TODO Where this leads to?
target: SynsetId,
}).strict();

export const Synset = z.object({
id: Id,
id: SynsetId,
ili: z.string(),
members: z.string(), // space-separated list
members: z.array(LexicalEntryId).min(1), // space-separated list of refs that we unwrap to array
partOfSpeech: PartsOfSpeech,
lexfile: z.string(),
dcSource: z.string().optional(),
Expand All @@ -156,7 +158,7 @@ export const Synset = z.object({

export const SyntacticBehavior = z.object({
id: SyntacticBehaviorId,
subcategorizationFrame: z.string(), // This is where huge variety lives
subcategorizationFrame: z.string(), // Sentence structure. This is where (not very huge) variety lives
}).strict();

export const Lexicon = z.object({
Expand Down

0 comments on commit a4e9341

Please sign in to comment.