diff --git a/deno.json b/deno.json
index d21fa55..f5c49cd 100644
--- a/deno.json
+++ b/deno.json
@@ -1,5 +1,6 @@
 {
   "tasks": {
+    "check": "deno fmt && deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx",
     "test": "deno test --allow-read=. --allow-write=. --allow-net"
   },
   "imports": {
@@ -7,6 +8,5 @@
     "$std/": "https://deno.land/std@0.221.0/",
     "@dbushell/xml-streamify": "jsr:@dbushell/xml-streamify@^0.4.0",
     "zod": "https://deno.land/x/zod@v3.22.4/mod.ts"
-  },
-  "exclude": ["./bun/**"]
+  }
 }
diff --git a/parse_node_helpers.ts b/parse_node_helpers.ts
index 0903e44..2732df7 100644
--- a/parse_node_helpers.ts
+++ b/parse_node_helpers.ts
@@ -130,7 +130,7 @@ export function SynsetNode(node: Node): Synset {
     id: attr(node, "id"),
     ili: attr(node, "ili"),
     lexfile: attr(node, "lexfile"),
-    members: attr(node, "members"),
+    members: attr(node, "members").split(" "),
     dcSource: attr(node, "dc:source"),
     partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
     definitions: children(node, "Definition", (v) => DefinitionNode(v)),
diff --git a/parse_test.ts b/parse_test.ts
index e261a19..7db0183 100644
--- a/parse_test.ts
+++ b/parse_test.ts
@@ -4,8 +4,7 @@ import {
   assertEquals,
   assertGreater,
 } from "$std/assert/mod.ts";
-import { existsSync } from "$std/fs/exists.ts";
-import { Node, parse } from "@dbushell/xml-streamify";
+import { Node } from "@dbushell/xml-streamify";
 import {
   DefinitionNode,
   ExampleNode,
@@ -21,54 +20,9 @@ import {
   SynsetRelationNode,
   SyntacticBehaviorNode,
 } from "~/parse_node_helpers.ts";
+import { testFileParser, version } from "./parse_wordnet.ts";
 import { partsOfSpeechList } from "./wordnet_types.ts";
 
-const version = "2023";
-const fileName = `english-wordnet-${version}.xml`;
-const localFileName = `./data/${fileName}`;
-
-const testFilePath = async () => {
-  const path = await Deno.realPath(localFileName);
-  return path;
-};
-
-const testFileExists = async () => {
-  if (existsSync(localFileName)) {
-    const path = await Deno.realPath(localFileName);
-    const stat = await Deno.stat(path);
-    return stat.isFile;
-  }
-  return false;
-};
-
-const fetchTestFile = async () => {
-  const src = await fetch(
-    `https://en-word.net/static/${fileName}.gz`,
-  );
-  const dest = await Deno.open(localFileName, {
-    create: true,
-    write: true,
-  });
-  if (src.body == null) return;
-  await src.body
-    .pipeThrough(new DecompressionStream("gzip"))
-    .pipeTo(dest.writable);
-};
-
-const testFileParser = async () => {
-  if (!await testFileExists()) {
-    console.log("unzipping");
-    await fetchTestFile();
-  }
-  const p = await testFilePath();
-
-  const parser = parse(`file:///${p.replace("\\", "/")}`, {
-    ignoreDeclaration: false,
-    silent: false,
-  });
-  return parser;
-};
-
 Deno.test("quotes", async () => {
   const parser = await testFileParser();
 
@@ -316,16 +270,3 @@ Deno.test("validate wordnet xml", async () => {
     `${((performance.now() - start) / 1000).toFixed(2)}s`,
   );
 });
-
-// deno-lint-ignore no-explicit-any
-export function measureExecutionTime<T extends (...args: any[]) => any>(
-  func: T,
-): (...args: Parameters<T>) => { result: ReturnType<T>; time: number } {
-  return (...args: Parameters<T>): { result: ReturnType<T>; time: number } => {
-    const start = performance.now();
-    const result = func(...args);
-    const end = performance.now();
-    const time = end - start;
-    return { result, time };
-  };
-}
diff --git a/parse_wordnet.ts b/parse_wordnet.ts
new file mode 100644
index 0000000..4a8b375
--- /dev/null
+++ b/parse_wordnet.ts
@@ -0,0 +1,82 @@
+import { existsSync } from "$std/fs/exists.ts";
+import { Node, parse } from "@dbushell/xml-streamify";
+import { LexiconNode } from "~/parse_node_helpers.ts";
+
+export const version = "2023";
+export const fileName = `english-wordnet-${version}.xml`;
+export const localFileName = `./data/${fileName}`;
+
+const testFilePath = async () => {
+  const path = await Deno.realPath(localFileName);
+  return path;
+};
+
+const testFileExists = async () => {
+  if (existsSync(localFileName)) {
+    const path = await Deno.realPath(localFileName);
+    const stat = await Deno.stat(path);
+    return stat.isFile;
+  }
+  return false;
+};
+
+const fetchTestFile = async () => {
+  const src = await fetch(
+    `https://en-word.net/static/${fileName}.gz`,
+  );
+  const dest = await Deno.open(localFileName, {
+    create: true,
+    write: true,
+  });
+  if (src.body == null) return;
+  await src.body
+    .pipeThrough(new DecompressionStream("gzip"))
+    .pipeTo(dest.writable);
+};
+
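+/**
+ * Returns a streaming XML parser over the English WordNet test file,
+ * downloading and unpacking it into ./data on first use.
+ *
+ * Usage sketch (assumes network access plus the read/write permissions
+ * granted to the "test" task):
+ *
+ *   const parser = await testFileParser();
+ *   for await (const node of parser) {
+ *     // node.type is the XML element name, e.g. "Synset"
+ *   }
+ */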
+export const testFileParser = async () => {
+  if (!await testFileExists()) {
+    console.log("downloading and unzipping test file");
+    await fetchTestFile();
+  }
+  const p = await testFilePath();
+
+  // Normalize Windows path separators so the file:// URL is valid.
+  const parser = parse(`file:///${p.replaceAll("\\", "/")}`, {
+    ignoreDeclaration: false,
+    silent: false,
+  });
+  return parser;
+};
+
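+/**
+ * Reads nodes until the first Lexicon element and returns it parsed via
+ * LexiconNode, or undefined if the stream ends without one.
+ *
+ * Usage sketch (assumes the test file fetched by testFileParser above):
+ *
+ *   const lexicon = await parseLexicon(await testFileParser());
+ *   console.log(lexicon?.lexicalEntries.length);
+ */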
+export const parseLexicon = async (
+  parser: AsyncGenerator<Node>,
+) => {
+  for await (const node of parser) {
+    if (node.type == "Lexicon") {
+      return LexiconNode(node);
+    }
+  }
+  return undefined;
+};
diff --git a/parse_wordnet_test.ts b/parse_wordnet_test.ts
new file mode 100644
index 0000000..f4e47eb
--- /dev/null
+++ b/parse_wordnet_test.ts
@@ -0,0 +1,174 @@
+import { assert } from "$std/assert/assert.ts";
+import { parseLexicon, testFileParser } from "~/parse_wordnet.ts";
+import {
+  Definition,
+  Example,
+  Form,
+  ILIDefinition,
+  Lemma,
+  LexicalEntry,
+  Pronunciation,
+  Sense,
+  SenseRelation,
+  Synset,
+  SynsetRelation,
+} from "~/wordnet_types.ts";
+
+// Maps used as sets of ids: only the keys matter, the void values are omitted.
+type IdRegistry = Map<string, void>;
+type RefRegistry = Map<string, void>;
+
+type IdsPack = {
+  synsetIds: IdRegistry;
+  senseIds: IdRegistry;
+  lexicalEntryIds: IdRegistry;
+  syntacticBehaviorsIds: IdRegistry;
+};
+
+type RefsPack = {
+  senseSynsetRefs: RefRegistry;
+  senseSubCatRefs: RefRegistry;
+  senseRelationTargetRefs: RefRegistry;
+  synsetMembersRefs: RefRegistry;
+  synsetRelationTargetRefs: RefRegistry;
+};
+
+Deno.test("wordnet node relationships", async () => {
+  const parser = await testFileParser();
+  const lexicon = await parseLexicon(parser);
+  assert(lexicon != undefined);
+
+  const synsetIds: IdRegistry = new Map();
+  const senseIds: IdRegistry = new Map();
+  const lexicalEntryIds: IdRegistry = new Map();
+  const syntacticBehaviorsIds: IdRegistry = new Map();
+
+  const senseSynsetRefs: RefRegistry = new Map();
+  const senseSubCatRefs: RefRegistry = new Map();
+  const senseRelationTargetRefs: RefRegistry = new Map();
+  const synsetMembersRefs: RefRegistry = new Map();
+  const synsetRelationTargetRefs: RefRegistry = new Map();
+
+  // Touch every field once so the parsed shape is checked at compile time.
+  lexicon.id;
+  lexicon.label;
+  lexicon.language;
+  lexicon.email;
+  lexicon.license;
+  lexicon.version;
+  lexicon.citation;
+  lexicon.url;
+  lexicon.lexicalEntries.forEach(
+    (le: LexicalEntry) => {
+      lexicalEntryIds.set(le.id);
+      le.lemmas.forEach((l: Lemma) => {
+        l.writtenForm;
+        l.partOfSpeech;
+        l.pronunciations.forEach((p: Pronunciation) => {
+          p.variety;
+          p.inner;
+        });
+      });
+      le.senses.forEach((s: Sense) => {
+        senseIds.set(s.id);
+        senseSynsetRefs.set(s.synset);
+        if (s.subCat) senseSubCatRefs.set(s.subCat);
+        s.adjPosition;
+        s.senseRelations.forEach((sr: SenseRelation) => {
+          sr.relType;
+          sr.dcType;
+          senseRelationTargetRefs.set(sr.target);
+        });
+      });
+      le.forms.forEach((f: Form) => {
+        f.writtenForm;
+      });
+    },
+  );
+  lexicon.synsets.forEach((s: Synset) => {
+    synsetIds.set(s.id);
+    s.ili;
+    s.members.forEach((m) => {
+      synsetMembersRefs.set(m);
+    });
+    s.partOfSpeech;
+    s.lexfile;
+    s.dcSource;
+    s.definitions.forEach((d: Definition) => {
+      d.inner;
+    });
+    s.examples.forEach((e: Example) => {
+      e.inner;
+      e.dcSource;
+    });
+    s.iliDefinitions.forEach((i: ILIDefinition) => {
+      i.inner;
+    });
+    s.synsetRelations.forEach((sr: SynsetRelation) => {
+      sr.relType;
+      synsetRelationTargetRefs.set(sr.target);
+    });
+  });
+  lexicon.syntacticBehaviors.forEach((s) => syntacticBehaviorsIds.set(s.id));
+
+  assertAllowedRelationships(
+    {
+      synsetIds,
+      senseIds,
+      lexicalEntryIds,
+      syntacticBehaviorsIds,
+    } satisfies IdsPack,
+    {
+      senseSynsetRefs,
+      senseSubCatRefs,
+      senseRelationTargetRefs,
+      synsetMembersRefs,
+      synsetRelationTargetRefs,
+    } satisfies RefsPack,
+    new Map([
+      ["senseSubCatRefs > syntacticBehaviorsIds", undefined],
+      ["senseRelationTargetRefs > senseIds", undefined],
+      ["synsetMembersRefs > lexicalEntryIds", undefined],
+      ["synsetRelationTargetRefs > synsetIds", undefined],
+      ["senseSynsetRefs > synsetIds", undefined],
+    ]),
+  );
+});
+
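+/**
+ * Asserts that every cross-reference relationship observed in the data is on
+ * the allow list. A key reads "refs registry > ids registry"; for example,
+ * "senseSynsetRefs > synsetIds" records that Sense.synset refs resolved to
+ * known Synset ids.
+ */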
+const assertAllowedRelationships = (
+  idsPack: IdsPack,
+  refsPack: RefsPack,
+  allowed: Map<string, void>,
+) => {
+  const found = collectRelationships(idsPack, refsPack);
+  found.forEach((_v, k) => {
+    assert(allowed.has(k), "Disallowed relation: " + k);
+  });
+};
+
+const collectRelationships = (idsPack: IdsPack, refsPack: RefsPack) => {
+  const result: Map<string, void> = new Map();
+  Object.entries(refsPack).forEach(([refPackKey, refPackRegistry]) => {
+    Object.entries(idsPack).forEach(([idPackKey, idPackRegistry]) => {
+      for (const ref of refPackRegistry.keys()) {
+        if (idPackRegistry.has(ref)) {
+          result.set(refPackKey + " > " + idPackKey);
+        }
+      }
+    });
+  });
+  return result;
+  /*
+  senseSynsetRefs > synsetIds
+  senseSubCatRefs > syntacticBehaviorsIds
+  senseRelationTargetRefs > senseIds
+  synsetMembersRefs > lexicalEntryIds
+  synsetRelationTargetRefs > synsetIds
+  */
+};
diff --git a/wordnet_types.ts b/wordnet_types.ts
index 3f9d6ac..4ad3bfe 100644
--- a/wordnet_types.ts
+++ b/wordnet_types.ts
@@ -1,7 +1,9 @@
 import { z } from "zod";
 
 export const Id = z.string();
+export const LexicalEntryId = z.string();
 export const SynsetId = z.string();
+export const SenseId = z.string();
 export const SyntacticBehaviorId = z.string();
 
 /**
@@ -95,7 +97,7 @@ export const Lemma = z.object({
 export const SenseRelation = z.object({
   relType: SenseRelationRelType,
   dcType: z.string().optional(), // TODO: This is only when relType is "other"
-  target: z.string(), // TODO Where this leads to
+  target: SenseId,
 }).strict();
 
 export const AdjPosition = z.union([
@@ -105,7 +107,7 @@
 ]);
 
 export const Sense = z.object({
-  id: Id,
+  id: SenseId,
   synset: SynsetId,
   subCat: SyntacticBehaviorId.optional(),
   adjPosition: AdjPosition.optional(),
@@ -117,7 +119,7 @@ export const Form = z.object({
 }).strict();
 
 export const LexicalEntry = z.object({
-  id: Id,
+  id: LexicalEntryId,
   lemmas: z.array(Lemma).length(1),
   senses: z.array(Sense).min(1),
   forms: z.array(Form).min(0),
@@ -138,13 +140,13 @@ export const ILIDefinition = z.object({
 
 export const SynsetRelation = z.object({
   relType: SynsetRelationRelType,
-  target: z.string(), // TODO Where this leads to?
+  target: SynsetId,
 }).strict();
 
 export const Synset = z.object({
-  id: Id,
+  id: SynsetId,
   ili: z.string(),
-  members: z.string(), // space-separated list
+  members: z.array(LexicalEntryId).min(1), // space-separated in the XML; split into an array of refs at parse time
   partOfSpeech: PartsOfSpeech,
   lexfile: z.string(),
   dcSource: z.string().optional(),
@@ -156,7 +158,7 @@ export const Synset = z.object({
 
 export const SyntacticBehavior = z.object({
   id: SyntacticBehaviorId,
-  subcategorizationFrame: z.string(), // Sentence structure; the actual variety here is modest
+  subcategorizationFrame: z.string(), // Sentence structure; the actual variety here is modest
 }).strict();
 
 export const Lexicon = z.object({