diff --git a/index.js b/index.js index 655e5fc..4e01be3 100644 --- a/index.js +++ b/index.js @@ -1,12 +1,13 @@ var pinyinify = require("./src/pinyinify"), segment = require("./src/segment"), + tag = require('./src/tag'), { simplify, traditionalize } = require("./src/simplify"); -let nodejieba = require("nodejieba"); + module.exports = { pinyinify, segment, simplify, traditionalize, - tag: nodejieba.tag + tag }; \ No newline at end of file diff --git a/spec/spec.js b/spec/spec.js index d00f8b3..1a51768 100644 --- a/spec/spec.js +++ b/spec/spec.js @@ -41,14 +41,19 @@ describe("Pinyinify", () => { expect("我说得快。").becomes("wǒ shuō de kuài."); expect("你们觉得我画得怎么样?").becomes("nǐ​men jué​de wǒ huà de zěn​me​yàng?"); expect("喂,睡得还好吗?").becomes("wèi, shuì de hái​hǎo ma?"); + expect("你得做个决定。").becomes("nǐ děi zuò gè jué​dìng."); + expect("你得在这儿休息。").becomes("nǐ děi zài zhè​r xiū​xi."); // 还 expect("我有钱了就还你。").becomes("wǒ yǒu​qián le jiù huán nǐ."); expect("我还给你。").becomes("wǒ huán​gěi nǐ."); expect("你还喜欢她吗?").becomes("nǐ hái xǐ​huan tā ma?"); expect("你得还我的东西。").becomes("nǐ děi huán wǒ de dōng​xi."); expect("他还会把钱还律师吗?").becomes("tā hái huì bǎ qián huán lǜ​shī ma?"); + expect("好吧,我至少还有些朋友。").becomes("hǎo ba, wǒ zhì​shǎo hái yǒu​xiē péng​you."); // 只 expect("他是一只鸟。").becomes("tā shì yī zhī niǎo."); + // 长 + expect("她长着一张圆脸和一双明亮的眼睛。").becomes("tā zhǎng zhe yī zhāng yuán liǎn hé yī shuāng míng​liàng de yǎn​jing."); expect("行了吗?").becomes("xíng le ma?"); expect("人要是行干一行行一行。").becomes("rén yào​shi xíng gàn yī háng xíng yī háng."); diff --git a/src/pinyinDict.js b/src/pinyinDict.js index ad55e4e..8cefdc7 100644 --- a/src/pinyinDict.js +++ b/src/pinyinDict.js @@ -100337,7 +100337,7 @@ let dict = [ "重音|||重音|zhòng​yīn", "重音节|||重音節|zhòng​yīn​jié", "重头戏|||重頭戲|zhòng​tóu​xì", - "重点|||重點|chóng​diǎn", + "重点|||重點|zhòngdiǎn", "野人|||野人|yě​rén", "野兔|||野兔|yě​tù", "野叟曝言|||野叟曝言|Yě​sǒu​Pù​yán", diff --git a/src/pinyinify.js b/src/pinyinify.js index 57ea962..d093bda 100644 --- a/src/pinyinify.js +++ b/src/pinyinify.js @@ -2,7 +2,8 @@ let segment = require("./segment"), pinyin = require("pinyin"), { pinyinDict } = require("./pinyinDict"), { normalizeEnglish } = require("./punctuation"), - tag = require("nodejieba").tag; + tag = require("./tag"), + { isCharacterText } = require("./util"); function pinyinify(text, isDetailed) { let segments = segment(text); @@ -62,6 +63,9 @@ function decideAmbiguousChar(char, cuts, cutIndex) { return "jué"; case "长": case "長": + nextTags = tag(afterText.join("")); + if (nextTags && nextTags.length && nextTags[0].tag === "uz") + return "zhǎng"; // zhǎng has higher frequency due to compond words, // but cháng is more common as an individual character. return "cháng"; @@ -80,7 +84,7 @@ function decideAmbiguousChar(char, cuts, cutIndex) { return "děi"; } } - if (afterTag[0] === "t" || afterTag[0] === "v") { + if (afterTag[0] === "t" || afterTag[0] === "v" || afterTag[0] === "p") { return "děi"; } } @@ -90,6 +94,7 @@ function decideAmbiguousChar(char, cuts, cutIndex) { nextTags = tag(afterText.join("")); if (nextTags && nextTags.length) { let afterTag = nextTags[0].tag; + if (afterText[0][0] === "有") break; if (afterTag[0] === "r" || afterTag[0] === "n") { return "huán"; } @@ -111,11 +116,6 @@ function decideAmbiguousChar(char, cuts, cutIndex) { } } -function isCharacterText(text) { - // https://stackoverflow.com/questions/21109011/javascript-unicode-string-chinese-character-but-no-punctuation - return /^([\u4E00-\u9FCC\u3400-\u4DB5\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\ud840-\ud868][\udc00-\udfff]|\ud869[\udc00-\uded6\udf00-\udfff]|[\ud86a-\ud86c][\udc00-\udfff]|\ud86d[\udc00-\udf34\udf40-\udfff]|\ud86e[\udc00-\udc1d])+$/.test(String(text)); -} - function shouldPutSpaceBetween(word1, word2) { if (!word2) return false; if (isCharacterText(word1) && isCharacterText(word2)) return true; diff --git a/src/tag.js b/src/tag.js new file mode 100644 index 0000000..b6d6799 --- /dev/null +++ b/src/tag.js @@ -0,0 +1,18 @@ +let nodejieba = require("nodejieba"), + { isCharacterText } = require("./util"); + +function tag(text) { + let tokens = nodejieba.tag(text); + let outTokens = []; + for (let { word, tag } of tokens) { + if (word.length > 1 && tag === "x") { + for (let char of word) { + outTokens.push(nodejieba.tag(char)[0]); + } + } + else outTokens.push({ word, tag }); + } + return outTokens; +} + +module.exports = tag; \ No newline at end of file diff --git a/src/util.js b/src/util.js new file mode 100644 index 0000000..90f8bc5 --- /dev/null +++ b/src/util.js @@ -0,0 +1,6 @@ +module.exports = { + isCharacterText: (text) => { + // https://stackoverflow.com/questions/21109011/javascript-unicode-string-chinese-character-but-no-punctuation + return /^([\u4E00-\u9FCC\u3400-\u4DB5\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\ud840-\ud868][\udc00-\udfff]|\ud869[\udc00-\uded6\udf00-\udfff]|[\ud86a-\ud86c][\udc00-\udfff]|\ud86d[\udc00-\udf34\udf40-\udfff]|\ud86e[\udc00-\udc1d])+$/.test(String(text)); + } +} \ No newline at end of file