Skip to content

Commit

Permalink
fix some pinyin issues
Browse files Browse the repository at this point in the history
  • Loading branch information
peterolson committed Nov 20, 2019
1 parent 8762829 commit 26631fa
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 10 deletions.
5 changes: 3 additions & 2 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
var pinyinify = require("./src/pinyinify"),
segment = require("./src/segment"),
tag = require('./src/tag'),
{ simplify, traditionalize } = require("./src/simplify");
let nodejieba = require("nodejieba");


// Public API of the package: re-exports the helpers required above.
// NOTE(review): the two `tag` entries below look like the removed and the
// added line of a diff rendered without +/- markers — confirm which one
// applies before treating this object literal as runnable code.
module.exports = {
pinyinify,
segment,
simplify,
traditionalize,
tag: nodejieba.tag
tag
};
5 changes: 5 additions & 0 deletions spec/spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,19 @@ describe("Pinyinify", () => {
expect("我说得快。").becomes("wǒ shuō de kuài.");
expect("你们觉得我画得怎么样?").becomes("nǐ​men jué​de wǒ huà de zěn​me​yàng?");
expect("喂,睡得还好吗?").becomes("wèi, shuì de hái​hǎo ma?");
expect("你得做个决定。").becomes("nǐ děi zuò gè jué​dìng.");
expect("你得在这儿休息。").becomes("nǐ děi zài zhè​r xiū​xi.");
// 还
expect("我有钱了就还你。").becomes("wǒ yǒu​qián le jiù huán nǐ.");
expect("我还给你。").becomes("wǒ huán​gěi nǐ.");
expect("你还喜欢她吗?").becomes("nǐ hái xǐ​huan tā ma?");
expect("你得还我的东西。").becomes("nǐ děi huán wǒ de dōng​xi.");
expect("他还会把钱还律师吗?").becomes("tā hái huì bǎ qián huán lǜ​shī ma?");
expect("好吧,我至少还有些朋友。").becomes("hǎo ba, wǒ zhì​shǎo hái yǒu​xiē péng​you.");
// 只
expect("他是一只鸟。").becomes("tā shì yī zhī niǎo.");
// 长
expect("她长着一张圆脸和一双明亮的眼睛。").becomes("tā zhǎng zhe yī zhāng yuán liǎn hé yī shuāng míng​liàng de yǎn​jing.");

expect("行了吗?").becomes("xíng le ma?");
expect("人要是行干一行行一行。").becomes("rén yào​shi xíng gàn yī háng xíng yī háng.");
Expand Down
2 changes: 1 addition & 1 deletion src/pinyinDict.js
Original file line number Diff line number Diff line change
Expand Up @@ -100337,7 +100337,7 @@ let dict = [
"重音|||重音|zhòng​yīn",
"重音节|||重音節|zhòng​yīn​jié",
"重头戏|||重頭戲|zhòng​tóu​xì",
"重点|||重點|chóng​diǎn",
"重点|||重點|zhòngdiǎn",
"野人|||野人|yě​rén",
"野兔|||野兔|yě​tù",
"野叟曝言|||野叟曝言|Yě​sǒu​Pù​yán",
Expand Down
14 changes: 7 additions & 7 deletions src/pinyinify.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ let segment = require("./segment"),
pinyin = require("pinyin"),
{ pinyinDict } = require("./pinyinDict"),
{ normalizeEnglish } = require("./punctuation"),
tag = require("nodejieba").tag;
tag = require("./tag"),
{ isCharacterText } = require("./util");

function pinyinify(text, isDetailed) {
let segments = segment(text);
Expand Down Expand Up @@ -62,6 +63,9 @@ function decideAmbiguousChar(char, cuts, cutIndex) {
return "jué";
case "长":
case "長":
nextTags = tag(afterText.join(""));
if (nextTags && nextTags.length && nextTags[0].tag === "uz")
return "zhǎng";
// zhǎng has higher frequency due to compound words,
// but cháng is more common as an individual character.
return "cháng";
Expand All @@ -80,7 +84,7 @@ function decideAmbiguousChar(char, cuts, cutIndex) {
return "děi";
}
}
if (afterTag[0] === "t" || afterTag[0] === "v") {
if (afterTag[0] === "t" || afterTag[0] === "v" || afterTag[0] === "p") {
return "děi";
}
}
Expand All @@ -90,6 +94,7 @@ function decideAmbiguousChar(char, cuts, cutIndex) {
nextTags = tag(afterText.join(""));
if (nextTags && nextTags.length) {
let afterTag = nextTags[0].tag;
if (afterText[0][0] === "有") break;
if (afterTag[0] === "r" || afterTag[0] === "n") {
return "huán";
}
Expand All @@ -111,11 +116,6 @@ function decideAmbiguousChar(char, cuts, cutIndex) {
}
}

/**
 * Tells whether a value, once converted to a string, is made up entirely of
 * characters from the CJK ranges listed in the pattern below.
 *
 * @param {*} text - Value to check; it is coerced with `String()` first.
 * @returns {boolean} `true` when the string is non-empty and every character
 *   falls inside the listed ranges, `false` otherwise.
 */
function isCharacterText(text) {
    // Pattern taken from:
    // https://stackoverflow.com/questions/21109011/javascript-unicode-string-chinese-character-but-no-punctuation
    const cjkOnlyPattern = /^([\u4E00-\u9FCC\u3400-\u4DB5\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\ud840-\ud868][\udc00-\udfff]|\ud869[\udc00-\uded6\udf00-\udfff]|[\ud86a-\ud86c][\udc00-\udfff]|\ud86d[\udc00-\udf34\udf40-\udfff]|\ud86e[\udc00-\udc1d])+$/;
    const candidate = String(text);
    return cjkOnlyPattern.test(candidate);
}

function shouldPutSpaceBetween(word1, word2) {
if (!word2) return false;
if (isCharacterText(word1) && isCharacterText(word2)) return true;
Expand Down
18 changes: 18 additions & 0 deletions src/tag.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
let nodejieba = require("nodejieba"),
{ isCharacterText } = require("./util");

/**
 * Tags `text` with nodejieba and refines the result: every token longer than
 * one UTF-16 unit that nodejieba tagged "x" is split into its characters, and
 * each character is tagged again on its own.
 *
 * @param {string} text - Text to tag.
 * @returns {Array<{word: string, tag: string}>} Tokens in their original
 *   order. Every entry is an object, so callers can safely read `.tag` on
 *   any element.
 */
function tag(text) {
    const outTokens = [];
    // The destructured field is renamed so it does not shadow this function.
    for (const { word, tag: label } of nodejieba.tag(text)) {
        if (word.length > 1 && label === "x") {
            // for...of walks the string by code point, so surrogate pairs
            // are passed to nodejieba as a single character.
            for (const char of word) {
                const charTokens = nodejieba.tag(char);
                const charToken = charTokens && charTokens[0];
                // If nodejieba yields nothing for this character, keep the
                // tag its parent token already had instead of pushing
                // `undefined` into the result.
                outTokens.push(charToken || { word: char, tag: "x" });
            }
        } else {
            outTokens.push({ word, tag: label });
        }
    }
    return outTokens;
}

module.exports = tag;
6 changes: 6 additions & 0 deletions src/util.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Pattern taken from:
// https://stackoverflow.com/questions/21109011/javascript-unicode-string-chinese-character-but-no-punctuation
const CJK_ONLY_PATTERN = /^([\u4E00-\u9FCC\u3400-\u4DB5\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\ud840-\ud868][\udc00-\udfff]|\ud869[\udc00-\uded6\udf00-\udfff]|[\ud86a-\ud86c][\udc00-\udfff]|\ud86d[\udc00-\udf34\udf40-\udfff]|\ud86e[\udc00-\udc1d])+$/;

/**
 * Tells whether a value, once converted to a string, is non-empty and made
 * up entirely of characters from the CJK ranges in CJK_ONLY_PATTERN.
 *
 * @param {*} text - Value to check; it is coerced with `String()` first.
 * @returns {boolean} Result of matching the coerced string against the pattern.
 */
const isCharacterText = (text) => CJK_ONLY_PATTERN.test(String(text));

module.exports = { isCharacterText };

0 comments on commit 26631fa

Please sign in to comment.