Skip to content

Commit

Permalink
Merge pull request #54 from chineseboost/hg/hanzi-numerals-measure
Browse files Browse the repository at this point in the history
Fix numeral + measure word conversion
  • Loading branch information
hughgrigg authored Oct 31, 2021
2 parents 668ef20 + c68e7f2 commit 561f70d
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ build/
*.cache
*.log
*.tmp
tmp/

# Commit your application's lock file https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control
# You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file
Expand Down
2 changes: 1 addition & 1 deletion data/tweaks_pinyin.data
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<?php return ['/(我|你|您|他|她|它|牠|祂|谁|誰[们們]?)([把将])(.{1,20})落/u'=>'$1$2$3 la4','/得([不得]?)到/u'=>'de2$1dao4','/(.{1,2})\1{1}地/u'=>'$1$1 de5','/([么|麽].)地/u'=>'$1 de5','/([一|两|那|这|這|此].)地/u'=>'$1 di4','/不([\p{Han}]{1,4})地/u'=>'bu4 $1 de5','/(一|二|三|四|五|六|七|八|九|十|百|千|万|亿)+个/u'=>'$1 ge5',];
<?php return ['/(我|你|您|他|她|它|牠|祂|谁|誰[们們]?)([把将])(.{1,20})落/u'=>'$1$2$3 la4','/得([不得]?)到/u'=>'de2$1dao4','/(.{1,2})\1{1}地/u'=>'$1$1 de5','/([么|麽].)地/u'=>'$1 de5','/([一|两|那|这|這|此].)地/u'=>'$1 di4','/不([\p{Han}]{1,4})地/u'=>'bu4 $1 de5','/((?>一|二|三|四|五|六|七|八|九|十|百|千|万|亿)+)个/u'=>'$1 ge5',];
11 changes: 8 additions & 3 deletions scripts/build_data_files.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,20 @@

mb_internal_encoding('UTF-8');

$ceDictPath = '/tmp/cedict_1_0_ts_utf-8_mdbg.txt.gz';
// Keep downloaded source files in a tmp/ directory next to this script,
// creating the directory if it does not exist yet.
// NOTE(review): the return values of mkdir() and copy() below are not
// checked, so a failed directory creation or download is not reported here.
$tmpPath = __DIR__.'/tmp';
if (!is_dir($tmpPath)) {
mkdir($tmpPath);
}

// Fetch the gzipped cedict export from mdbg.net, but only when no file is
// already present at the target path.
$ceDictPath = "$tmpPath/cedict_1_0_ts_utf-8_mdbg.txt.gz";
if (!is_file($ceDictPath)) {
copy(
'https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz',
$ceDictPath
);
}

$wordFreqPath = '/tmp/global_wordfreq.release_UTF-8.txt';
$wordFreqPath = "$tmpPath/global_wordfreq.release_UTF-8.txt";
if (!is_file($wordFreqPath)) {
copy(
'https://s3.amazonaws.com/files.chineseboost.com/BCC_LEX_Zh/global_wordfreq.release_UTF-8.txt',
Expand Down Expand Up @@ -239,7 +244,7 @@
'/([一|两|那|这|這|此].)地/u' => '$1 di4',
'/不([\p{Han}]{1,4})地/u' => 'bu4 $1 de5',
sprintf(
'/(%s)+个/u',
'/((?>%s)+)个/u',
implode('|', $numerals)
) => '$1 ge5',
];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ public function furthestForwardMatchingProvider(): array
['西藏', 'Xi1zang4'],
['西安', "Xi1'an1"],
['天峨', "Tian1'e2"],
['两百五十一个人', 'Liang3 bai3 wu3 shi2 yi1 ge5 ren2'],
['彼岸消失了,脚下也不稳。', "Bi3'an4 xiao1shi1 le5, jiao3 xia4 ye3 bu4 wen3."],
['母亲安抚着她在哭的婴儿', "Mu3qin1 an1fu3 zhe5 ta1 zai4 ku1 de5 ying1'er2"],
['一个人藏,十个人找。', 'Yi1 ge5 ren2 cang2, shi2 ge5 ren2 zhao3.'],
Expand Down
46 changes: 46 additions & 0 deletions tests/Unit/Hanzi/HanziSentence/HanziSentenceNumeralMeasureTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?php

namespace Pinyin\Tests\Unit\Hanzi\HanziSentence;

use PHPUnit\Framework\TestCase;
use Pinyin\Hanzi\HanziSentence;

/**
 * Checks the pinyin produced for sentences in which a run of hanzi numerals
 * is followed by the measure word 个.
 *
 * The expected strings pin two things: each numeral in the run is rendered
 * as its own space-separated syllable, and 个 is rendered as "ge5".
 */
class HanziSentenceNumeralMeasureTest extends TestCase
{
    /**
     * @param string $sentence       Hanzi input containing numerals and 个.
     * @param string $expectedPinyin Exact pinyin string expected for it.
     *
     * @dataProvider numeralMeasurePinyinProvider
     */
    public function testNumeralMeasurePinyin(
        string $sentence,
        string $expectedPinyin
    ): void {
        // Given a hanzi sentence containing numerals and a measure word;
        $hanziSentence = new HanziSentence($sentence);

        // When we convert it to pinyin;
        $pinyin = $hanziSentence->asPinyin();

        // Then we should get the correct pinyin. Both operands are strings,
        // so compare strictly rather than with loose equality.
        self::assertSame($expectedPinyin, (string) $pinyin);
    }

    /**
     * Sentence / expected-pinyin pairs, keyed by a description of the case
     * so that a failing data set is identifiable in the test output.
     *
     * @return array<string, array{string, string}>
     */
    public static function numeralMeasurePinyinProvider(): array
    {
        return [
            'long numeral run with 亿 and 万 before 个' => [
                '一亿三千两百万八千六百七十二个东西',
                'Yi1 yi4 san1 qian1 liang3 bai3 wan4 ba1 qian1 liu4 bai3 qi1 shi2 er4 ge5 dong1xi1',
            ],
            'short numeral run before 个' => [
                '两百五十一个人',
                'Liang3 bai3 wu3 shi2 yi1 ge5 ren2',
            ],
        ];
    }
}

0 comments on commit 561f70d

Please sign in to comment.