Skip to content

Commit

Permalink
Merge pull request #52 from chineseboost/hg/better-quote-handling
Browse files: browse the repository at this point in the history
Improve quote handling
  • Loading branch information
hughgrigg authored Oct 30, 2021
2 parents 1c9d2e2 + b5e0ade commit 668ef20
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 3 deletions.
2 changes: 1 addition & 1 deletion data/02_pinyin.data

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions scripts/build_data_files.php
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@
'亦说' => 'yi4 yue4',
'亦說' => 'yi4 yue4',
'睡着' => 'shui4zhao2',
'还要' => 'hai2yao4',
];
$pronouns = [
'' => 'wo3',
Expand Down
16 changes: 15 additions & 1 deletion src/Hanzi/Conversion/FurthestForwardMatching.php
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,17 @@ static function (string $section): string {
$pinyin = preg_replace('/\s+/u', ' ', $pinyin);
$pinyin = preg_replace('/\s([,!?.:)])/u', '$1', $pinyin);
$pinyin = preg_replace('/([(])\s/u', '$1', $pinyin);
$pinyin = preg_replace('/(["\'“‘])\s+/u', '$1', $pinyin);
$pinyin = preg_replace('/\s+([”’"\'])/u', '$1', $pinyin);
$pinyin = preg_replace('/([”’])(\S)/u', '$1 $2', $pinyin);
$pinyin = preg_replace('/(["\']){2}/u', '$1 $1', $pinyin);
$pinyin = preg_replace_callback(
'/(^["\']|[\s"]["\']|[“‘])([a-zA-Z])/u',
static function (array $matches): string {
return $matches[1].mb_strtoupper($matches[2]);
},
$pinyin
);
$pinyin = PinyinYear::replaceYears($pinyin);
$firstChar = mb_strtoupper(mb_substr($pinyin, 0, 1));
$rest = mb_substr($pinyin, 1);
Expand All @@ -64,7 +75,10 @@ private static function furthestForwardMatching(string $subject): string
}
continue;
}
if ($pos === 1 && $pinyin && ($furthestForward === '' || $furthestForward === '')) {
if ($pos === 1 && $pinyin
&& ($furthestForward === ''
|| $furthestForward
=== '')) {
$pinyin = preg_replace('/([0-5]?)\s*$/u', 'r$1 ', $pinyin, 1);
$subject = mb_substr($subject, 1);
break;
Expand Down
2 changes: 1 addition & 1 deletion src/PinyinSentence.php
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ static function (HtmlAble $element): string {

return trim(
<<<HTML
<span class="pinyin sentence" lang="zh-Latn-CN-pinyin">{$elementsHtml}</span>
<span class="pinyin sentence" lang="zh-Latn-CN-pinyin">$elementsHtml</span>
HTML
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ public function furthestForwardMatchingProvider(): array
['母亲安抚着她在哭的婴儿', "Mu3qin1 an1fu3 zhe5 ta1 zai4 ku1 de5 ying1'er2"],
['一个人藏,十个人找。', 'Yi1 ge5 ren2 cang2, shi2 ge5 ren2 zhao3.'],
['他这么快地把工作做完', 'Ta1 zhe4me5 kuai4 de5 ba3 gong1zuo4 zuo4 wan2'],
['“还要咖啡吗?”“不了,谢谢。”', '“Hai2yao4 ka1fei1 ma5?” “Bu4le5, xie4xie5.”'],
[
'科学家的工作就是对理论加以检验。',
'Ke1xue2jia1 de5 gong1zuo4 jiu4shi4 dui4 li3lun4 jia1yi3 jian3yan4.',
Expand Down
4 changes: 4 additions & 0 deletions tests/Unit/Hanzi/HanziSentence/HanziSentenceElementsTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ public function elementsProvider(): array
'如果你喝速溶咖啡,就不需要咖啡壶了。',
['如果', '', '', '速溶', '咖啡', '', '', '', '需要', '咖啡', '', '', ''],
],
[
'“还要咖啡吗?”“不了,谢谢。”',
['', '还要', '咖啡', '', '?”', '', '不了', '', '谢谢', '。”'],
],
];
}
}
44 changes: 44 additions & 0 deletions tests/Unit/Hanzi/HanziSentence/HanziSentenceQuoteMarksTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<?php

namespace Pinyin\Tests\Unit\Hanzi\HanziSentence;

use PHPUnit\Framework\TestCase;
use Pinyin\Hanzi\HanziSentence;

/**
 * Checks the pinyin produced for hanzi sentences that contain quotation marks.
 */
class HanziSentenceQuoteMarksTest extends TestCase
{
    /**
     * Converts a quoted hanzi sentence to pinyin and compares the result,
     * cast to string, against the exact expected text.
     *
     * @param string $sentence       Hanzi input containing quotation marks.
     * @param string $expectedPinyin Exact pinyin string expected after conversion.
     *
     * @dataProvider quoteMarkSpacingProvider
     */
    public function testQuoteMarkSpacing(string $sentence, string $expectedPinyin): void
    {
        // Given a hanzi sentence;
        $hanziSentence = new HanziSentence($sentence);

        // When we convert it to pinyin;
        $pinyin = $hanziSentence->asPinyin();

        // Then the spacing should be correct. assertSame is used so the two
        // strings are compared strictly rather than via loose scalar equality.
        self::assertSame($expectedPinyin, (string) $pinyin);
    }

    /**
     * Supplies [hanzi sentence, expected pinyin] pairs. Each data set is keyed
     * by a label so a failing case is identified by name in the test output.
     *
     * @return array[]
     */
    public static function quoteMarkSpacingProvider(): array
    {
        return [
            'curly double quotes' => [
                '“还要咖啡吗?”“不了,谢谢。”',
                '“Hai2yao4 ka1fei1 ma5?” “Bu4le5, xie4xie5.”',
            ],
            'straight double quotes' => [
                '"还要咖啡吗?""不了,谢谢。"',
                '"Hai2yao4 ka1fei1 ma5?" "Bu4le5, xie4xie5."',
            ],
        ];
    }
}

0 comments on commit 668ef20

Please sign in to comment.