Skip to content

Commit

Permalink
Reimplement TR29: word boundaries.
Browse files Browse the repository at this point in the history
  • Loading branch information
eddieantonio committed Jan 4, 2016
1 parent 995eaa4 commit 6ac712d
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 41 deletions.
169 changes: 143 additions & 26 deletions Modules/word.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@

#include "word_break_property.h"

/* State type for the boundary scanner; WB_START is its only value. */
typedef enum {
WB_START,
} wb_state;
/* Macros defined in http://unicode.org/reports/tr29/#Word_Boundary_Rules
 *
 * Each macro tests a wb_property value against a pair of properties.  The
 * argument is fully parenthesized so that compound expressions (for example
 * a conditional expression) are compared as a whole.  The argument is
 * evaluated up to twice, so pass only side-effect-free expressions. */
#define AHLetter(prop) ((prop) == ALetter || (prop) == Hebrew_Letter)
#define MidNumLetQ(prop) ((prop) == MidNumLet || (prop) == Single_Quote)

/* Macro used in WB4 */
#define ExtendOrFormat(prop) ((prop) == Extend || (prop) == Format)

/**********************************************************************/

/* Recursive binary search for the propery. */
/* Recursive binary search for the property. */
static wb_property search_for_property(code_point, left, right)
Charvalue code_point;
size_t left, right;
Expand All @@ -66,6 +66,8 @@ static wb_property search_for_property(code_point, left, right)
}
/**********************************************************************/

/* Returns the word break property for a Unicode code point. */
__attribute__ ((const))
static wb_property property(code_point)
Charvalue code_point;
{
Expand All @@ -74,49 +76,167 @@ static wb_property property(code_point)
}
/**********************************************************************/

/* WB4: Skip over Extend and Format characters.
 *
 * Steps forward from `from` and yields the first following node whose word
 * break property is neither Extend nor Format.  Yields NULL when `from` is
 * NULL or when the list ends before such a node is found. */
__attribute__ ((pure))
static Char* skip_to_next(from)
    Char *from;
{
    Char *cursor;

    if (from == NULL) {
        return NULL;
    }

    /* Always move at least one node, then keep going past skippable ones. */
    for (cursor = from->next; cursor != NULL; cursor = cursor->next) {
        if (!ExtendOrFormat(property(cursor->value))) {
            break;
        }
    }
    return cursor;
}

/* Applies skip_to_next() two times in a row: the result is the second
 * non-skipped node after `from`, or NULL if the list ends first. */
__attribute__ ((pure))
static Char* skip_twice(from)
    Char *from;
{
    Char *once = skip_to_next(from);

    return skip_to_next(once);
}

/* Word break property of a list node.  A NULL node stands for "past the end
 * of the text" and maps to the eot pseudo-property. */
__attribute__ ((pure))
static wb_property char_property(node)
    const Char *node;
{
    return (node != NULL) ? property(node->value) : eot;
}

/*
 * Returns the last Char* BEFORE the next word boundary, scanning forward
 * from `start`.  Note that this may just be the same character as was given.
 * Returns NULL when `start` is NULL.
 *
 * Implements the Unicode TR29 Word Boundary Rules:
 * http://unicode.org/reports/tr29/#Word_Boundary_Rules
 *
 * Every iteration looks at a window of four properties:
 *   lookbehind - property of the node visited on the previous iteration
 *                (sot on the first iteration)
 *   left       - property of the current node
 *   right      - property of the next node, skipping Extend/Format
 *                (eot when there is none)
 *   lookahead  - property of the node after `right`, skipping Extend/Format
 * A rule that forbids a break between `left` and `right` continues the loop;
 * a rule that demands one returns the current node.
 *
 * NOTE(review): `current` itself advances one node at a time, so an Extend or
 * Format character can become `left`.  WB4 says such characters should behave
 * like the preceding base character -- confirm whether this is intended.
 */
static Char* find_next_boundary(start)
    Char *start;
{
    /* The first thing the loop does is advance, so we need a dummy "start of
     * loop" character that does not participate in the search.  Only its
     * `next` field is ever read. */
    Char dummy;
    Char *current = &dummy;
    wb_property left = sot, right, lookahead, lookbehind;

    /* WB1: Break at the start and end of text. */
    if (start == NULL) {
        return NULL;
    }
    dummy.next = start;

    /* WB2: Break at the start and end of text. */
    while (current->next != NULL) {
        /* Advance all the pointers. */
        current = current->next;
        lookbehind = left;
        left = char_property(current);
        right = char_property(skip_to_next(current));
        lookahead = char_property(skip_twice(current));

        /* WB3: Do not break within CRLF. */
        if (left == CR && right == LF) continue;

        /* WB3a: Otherwise break before and after newlines */
        if (left == Newline || left == CR || left == LF) return current;
        /* WB3b */
        if (right == Newline || right == CR || right == LF) return current;

        /* Ignore Format and Extend characters, except when they appear at the
         * beginning of a region of text. */
        /* WB4: handled by skip_to_next() and skip_twice(). */

        /* WB5: Do not break between most letters. */
        if (AHLetter(left) && AHLetter(right)) continue;

        /* WB6: Do not break letters across certain punctuation.  The
         * punctuation only joins when another letter follows it, hence the
         * check on `lookahead`. */
        if (AHLetter(left) &&
            (right == MidLetter || MidNumLetQ(right)) &&
            AHLetter(lookahead)) continue;
        /* WB7 */
        if (AHLetter(lookbehind) &&
            (left == MidLetter || MidNumLetQ(left)) &&
            AHLetter(right)) continue;
        /* WB7a */
        if (left == Hebrew_Letter && right == Single_Quote) continue;
        /* WB7b */
        if (left == Hebrew_Letter &&
            right == Double_Quote &&
            lookahead == Hebrew_Letter) continue;
        /* WB7c */
        if (lookbehind == Hebrew_Letter &&
            left == Double_Quote &&
            right == Hebrew_Letter) continue;

        /* WB8: Do not break within sequences of digits, or digits adjacent to
         * letters ("3a", or "A3"). */
        if (left == Numeric && right == Numeric) continue;
        /* WB9 */
        if (AHLetter(left) && right == Numeric) continue;
        /* WB10 */
        if (left == Numeric && AHLetter(right)) continue;

        /* WB11: Do not break within sequences such as "3.2" or "3,456.789" */
        if (lookbehind == Numeric &&
            (left == MidNum || MidNumLetQ(left)) &&
            right == Numeric) continue;
        /* WB12 */
        if (left == Numeric &&
            (right == MidNum || MidNumLetQ(right)) &&
            lookahead == Numeric) continue;

        /* WB13: Do not break between Katakana. */
        if (left == Katakana && right == Katakana) continue;

        /* WB13a: Do not break from extenders. */
        if ((AHLetter(left) ||
             left == Numeric ||
             left == Katakana ||
             left == ExtendNumLet) &&
            right == ExtendNumLet) continue;
        /* WB13b */
        if (left == ExtendNumLet &&
            (AHLetter(right) ||
             right == Numeric ||
             right == Katakana ||
             right == ExtendNumLet)) continue;

        /* WB13c: Do not break between regional indicator symbols. */
        if (left == Regional_Indicator &&
            right == Regional_Indicator) continue;

        /* WB14: Otherwise, break everywhere (including around ideographs). */
        return current;
    }

    /* Ran off the end of the list: the last node closes the final word. */
    return current;
}
/**********************************************************************/

static size_t find_utf8_length(start, after)
Char *start, *after;
const Char *start, *after;
{
Char *current;
size_t len = 0, cplen = 0;
const Char *current;
size_t len = 0, encoded_length = 0;
utf8proc_uint8_t dummy_buffer[4];


for (current = start; current != after; current = current->next) {
assert(current != NULL);
/* Do a dummy encoding of the character for the side-effect of
* returning its length. */
cplen = utf8proc_encode_char(current->value, dummy_buffer);
assert(cplen > 0);
encoded_length = utf8proc_encode_char(current->value, dummy_buffer);
assert(encoded_length > 0);

len += cplen;
len += encoded_length;
}

return len;
Expand Down Expand Up @@ -170,16 +290,13 @@ static Boolean is_word_start(value)
case UTF8PROC_CATEGORY_LT: /**< Letter, titlecase */
case UTF8PROC_CATEGORY_LM: /**< Letter, modifier */
case UTF8PROC_CATEGORY_LO: /**< Letter, other */
case UTF8PROC_CATEGORY_MN: /**< Mark, nonspacing */
case UTF8PROC_CATEGORY_MC: /**< Mark, spacing combining */
case UTF8PROC_CATEGORY_ME: /**< Mark, enclosing */
case UTF8PROC_CATEGORY_ND: /**< Number, decimal digit */
case UTF8PROC_CATEGORY_NL: /**< Number, letter */
case UTF8PROC_CATEGORY_NO: /**< Number, other */
case UTF8PROC_CATEGORY_PC: /**< Punctuation, connector */
case UTF8PROC_CATEGORY_SC: /**< Symbol, currency */
case UTF8PROC_CATEGORY_SK: /**< Symbol, modifier */
case UTF8PROC_CATEGORY_SO: /**< Symbol, other */
case UTF8PROC_CATEGORY_MN : /**< Mark, nonspacing */
case UTF8PROC_CATEGORY_MC : /**< Mark, spacing combining */
case UTF8PROC_CATEGORY_ME : /**< Mark, enclosing */
/* Allow for private use characters. */
case UTF8PROC_CATEGORY_CO: /**< Other, private use */
return True;
Expand Down
20 changes: 11 additions & 9 deletions Modules/word_break_property.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,24 @@

/* Word break property values.
 *
 * The enumerator order mirrors CATEGORY_NAMES in
 * Supplement/generate_word_break.py, which generates this enum.  Each name
 * appears exactly once.
 *
 * sot and eot are pseudo-properties used by the boundary scanner: sot is the
 * initial "left" value before any character has been read, and eot is what a
 * missing (NULL) character maps to. */
typedef enum {
    Other,
    CR,
    LF,
    Newline,
    Extend,
    Regional_Indicator,
    Format,
    Katakana,
    Hebrew_Letter,
    ALetter,
    Single_Quote,
    Double_Quote,
    MidNumLet,
    MidLetter,
    MidNum,
    Numeric,
    ExtendNumLet,
    sot,
    eot
} wb_property;

typedef struct {
Expand Down
26 changes: 24 additions & 2 deletions Supplement/generate_word_break.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,28 @@
};
'''.lstrip()

# Names handed to generate_enum() for the wb_property enum, in enumerator
# order.  'sot' and 'eot' come last.
CATEGORY_NAMES = [
    'Other',
    'CR',
    'LF',
    'Newline',
    'Extend',
    'Regional_Indicator',
    'Format',
    'Katakana',
    'Hebrew_Letter',
    'ALetter',
    'Single_Quote',
    'Double_Quote',
    'MidNumLet',
    'MidLetter',
    'MidNum',
    'Numeric',
    'ExtendNumLet',
    'sot',
    'eot',
]

def open_word_break_file():
filename = 'WordBreakProperty.txt.gz'
path = os.path.join(os.path.dirname(__file__), filename)
Expand Down Expand Up @@ -94,12 +116,12 @@ def parse_lines(word_break_file):
yield contents

def to_c_header(values):
category_names = ['Other'] + list(set([category for _, category in values]))
assert set(category for _, category in values).issubset(set(CATEGORY_NAMES))
values.sort(key=lambda c: c[0][0])

yield PROLOGUE
yield '\n'
yield generate_enum('wb_property', category_names)
yield generate_enum('wb_property', CATEGORY_NAMES)
yield '\n'
yield STRUCT_DEF
yield '\n'
Expand Down
12 changes: 8 additions & 4 deletions Tests/word_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ TEST find_words_returns_nfc() {
char pho_nfc[] = { 'p', 'h', 0xE1, 0xBB, 0x9F, 0 };
/* With two combining characters. */
cstring_to_text(text, (char []) { 'p', 'h', 'o',
0xCC, 0x9B, // ◌̛
0xCC, 0x89, // ◌̉
0xCC, 0x9B, /* ◌̛ */
0xCC, 0x89, /* ◌̉ */
0 });
find_words(wordlist, text);

Expand Down Expand Up @@ -65,7 +65,7 @@ TEST find_words_segments_english_with_punctuation() {
ASSERT_STR_EQ("quick", wordlist->second->string);
ASSERT_STR_EQ("brown", wordlist->third->string);
ASSERT_STR_EQ("fox", wordlist->fourth->string);
ASSERT_STR_EQ("can't", wordlist->fifth->string);
ASSERT_STR_EQ("cant", wordlist->fifth->string);
ASSERT_STR_EQ("jump", wordlist->sixth->string);
ASSERT_STR_EQ("32.3", wordlist->seventh->string);
ASSERT_STR_EQ("right", wordlist->nineth->string);
Expand Down Expand Up @@ -125,6 +125,10 @@ TEST find_words_segments_japanese() {
#undef third
#undef fourth
#undef fifth
#undef sixth
#undef seventh
#undef eighth
#undef nineth

static void setup_find_words(void *unused) {
initialize_texts((Text*[]) {text, NULL});
Expand All @@ -143,7 +147,7 @@ SUITE(find_words_suite) {
RUN_TEST(find_words_segments_a_single_ascii_word);
RUN_TEST(find_words_returns_nfc);
RUN_TEST(find_words_returns_zero_when_not_given_words);
/*RUN_TEST(find_words_segments_english_with_punctuation);*/
RUN_TEST(find_words_segments_english_with_punctuation);

/* Older tests. May still be useful... */
/*RUN_TEST(find_words_segments_spanish_words);*/
Expand Down

0 comments on commit 6ac712d

Please sign in to comment.