Support for surname prefixes and hyphenated names.

raycardillo · Dec 17, 2019 · f874d5b · f874d5b
1 parent e8a4eaf
commit f874d5b
Show file tree

Hide file tree

Showing 10 changed files with 280 additions and 70 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.1.0-dev.4
+
+* Refactored soundex again to support surname prefixes and hyphenated names
+* More tests added
+
 ## 0.1.0-dev.3
 
 * Refactored soundex for more variants

diff --git a/README.md b/README.md
@@ -1,2 +1,20 @@
-# dart_phonetics
-Phonetic Algorithms for Dart.
+Dart Phonetics
+==============
+
+[![Project license](https://img.shields.io/badge/license-Apache%202.0-informational)](http://www.apache.org/licenses/LICENSE-2.0)
+[![Pub package](https://img.shields.io/pub/v/dart_phonetics)](https://pub.dev/packages/dart_phonetics)
+[![Dartdoc reference](https://img.shields.io/badge/dartdoc-reference-blue)](https://pub.dev/documentation/dart_phonetics/latest/)
+
+A collection of phonetic algorithms for [Dart](https://dart.dev/) and [Flutter](https://flutter.dev/). These algorithms help find words or names that sound similar by generating an encoding that can be compared or indexed for fuzzy searching.
+
+
+## Algorithms Implemented
+
+- **American Soundex** - A highly configurable implementation of the Soundex algorithm. There are better algorithms available, but this algorithm is classic, and is required when analyzing American surnames in genealogy or census data.
+- **Refined Soundex** - A variation of Soundex that is better suited to applications such as spell checking. It uses a mapping that aims to be more precise and does not truncate to 4 characters by default.
+- ***More Under Development***
+
+
+### _Work In Progress_
+
+_This project is a work in progress that is being developed because I need these algorithms for another project. I'll spend time implementing more phonetic algorithms depending on demand, need, or community interest._
diff --git a/example/main.dart b/example/main.dart
@@ -2,15 +2,19 @@ import 'package:dart_phonetics/dart_phonetics.dart';
 
 void _printResult(Object encoder, String input, PhoneticEncoding encoding) {
   print('${encoder?.runtimeType?.toString()} - "$input"\n  primary = ${encoding
-      ?.primary}\n  alternate = ${encoding?.alternate}\n');
+      ?.primary}\n  alternate = ${encoding?.alternates}\n');
 }
 
 void main() {
-  final inputString = 'Cardillo';
+  final inputString = 'Cardillo-Ashcroft';
 
   final soundex = Soundex.americanEncoder;
   _printResult(soundex, inputString, soundex.encode(inputString));
 
+  final customSoundex = Soundex.fromMapping(Soundex.americanMapping,
+      maxLength: null, paddingEnabled: false, ignoreHW: false);
+  _printResult(customSoundex, inputString, customSoundex.encode(inputString));
+
   final refinedSoundex = RefinedSoundex.defaultEncoder;
   _printResult(refinedSoundex, inputString, refinedSoundex.encode(inputString));
 }
diff --git a/lib/src/encoder.dart b/lib/src/encoder.dart
@@ -42,19 +42,22 @@ class PhoneticEncoderException implements Exception {
   }
 }
 
-/// The common interface for all phonetic encoders.
+/// A data class that provides a [primary] encoding as well as a set
+/// of _optional_ [alternates].
 class PhoneticEncoding {
   /// The primary phonetic encoding.
   final String primary;
 
   /// The alternative phonetic encodings, for algorithms that support this.
-  final String alternate;
+  final Set<String> alternates;
 
-  PhoneticEncoding(this.primary, [this.alternate]);
+  /// Creates an instance of this data class.
+  PhoneticEncoding(this.primary, [this.alternates]);
 
+  /// Returns a [String] that's useful for debugging or diagnostics.
   @override
   String toString() {
-    return 'PhoneticEncoding{primary=$primary, alternate=$alternate}';
+    return 'PhoneticEncoding{primary=$primary, alternates=$alternates}';
   }
 }
 

diff --git a/lib/src/refined_soundex.dart b/lib/src/refined_soundex.dart
@@ -27,6 +27,10 @@ import 'package:dart_phonetics/src/soundex.dart';
 /// support other languages or character sets by providing a custom mapping.
 ///
 /// See [Soundex] for more background and references to Soundex algorithms.
+///
+/// A good description of Refined Soundex can be found here:
+/// - https://web.archive.org/web/20010513121003/http://www.bluepoof.com:80/Soundex/info2.html
+/// - http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
 class RefinedSoundex implements PhoneticEncoder {
   /// The character mapping to use when encoding. A value of [$nul] means
   /// ignore the input character and do not encode it (e.g., vowels).

diff --git a/lib/src/soundex.dart b/lib/src/soundex.dart
@@ -28,9 +28,10 @@ import 'package:dart_phonetics/src/utils.dart';
 /// sure you know which variant you need if working with existing data. The
 /// most notable exceptions are in census data and SQL implementations.
 ///
-/// The implementation of this class is unique because it uses a common
-/// strategy that is configurable to support many variants. In particular,
-/// it's possible to use this strategy for other languages or character sets.
+/// The implementation of this class is unique because it uses a mapping
+/// strategy with several configurable behaviors that can be enabled or
+/// disabled to support many variants. It's also possible to use a custom
+/// mapping for other languages or character sets.
 ///
 /// For convenience, there are several static instances available for some of
 /// the more common implementations:
@@ -44,17 +45,27 @@ import 'package:dart_phonetics/src/utils.dart';
 /// - [genealogyEncoder] - Implements the rules from the _genealogy.com_
 /// (https://www.genealogy.com/articles/research/00000060.html) website. This
 /// is the same as the [americanEncoder] but ignored characters are not
-/// tracked and are completely ignored instead.
+/// tracked for consonant breaks and are completely ignored instead.
 ///
 /// If you want (or need) to understand more details, here are some good
 /// references that help explain the history and variants:
+/// - https://web.archive.org/web/20011107131342/http://www.bluepoof.com/Soundex/info.html
 /// - http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
 /// - https://west-penwith.org.uk/misc/soundex.htm
 class Soundex implements PhoneticEncoder {
   /// The character mapping to use when encoding. A value of [$nul] means
   /// ignore the input character and do not encode it (e.g., vowels).
   final Map<int, int> soundexMapping;
 
+  /// Indicates that prefix processing is enabled (and will be returned in
+  /// [PhoneticEncoding.alternates] when available). This also detects the
+  /// second part of a double-barrelled name.
+  final bool prefixesEnabled;
+
+  /// Indicates that hyphenated parts processing is enabled. When enabled,
+  /// any parts that are found are also encoded and returned as alternates.
+  final bool hyphenatedPartsEnabled;
+
   /// Indicates if [$H] and [$W] should be completely ignored and not mapped
   /// at all. This is a special case for some census data.
   final bool ignoreHW;
@@ -120,29 +131,61 @@ class Soundex implements PhoneticEncoder {
   //#region Constructors
 
   /// Private constructor for initializing an instance.
-  Soundex._internal(this.soundexMapping, this.ignoreHW, this.trackIgnored,
-      this.maxLength, this.paddingChar, this.paddingEnabled);
+  Soundex._internal(
+      this.soundexMapping,
+      this.prefixesEnabled,
+      this.hyphenatedPartsEnabled,
+      this.ignoreHW,
+      this.trackIgnored,
+      this.maxLength,
+      this.paddingChar,
+      this.paddingEnabled);
 
   /// Creates a custom Soundex instance. This constructor can be used to
   /// provide custom mappings for non-Western character sets, etc.
   factory Soundex.fromMapping(final Map<int, int> soundexMapping,
-          {bool ignoreHW = true,
+          {bool prefixesEnabled = true,
+          bool hyphenatedPartsEnabled = true,
+          bool ignoreHW = true,
           bool trackIgnored = true,
           int maxLength = 4,
           int paddingChar = $0,
           bool paddingEnabled = true}) =>
-      Soundex._internal(Map.unmodifiable(soundexMapping), ignoreHW,
-          trackIgnored, maxLength, paddingChar, paddingEnabled);
+      Soundex._internal(
+          Map.unmodifiable(soundexMapping),
+          prefixesEnabled,
+          hyphenatedPartsEnabled,
+          ignoreHW,
+          trackIgnored,
+          maxLength,
+          paddingChar,
+          paddingEnabled);
 
   /// Gets the [americanEncoder] instance of the Soundex encoder by default.
   factory Soundex() => americanEncoder;
 
   //#endregion
 
-  /// Returns a [PhoneticEncoding] for the [input] String.
+  /// Strips one leading well-known surname prefix (matched ignoring case),
+  /// together with any whitespace after it, from [input]. The prefix list is
+  /// open to interpretation; see the NARA specification and this reference:
+  /// http://www.genealogyintime.com/GenealogyResources/Articles/what_is_soundex_and_how_does_soundex_work_page2.html
+  String _trimPrefixes(String input) {
+    final prefixPattern = RegExp(
+        r"^(Con|Dela|De La|Di|Du|De|D'|La|Le|L'|Van|Von)\s*",
+        caseSensitive: false);
+    return input.replaceFirst(prefixPattern, '');
+  }
+
+  /// Splits [input] on hyphens, discarding any whitespace around each hyphen,
+  /// and returns every part in order. An input without a hyphen comes back
+  /// as a single-element list.
+  List<String> _splitHyphenatedParts(String input) =>
+      input.split(RegExp(r'\s*-\s*'));
+
+  /// Returns a single encoding for the [input] String.
   /// Returns `null` if the input is `null` or empty (after cleaning up).
-  @override
-  PhoneticEncoding encode(String input) {
+  String _encode(String input) {
     // clean up the input and convert to uppercase
     input = PhoneticUtils.clean(input);
     if (input == null) {
@@ -194,6 +237,63 @@ class Soundex implements PhoneticEncoder {
       }
     }
 
-    return PhoneticEncoding(soundex.toString());
+    return soundex.toString();
+  }
+
+  /// Adds an encoding to [alternates] if a known prefix was trimmed from
+  /// [input] and the remainder still encodes to something. A name that is
+  /// nothing but a prefix (e.g., "Van") leaves [alternates] untouched,
+  /// because [_encode] returns `null` for an empty remainder.
+  void _addTrimmedPrefixToAlternates(
+      final Set<String> alternates, final String input) {
+    final trimmed = _trimPrefixes(input);
+    if (trimmed.length < input.length) {
+      final encoded = _encode(trimmed);
+      if (encoded != null) {
+        alternates.add(encoded);
+      }
+    }
+  }
+
+  /// Returns a [PhoneticEncoding] for the [input] String.
+  /// Returns `null` if the input is `null` or empty (after cleaning up).
+  ///
+  /// When [hyphenatedPartsEnabled] is set, the part before the first hyphen
+  /// provides the primary encoding and every later part is encoded as an
+  /// alternate. When [prefixesEnabled] is set, each part is additionally
+  /// encoded with its surname prefix trimmed and added as an alternate.
+  @override
+  PhoneticEncoding encode(String input) {
+    if (input == null || input.isEmpty) {
+      return null;
+    }
+
+    // splitting always yields at least one part, so `parts.first` is safe
+    final parts =
+        hyphenatedPartsEnabled ? _splitHyphenatedParts(input) : [input];
+
+    // first we encode the primary part
+    final primary = _encode(parts.first);
+
+    // ignore: prefer_collection_literals
+    final alternates = Set<String>();
+
+    for (var i = 0; i < parts.length; i++) {
+      final part = parts[i];
+      // every part after the first is an alternate in its own right
+      if (i > 0) {
+        alternates.add(_encode(part));
+      }
+      if (prefixesEnabled) {
+        _addTrimmedPrefixToAlternates(alternates, part);
+      }
+    }
+
+    // _encode returns null for a part that is empty after cleaning (e.g.,
+    // the trailing part of "Smith-"), so drop that marker along with the
+    // primary if it made it into the alternates from other parts
+    alternates..remove(null)..remove(primary);
+
+    // nothing could be encoded at all (e.g., whitespace-only input)
+    if (primary == null && alternates.isEmpty) {
+      return null;
+    }
+
+    return PhoneticEncoding(primary, alternates.isEmpty ? null : alternates);
+  }
 }
diff --git a/lib/src/utils.dart b/lib/src/utils.dart
@@ -74,24 +74,21 @@ class PhoneticUtils {
     return String.fromCharCodes(cleanedCodeUnits).toUpperCase();
   }
 
-  /// Encodes [s1] and [s2] using [encoder] and then returns an array
-  /// containing the [differenceEncoded] similarity valude for the
-  /// [PhoneticEncoding.primary] and [PhoneticEncoding.alternate] encodings.
+  /// Encodes [s1] and [s2] using [encoder] and then returns the similarity
+  /// for the [PhoneticEncoding.primary] encoding.
   ///
   /// Despite the name, this is actually a measure of similarity.
   /// This naming is consistent with the SQL `DIFFERENCE` function definition.
-  static List<int> differences(
+  static int primaryDifference(
       final PhoneticEncoder encoder, final String s1, final String s2) {
     final encoding1 = encoder.encode(s1);
     final encoding2 = encoder.encode(s2);
 
-    return [
-      differenceEncoded(encoding1?.primary, encoding2?.primary),
-      differenceEncoded(encoding1?.alternate, encoding2?.alternate),
-    ];
+    return differenceEncoded(encoding1?.primary, encoding2?.primary);
   }
 
-  /// Returns the number of characters that are the same in [e1] and [e2].
+  /// Returns the number of characters that are the same in the [e1] and [e2]
+  /// encoded strings.
   ///
   /// Despite the name, this is actually a measure of similarity.
   /// This naming is consistent with the SQL `DIFFERENCE` function definition.

diff --git a/test/refined_soundex_test.dart b/test/refined_soundex_test.dart
@@ -196,22 +196,24 @@ void main() {
       final soundex = RefinedSoundex();
 
       // Edge cases
-      expect(0, PhoneticUtils.differences(soundex, null, null)[0]);
-      expect(0, PhoneticUtils.differences(soundex, '', '')[0]);
-      expect(0, PhoneticUtils.differences(soundex, ' ', ' ')[0]);
+      expect(0, PhoneticUtils.primaryDifference(soundex, null, null));
+      expect(0, PhoneticUtils.primaryDifference(soundex, '', ''));
+      expect(0, PhoneticUtils.primaryDifference(soundex, ' ', ' '));
 
       // Normal cases
-      expect(6, PhoneticUtils.differences(soundex, 'Smith', 'Smythe')[0]);
-      expect(3, PhoneticUtils.differences(soundex, 'Ann', 'Andrew')[0]);
-      expect(1, PhoneticUtils.differences(soundex, 'Margaret', 'Andrew')[0]);
-      expect(1, PhoneticUtils.differences(soundex, 'Janet', 'Margaret')[0]);
+      expect(6, PhoneticUtils.primaryDifference(soundex, 'Smith', 'Smythe'));
+      expect(3, PhoneticUtils.primaryDifference(soundex, 'Ann', 'Andrew'));
+      expect(1, PhoneticUtils.primaryDifference(soundex, 'Margaret', 'Andrew'));
+      expect(1, PhoneticUtils.primaryDifference(soundex, 'Janet', 'Margaret'));
 
       // Special cases
-      expect(5, PhoneticUtils.differences(soundex, 'Green', 'Greene')[0]);
+      expect(5, PhoneticUtils.primaryDifference(soundex, 'Green', 'Greene'));
+      expect(1,
+          PhoneticUtils.primaryDifference(soundex, 'Blotchet-Halls', 'Greene'));
       expect(
-          1, PhoneticUtils.differences(soundex, 'Blotchet-Halls', 'Greene')[0]);
-      expect(8, PhoneticUtils.differences(soundex, 'Smithers', 'Smythers')[0]);
-      expect(5, PhoneticUtils.differences(soundex, 'Anothers', 'Brothers')[0]);
+          8, PhoneticUtils.primaryDifference(soundex, 'Smithers', 'Smythers'));
+      expect(
+          5, PhoneticUtils.primaryDifference(soundex, 'Anothers', 'Brothers'));
     });
   });
 }