Merge #205: Implement error correction
76d0dae fuzz: add fuzztests that try to correct bech32 and codex32 errors (Andrew Poelstra)
383f788 correction: support erasures (Andrew Poelstra)
2e1b7be implement error correction (Andrew Poelstra)
6c24f98 primitives: introduce the Berlekamp-Massey algorithm for computing linear shift registers (Andrew Poelstra)
fc903d6 field: require TryInto<Base> for ExtensionField (Andrew Poelstra)
4dfe325 field: add ability to multiply by integers (Andrew Poelstra)
74ec75f bech32: use correct generator exponents (Andrew Poelstra)

Pull request description:

  This implements the core algorithms for error correction. In principle this exposes an API which is sufficient for somebody to implement error correction (of both substitutions and erasures). In practice the API is unlikely to be super usable (a rough usage sketch follows the list below), because:

  * We yield error locations as indices from the *end* of the string rather than from the beginning (which we do because the error correction logic doesn't know the original string or even its length).
  * We similarly require that the user indicate the locations of erasures as indices from the end of the string.
  * We yield errors as GF32 offsets to be added to the current character in the string, rather than as correct characters (again, we do this because we don't know the string).
  * There is a situation in which we detectably cannot correct the string, but we yield some "corrections" anyway (to detect this case, we need to notice if the error iterator ends "early", for a technical definition of "early"; this is not too hard, but there is an API question about whether the iterator should yield a `Result` or something similar).
  * We don't have a way for the user to signal erasures other than providing a valid bech32 character and then later telling the correction logic that the location is an erasure. We should be able to parse `?`s or some similar placeholder.
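
  For concreteness, here is a rough sketch of how a caller might drive this API, based on how the new fuzz targets below exercise it. The helper name, the erasure handling, and the translation from end-relative indices and GF32 offsets back to characters are illustrative only, and real code would handle the "iterator ends early" caveat above instead of unwrapping everywhere.

  ```rust
  use bech32::primitives::correction::CorrectableError as _;
  use bech32::primitives::decode::CheckedHrpstring;
  use bech32::{Bech32, Fe32};

  /// Hypothetical helper: try to repair a bech32 string whose checksum fails.
  ///
  /// `erasures` are positions counted from the END of the string (0 = last
  /// character), matching the convention used by the correction context.
  fn correct_bech32(s: &str, erasures: &[usize]) -> String {
      // Decoding fails because the checksum is wrong; the resulting error is
      // what we build a correction context from.
      let mut ctx = CheckedHrpstring::new::<Bech32>(s)
          .unwrap_err()
          .correction_context::<Bech32>()
          .expect("checksum error supports correction");

      // Tell the corrector which positions we already suspect are wrong.
      ctx.add_erasures(erasures);

      // Each item is (distance from the end of the string, GF32 offset to add).
      let mut chars: Vec<char> = s.chars().collect();
      for (from_end, offset) in ctx.bch_errors().expect("within correction capacity") {
          let pos = chars.len() - 1 - from_end;
          chars[pos] = (Fe32::from_char(chars[pos]).unwrap() + offset).to_char();
      }
      chars.into_iter().collect()
  }
  ```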

  There is also some missing functionality:

  * We should be able to correct "burst errors": if the user indicates a long run of erasures all in a row, we should be able to correct up to checksum-length-many of them. (But if there are other errors, we then won't detect them, so I'm unsure what the UX should look like.)
  * Eventually we ought to have a "list decoder" which not only provides a unique best correction when one exists, but always provides a list of "plausible" corrections that the user would then need to check against the blockchain. This would involve a totally different error correction algorithm, and I don't intend to do it in the next several years, but I'm throwing it out there anyway.

  The next PR will be an "error correction API" PR. I would like some guidance from users on what this API should look like.

ACKs for top commit:
  clarkmoody:
    ACK 76d0dae

Tree-SHA512: 83c6e0a261475bfcf23bff0c7911714f4e366222a67881638818ee991dfe7900e8b38ece872a89ddcfa91cb15b89bd90b0d38d3ae87d2d079bda81c8ed4805e3
apoelstra committed Oct 18, 2024
2 parents 3f98190 + 76d0dae commit 3aab51d
Showing 15 changed files with 1,143 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/fuzz.yml
@@ -10,7 +10,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-       fuzz_target: [decode_rnd, encode_decode, parse_hrp]
+       fuzz_target: [berlekamp_massey, correct_bech32, correct_codex32, decode_rnd, encode_decode, parse_hrp]
    steps:
      - name: Install test dependencies
        run: sudo apt-get update -y && sudo apt-get install -y binutils-dev libunwind8-dev libcurl4-openssl-dev libelf-dev libdw-dev cmake gcc libiberty-dev
12 changes: 12 additions & 0 deletions fuzz/Cargo.toml
@@ -17,6 +17,12 @@ bech32 = { path = ".." }
[workspace]
members = ["."]

[[bin]]
name = "berlekamp_massey"
path = "fuzz_targets/berlekamp_massey.rs"

[[bin]]
name = "correct_bech32"
path = "fuzz_targets/correct_bech32.rs"

[[bin]]
name = "correct_codex32"
path = "fuzz_targets/correct_codex32.rs"

[[bin]]
name = "decode_rnd"
path = "fuzz_targets/decode_rnd.rs"
58 changes: 58 additions & 0 deletions fuzz/fuzz_targets/berlekamp_massey.rs
@@ -0,0 +1,58 @@
use bech32::primitives::LfsrIter;
use bech32::Fe32;
use honggfuzz::fuzz;

fn do_test(data: &[u8]) {
    for ch in data {
        if *ch >= 32 {
            return;
        }
    }
    if data.is_empty() || data.len() > 1_000 {
        return;
    }

    let mut iv = Vec::with_capacity(data.len());
    for ch in data {
        iv.push(Fe32::try_from(*ch).unwrap());
    }

    for (i, d) in LfsrIter::berlekamp_massey(&iv).take(data.len()).enumerate() {
        assert_eq!(data[i], d.to_u8());
    }
}

fn main() {
    loop {
        fuzz!(|data| {
            do_test(data);
        });
    }
}

#[cfg(test)]
mod tests {
    fn extend_vec_from_hex(hex: &str, out: &mut Vec<u8>) {
        let mut b = 0;
        for (idx, c) in hex.as_bytes().iter().filter(|&&c| c != b'\n').enumerate() {
            b <<= 4;
            match *c {
                b'A'..=b'F' => b |= c - b'A' + 10,
                b'a'..=b'f' => b |= c - b'a' + 10,
                b'0'..=b'9' => b |= c - b'0',
                _ => panic!("Bad hex"),
            }
            if (idx & 1) == 1 {
                out.push(b);
                b = 0;
            }
        }
    }

    #[test]
    fn duplicate_crash() {
        let mut a = Vec::new();
        extend_vec_from_hex("00", &mut a);
        super::do_test(&a);
    }
}
112 changes: 112 additions & 0 deletions fuzz/fuzz_targets/correct_bech32.rs
@@ -0,0 +1,112 @@
use std::collections::HashMap;

use bech32::primitives::correction::CorrectableError as _;
use bech32::primitives::decode::CheckedHrpstring;
use bech32::{Bech32, Fe32};
use honggfuzz::fuzz;

// coinbase output of block 862290
static CORRECT: &[u8; 62] = b"bc1qwzrryqr3ja8w7hnja2spmkgfdcgvqwp5swz4af4ngsjecfz0w0pqud7k38";

fn do_test(data: &[u8]) {
    if data.is_empty() || data.len() % 2 == 1 {
        return;
    }

    let mut any_actual_errors = false;
    let mut e2t = 0;
    let mut erasures = Vec::with_capacity(CORRECT.len());
    // Start with a correct string
    let mut hrpstring = *CORRECT;
    // ..then mangle it
    let mut errors = HashMap::with_capacity(data.len() / 2);
    for sl in data.chunks_exact(2) {
        let idx = usize::from(sl[0]) & 0x7f;
        if idx >= CORRECT.len() - 3 {
            return;
        }
        let offs = match Fe32::try_from(sl[1]) {
            Ok(fe) => fe,
            Err(_) => return,
        };

        hrpstring[idx + 3] =
            (Fe32::from_char(hrpstring[idx + 3].into()).unwrap() + offs).to_char() as u8;

        if errors.insert(CORRECT.len() - (idx + 3) - 1, offs).is_some() {
            return;
        }
        if sl[0] & 0x80 == 0x80 {
            // We might push "dummy" errors which are erasures that aren't actually wrong.
            // If we do this too many times, we'll exceed the singleton bound so correction
            // will fail, but as long as we're within the bound everything should "work",
            // in the sense that there will be no crashes and the error corrector will
            // just yield an error with value Q.
            erasures.push(CORRECT.len() - (idx + 3) - 1);
            e2t += 1;
            if offs != Fe32::Q {
                any_actual_errors = true;
            }
        } else if offs != Fe32::Q {
            any_actual_errors = true;
            e2t += 2;
        }
    }
    // We need _some_ errors.
    if !any_actual_errors {
        return;
    }

    let s = unsafe { core::str::from_utf8_unchecked(&hrpstring) };
    let mut correct_ctx = CheckedHrpstring::new::<Bech32>(s)
        .unwrap_err()
        .correction_context::<Bech32>()
        .unwrap();

    correct_ctx.add_erasures(&erasures);

    let iter = correct_ctx.bch_errors();
    if e2t <= 3 {
        for (idx, fe) in iter.unwrap() {
            assert_eq!(errors.remove(&idx), Some(fe));
        }
        for val in errors.values() {
            assert_eq!(*val, Fe32::Q);
        }
    }
}

fn main() {
    loop {
        fuzz!(|data| {
            do_test(data);
        });
    }
}

#[cfg(test)]
mod tests {
    fn extend_vec_from_hex(hex: &str, out: &mut Vec<u8>) {
        let mut b = 0;
        for (idx, c) in hex.as_bytes().iter().filter(|&&c| c != b'\n').enumerate() {
            b <<= 4;
            match *c {
                b'A'..=b'F' => b |= c - b'A' + 10,
                b'a'..=b'f' => b |= c - b'a' + 10,
                b'0'..=b'9' => b |= c - b'0',
                _ => panic!("Bad hex"),
            }
            if (idx & 1) == 1 {
                out.push(b);
                b = 0;
            }
        }
    }

    #[test]
    fn duplicate_crash() {
        let mut a = Vec::new();
        extend_vec_from_hex("04010008", &mut a);
        super::do_test(&a);
    }
}
137 changes: 137 additions & 0 deletions fuzz/fuzz_targets/correct_codex32.rs
@@ -0,0 +1,137 @@
use std::collections::HashMap;

use bech32::primitives::correction::CorrectableError as _;
use bech32::primitives::decode::CheckedHrpstring;
use bech32::{Checksum, Fe1024, Fe32};
use honggfuzz::fuzz;

/// The codex32 checksum algorithm, defined in BIP-93.
///
/// Used in this fuzztest because it can correct up to 4 errors, vs bech32 which
/// can correct only 1. Should exhibit more interesting behavior.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Codex32 {}

impl Checksum for Codex32 {
    type MidstateRepr = u128;
    type CorrectionField = Fe1024;
    const ROOT_GENERATOR: Self::CorrectionField = Fe1024::new([Fe32::_9, Fe32::_9]);
    const ROOT_EXPONENTS: core::ops::RangeInclusive<usize> = 9..=16;

    const CHECKSUM_LENGTH: usize = 13;
    const CODE_LENGTH: usize = 93;
    // Copied from BIP-93
    const GENERATOR_SH: [u128; 5] = [
        0x19dc500ce73fde210,
        0x1bfae00def77fe529,
        0x1fbd920fffe7bee52,
        0x1739640bdeee3fdad,
        0x07729a039cfc75f5a,
    ];
    const TARGET_RESIDUE: u128 = 0x10ce0795c2fd1e62a;
}

static CORRECT: &[u8; 48] = b"ms10testsxxxxxxxxxxxxxxxxxxxxxxxxxx4nzvca9cmczlw";

fn do_test(data: &[u8]) {
    if data.is_empty() || data.len() % 2 == 1 {
        return;
    }

    let mut any_actual_errors = false;
    let mut e2t = 0;
    let mut erasures = Vec::with_capacity(CORRECT.len());
    // Start with a correct string
    let mut hrpstring = *CORRECT;
    // ..then mangle it
    let mut errors = HashMap::with_capacity(data.len() / 2);
    for sl in data.chunks_exact(2) {
        let idx = usize::from(sl[0]) & 0x7f;
        if idx >= CORRECT.len() - 3 {
            return;
        }
        let offs = match Fe32::try_from(sl[1]) {
            Ok(fe) => fe,
            Err(_) => return,
        };

        hrpstring[idx + 3] =
            (Fe32::from_char(hrpstring[idx + 3].into()).unwrap() + offs).to_char() as u8;

        if errors.insert(CORRECT.len() - (idx + 3) - 1, offs).is_some() {
            return;
        }
        if sl[0] & 0x80 == 0x80 {
            // We might push "dummy" errors which are erasures that aren't actually wrong.
            // If we do this too many times, we'll exceed the singleton bound so correction
            // will fail, but as long as we're within the bound everything should "work",
            // in the sense that there will be no crashes and the error corrector will
            // just yield an error with value Q.
            erasures.push(CORRECT.len() - (idx + 3) - 1);
            e2t += 1;
            if offs != Fe32::Q {
                any_actual_errors = true;
            }
        } else if offs != Fe32::Q {
            any_actual_errors = true;
            e2t += 2;
        }
    }
    // We need _some_ errors.
    if !any_actual_errors {
        return;
    }

    let s = unsafe { core::str::from_utf8_unchecked(&hrpstring) };
    let mut correct_ctx = CheckedHrpstring::new::<Codex32>(s)
        .unwrap_err()
        .correction_context::<Codex32>()
        .unwrap();

    correct_ctx.add_erasures(&erasures);

    let iter = correct_ctx.bch_errors();
    if e2t <= 8 {
        for (idx, fe) in iter.unwrap() {
            assert_eq!(errors.remove(&idx), Some(fe));
        }
        for val in errors.values() {
            assert_eq!(*val, Fe32::Q);
        }
    }
}

fn main() {
    loop {
        fuzz!(|data| {
            do_test(data);
        });
    }
}

#[cfg(test)]
mod tests {
    fn extend_vec_from_hex(hex: &str, out: &mut Vec<u8>) {
        let mut b = 0;
        for (idx, c) in hex.as_bytes().iter().filter(|&&c| c != b'\n').enumerate() {
            b <<= 4;
            match *c {
                b'A'..=b'F' => b |= c - b'A' + 10,
                b'a'..=b'f' => b |= c - b'a' + 10,
                b'0'..=b'9' => b |= c - b'0',
                _ => panic!("Bad hex"),
            }
            if (idx & 1) == 1 {
                out.push(b);
                b = 0;
            }
        }
    }

    #[test]
    fn duplicate_crash() {
        let mut a = Vec::new();
        extend_vec_from_hex("8c00a10091039e0185008000831f8e0f", &mut a);
        super::do_test(&a);
    }
}
2 changes: 1 addition & 1 deletion src/lib.rs
@@ -105,7 +105,7 @@
//!     type MidstateRepr = u128;
//!     type CorrectionField = bech32::primitives::gf32_ext::Fe32Ext<2>;
//!     const ROOT_GENERATOR: Self::CorrectionField = Fe1024::new([Fe32::_9, Fe32::_9]);
-//!     const ROOT_EXPONENTS: core::ops::RangeInclusive<usize> = 77..=84;
+//!     const ROOT_EXPONENTS: core::ops::RangeInclusive<usize> = 9..=16;
//!
//!     const CHECKSUM_LENGTH: usize = 13;
//!     const CODE_LENGTH: usize = 93;