From 8ddf42f570a699d53679b8e8ecf9512e95601ab3 Mon Sep 17 00:00:00 2001 From: Takumasa Sakao Date: Mon, 19 Aug 2024 17:27:13 +0900 Subject: [PATCH] Support ansicode and char width for tab --- src/app.rs | 8 ++-- src/bytes.rs | 127 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 97 insertions(+), 38 deletions(-) diff --git a/src/app.rs b/src/app.rs index 935852c..2501740 100644 --- a/src/app.rs +++ b/src/app.rs @@ -339,11 +339,11 @@ impl App { if let Some(record) = record { action_tx.send(Action::SetClock(record.start_time))?; let mut result = termtext::Converter::new(style) - .convert(&normalize_stdout(record.stdout.clone())); + .convert(&normalize_stdout(&record.stdout)); log::debug!("result: {:?}", result); if record.stdout.is_empty() { result = termtext::Converter::new(style) - .convert(&normalize_stdout(record.stderr.clone())); + .convert(&normalize_stdout(&record.stderr)); result.mark_text( 0, result.len(), @@ -357,7 +357,9 @@ impl App { let previous_record = self.store.get_record(previous_id)?; if let Some(previous_record) = previous_record { let previous_result = termtext::Converter::new(style) - .convert(&normalize_stdout(previous_record.stdout)); + .convert(&normalize_stdout( + &previous_record.stdout, + )); let previous_string = previous_result.plain_text(); if diff_mode == DiffMode::Add { diff_and_mark( diff --git a/src/bytes.rs b/src/bytes.rs index 76965eb..c3463b9 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,25 +1,88 @@ +use unicode_width::UnicodeWidthChar; + const TAB_SIZE: usize = 4; -pub fn normalize_stdout(b: Vec) -> Vec { +pub fn normalize_stdout(s: &[u8]) -> Vec { // Naively replace tabs ('\t') with at most `TAB_SIZE` spaces (' ') while // maintaining the alignment / elasticity per line (see tests below). - let mut b = b; - let (mut i, mut j) = (0, 0); // j tracks alignment - while i < b.len() { - if b[i] == b'\n' { - (i, j) = (i + 1, 0); - } else if b[i] == b'\t' { - b[i] = b' '; - let r = TAB_SIZE - (j % TAB_SIZE); - for _ in 1..r { - b.insert(i, b' '); + let str = String::from_utf8_lossy(s).to_string(); + let mut b = Vec::with_capacity(str.len() * TAB_SIZE); + let mut chars = str.chars(); + let mut width = 0; + while let Some(c) = chars.next() { + let count = skip_ansi_escape_sequence(c, &mut chars.clone()); + if count > 0 { + b.push(c); + for _ in 0..count { + b.push(chars.next().unwrap_or(' ')); } - (i, j) = (i + r, 0); + continue; + } + + if c == '\t' { + let r = TAB_SIZE - (width % TAB_SIZE); + b.resize(b.len() + r, ' '); + width += r; + } else if c == '\n' { + b.push('\n'); + width = 0; } else { - (i, j) = (i + 1, j + 1); + b.push(c); + width += c.width().unwrap_or(1); } } - b + b.into_iter().collect::().into_bytes() +} + +// Based on https://github.com/mgeisler/textwrap/blob/63970361d1d653ec8715acb931c3c109750d4a57/src/core.rs +/// The CSI or “Control Sequence Introducer” introduces an ANSI escape +/// sequence. This is typically used for colored text and will be +/// ignored when computing the text width. +const CSI: (char, char) = ('\x1b', '['); +/// The final bytes of an ANSI escape sequence must be in this range. +const ANSI_FINAL_BYTE: std::ops::RangeInclusive = '\x40'..='\x7e'; +/// Skip ANSI escape sequences. +/// +/// The `ch` is the current `char`, the `chars` provide the following +/// characters. The `chars` will be modified if `ch` is the start of +/// an ANSI escape sequence. +/// +/// Returns `usize` the count of skipped characters +fn skip_ansi_escape_sequence>(ch: char, chars: &mut I) -> usize { + let mut count = 0; + if ch != CSI.0 { + return 0; // Nothing to skip here. + } + + let next = chars.next(); + count += 1; + if next == Some(CSI.1) { + // We have found the start of an ANSI escape code, typically + // used for colored terminal text. We skip until we find a + // "final byte" in the range 0x40–0x7E. + for ch in chars { + count += 1; + if ANSI_FINAL_BYTE.contains(&ch) { + break; + } + } + } else if next == Some(']') { + // We have found the start of an Operating System Command, + // which extends until the next sequence "\x1b\\" (the String + // Terminator sequence) or the BEL character. The BEL + // character is non-standard, but it is still used quite + // often, for example, by GNU ls. + let mut last = ']'; + for new in chars { + count += 1; + if new == '\x07' || (new == '\\' && last == CSI.0) { + break; + } + last = new; + } + } + + count } mod test { @@ -27,33 +90,27 @@ mod test { #[test] fn test_normalize_stdout() { + assert_eq!(normalize_stdout(b"\t"), b" "); // Make sure we don't miss any tabs in edge cases. - assert_eq!( - normalize_stdout(b"\t\t\t\t\t".to_vec()), - b" ".to_vec() - ); + assert_eq!(normalize_stdout(b"\t\t\t\t\t"), b" "); // Make sure tab is elastic (from 1 space to TAB_SIZE spaces). - assert_eq!(normalize_stdout(b"\t12345".to_vec()), b" 12345".to_vec()); - assert_eq!(normalize_stdout(b"1\t2345".to_vec()), b"1 2345".to_vec()); - assert_eq!(normalize_stdout(b"12\t345".to_vec()), b"12 345".to_vec()); - assert_eq!(normalize_stdout(b"123\t45".to_vec()), b"123 45".to_vec()); - assert_eq!(normalize_stdout(b"1234\t5".to_vec()), b"1234 5".to_vec()); + assert_eq!(normalize_stdout(b"\t12345"), b" 12345"); + assert_eq!(normalize_stdout(b"1\t2345"), b"1 2345"); + assert_eq!(normalize_stdout(b"12\t345"), b"12 345"); + assert_eq!(normalize_stdout(b"123\t45"), b"123 45"); + assert_eq!(normalize_stdout(b"1234\t5"), b"1234 5"); // Make sure we reset alignment on new lines. + assert_eq!(normalize_stdout(b"123\t\n4\t5"), b"123 \n4 5"); + assert_eq!(normalize_stdout(b"12\t3\n4\t5"), b"12 3\n4 5"); + assert_eq!(normalize_stdout(b"1\t23\n4\t5"), b"1 23\n4 5"); + assert_eq!(normalize_stdout(b"\t123\n4\t5"), b" 123\n4 5"); assert_eq!( - normalize_stdout(b"123\t\n4\t5".to_vec()), - b"123 \n4 5".to_vec() - ); - assert_eq!( - normalize_stdout(b"12\t3\n4\t5".to_vec()), - b"12 3\n4 5".to_vec() - ); - assert_eq!( - normalize_stdout(b"1\t23\n4\t5".to_vec()), - b"1 23\n4 5".to_vec() + normalize_stdout("あ\tい\nう\tえ".as_bytes()), + "あ い\nう え".as_bytes() ); assert_eq!( - normalize_stdout(b"\t123\n4\t5".to_vec()), - b" 123\n4 5".to_vec() + normalize_stdout(b"\x1b[34ma\t\x1b[39mb\x1b[0m"), + b"\x1b[34ma \x1b[39mb\x1b[0m" ); } }