From 8dd719f8499b23cca84fe599f5e4843df32a5feb Mon Sep 17 00:00:00 2001
From: Henner Zeller <hzeller@google.com>
Date: Wed, 27 Oct 2021 16:07:46 +0200
Subject: [PATCH] Output token columns to be _character_ not _byte_-offset in
 line.

Take multi-byte characters into account.

Remove the non-descript operator() in LineColumnMap and replace
with GetLineColAtOffset(). Also add such a convenience method
in text view to reduce abstraction leakage. Ideally, we would
actually never have to deal with 'raw' LineColumnMaps inside
code, only with attached text view.

The GetLineColAtOffset() needs the original text to correctly
count UTF8-characters in the line in question; this should
probably be replaced with either
  * remembering the original string-view from constructing
    the LineColumnMap (but CAVE: lifetime issues)
  * or maybe a sub-structure that remembers multi-byte
    offset within a line.
For now, this is not a big deal as the original string-view
always is available.

Fixes #1047 #1059

Signed-off-by: Henner Zeller <hzeller@google.com>
---
 common/analysis/file_analyzer.cc         | 14 +++++-----
 common/analysis/lint_rule_status.cc      |  7 +++--
 common/analysis/lint_rule_status_test.cc | 19 ++++++++++++++
 common/analysis/lint_waiver.cc           |  8 +++---
 common/strings/BUILD                     |  5 +++-
 common/strings/line_column_map.cc        | 22 +++++++++++-----
 common/strings/line_column_map.h         | 13 ++++++++--
 common/strings/line_column_map_test.cc   | 14 +++++++---
 common/text/text_structure.h             |  6 ++++-
 verilog/analysis/verilog_linter.cc       |  2 +-
 verilog/formatting/formatter.cc          |  7 +++--
 verilog/tools/ls/verible-lsp-adapter.cc  | 33 ++++++++++++------------
 12 files changed, 104 insertions(+), 46 deletions(-)
diff --git a/common/analysis/file_analyzer.cc b/common/analysis/file_analyzer.cc
index b44956b3d..96216a301 100644
--- a/common/analysis/file_analyzer.cc
+++ b/common/analysis/file_analyzer.cc
@@ -104,13 +104,12 @@ absl::Status FileAnalyzer::Parse(Parser* parser) {
 std::string FileAnalyzer::TokenErrorMessage(
     const TokenInfo& error_token) const {
   // TODO(fangism): accept a RejectedToken to get an explanation message.
-  const LineColumnMap& line_column_map = Data().GetLineColumnMap();
   const absl::string_view base_text = Data().Contents();
   std::ostringstream output_stream;
   if (!error_token.isEOF()) {
     // TODO(hzeller): simply print LineColumnRange ?
-    const auto left = line_column_map(error_token.left(base_text));
-    auto right = line_column_map(error_token.right(base_text));
+    const auto left = Data().GetLineColAtOffset(error_token.left(base_text));
+    auto right = Data().GetLineColAtOffset(error_token.right(base_text));
     --right.column;  // Point to last character, not one-past-the-end.
     output_stream << "token: \"" << error_token.text() << "\" at " << left;
     if (left.line == right.line) {
@@ -124,7 +123,7 @@ std::string FileAnalyzer::TokenErrorMessage(
       output_stream << '-' << right;
     }
   } else {
-    const auto end = line_column_map(base_text.length());
+    const auto end = Data().GetLineColAtOffset(base_text.length());
     output_stream << "token: <<EOF>> at " << end;
   }
   return output_stream.str();
@@ -142,17 +141,16 @@ std::vector<std::string> FileAnalyzer::TokenErrorMessages() const {
 void FileAnalyzer::ExtractLinterTokenErrorDetail(
     const RejectedToken& error_token,
     const ReportLinterErrorFunction& error_report) const {
-  const LineColumnMap& line_column_map = Data().GetLineColumnMap();
   const absl::string_view base_text = Data().Contents();
 
   const LineColumn start =
       error_token.token_info.isEOF()
-          ? line_column_map(base_text.length())
-          : line_column_map(error_token.token_info.left(base_text));
+          ? Data().GetLineColAtOffset(base_text.length())
+          : Data().GetLineColAtOffset(error_token.token_info.left(base_text));
   const LineColumn end =
       error_token.token_info.isEOF()
           ? start
-          : line_column_map(error_token.token_info.right(base_text));
+          : Data().GetLineColAtOffset(error_token.token_info.right(base_text));
 
   absl::string_view context_line = "";
   const auto& lines = Data().Lines();
diff --git a/common/analysis/lint_rule_status.cc b/common/analysis/lint_rule_status.cc
index b0b3e8635..da332bcd4 100644
--- a/common/analysis/lint_rule_status.cc
+++ b/common/analysis/lint_rule_status.cc
@@ -125,7 +125,8 @@ void LintStatusFormatter::FormatLintRuleStatuses(
       *stream << " (autofix available)";
     }
     *stream << std::endl;
-    auto cursor = line_column_map_(violation.violation->token.left(base));
+    auto cursor = line_column_map_.GetLineColAtOffset(
+        base, violation.violation->token.left(base));
     if (cursor.line < static_cast<int>(lines.size())) {
       *stream << lines[cursor.line] << std::endl;
       *stream << verible::Spacer(cursor.column) << "^" << std::endl;
@@ -143,7 +144,9 @@ void LintStatusFormatter::FormatViolation(std::ostream* stream,
                                           absl::string_view rule_name) const {
   // TODO(fangism): Use the context member to print which named construct or
   // design element the violation appears in (or full stack thereof).
-  (*stream) << path << ':' << line_column_map_(violation.token.left(base))
+  (*stream) << path << ':'
+            << line_column_map_.GetLineColAtOffset(base,
+                                                   violation.token.left(base))
             << ": " << violation.reason << ' ' << url << " [" << rule_name
             << ']';
 }
diff --git a/common/analysis/lint_rule_status_test.cc b/common/analysis/lint_rule_status_test.cc
index 38c6c1729..46b710af1 100644
--- a/common/analysis/lint_rule_status_test.cc
+++ b/common/analysis/lint_rule_status_test.cc
@@ -238,6 +238,25 @@ TEST(LintRuleStatusFormatterTestWithContext, MultipleStatusesSimpleOutput) {
   RunLintStatusesTest(test, true);
 }
 
+TEST(LintRuleStatusFormatterTestWithContext, PointToCorrectUtf8Char) {
+  SymbolPtr root = Node();
+  static const int dont_care_tag = 0;
+  constexpr absl::string_view text("äöüß\n");
+  //                                ^ä^ü
+  LintStatusTest test = {
+      "rule",
+      "URL",
+      "some/file.sv",
+      text,
+      {{"reason1", TokenInfo(dont_care_tag, text.substr(0, 2)),
+        "some/file.sv:1:1: reason1 URL "
+        "[rule]\näöüß\n"},
+       {"reason2", TokenInfo(dont_care_tag, text.substr(strlen("äö"), 2)),
+        "some/file.sv:1:3: reason2 URL "
+        "[rule]\näöüß\n  "}}};
+  RunLintStatusesTest(test, true);
+}
+
 TEST(AutoFixTest, ValidUseCases) {
   //                                       0123456789abcdef
   static constexpr absl::string_view text("This is an image");
diff --git a/common/analysis/lint_waiver.cc b/common/analysis/lint_waiver.cc
index f2f1e9812..914cd1ac3 100644
--- a/common/analysis/lint_waiver.cc
+++ b/common/analysis/lint_waiver.cc
@@ -69,7 +69,7 @@ void LintWaiver::RegexToLines(absl::string_view contents,
       for (std::cregex_iterator i(contents.begin(), contents.end(), *re);
            i != std::cregex_iterator(); i++) {
         std::cmatch match = *i;
-        WaiveOneLine(rule.first, line_map(match.position()).line);
+        WaiveOneLine(rule.first, line_map.LineAtOffset(match.position()));
       }
     }
   }
@@ -251,7 +251,8 @@ static absl::Status WaiveCommandHandler(
   LineColumn regex_token_pos = {};
 
   for (const auto& token : tokens) {
-    token_pos = line_map(token.left(waive_content));
+    token_pos =
+        line_map.GetLineColAtOffset(waive_content, token.left(waive_content));
 
     switch (token.token_enum()) {
       case CFG_TK_COMMAND:
@@ -434,7 +435,8 @@ absl::Status LintWaiverBuilder::ApplyExternalWaivers(
   for (const auto c_range : commands) {
     const auto command = make_container_range(c_range.begin(), c_range.end());
 
-    command_pos = line_map(command.begin()->left(waivers_config_content));
+    command_pos = line_map.GetLineColAtOffset(
+        waivers_config_content, command.begin()->left(waivers_config_content));
 
     if (command[0].token_enum() == CFG_TK_COMMENT) {
       continue;
diff --git a/common/strings/BUILD b/common/strings/BUILD
index 6b1cda162..258c9ce4e 100644
--- a/common/strings/BUILD
+++ b/common/strings/BUILD
@@ -282,7 +282,10 @@ cc_library(
         "//verilog/analysis:__pkg__",
         "//verilog/formatting:__pkg__",
     ],
-    deps = ["@com_google_absl//absl/strings"],
+    deps = [
+       "utf8",
+       "@com_google_absl//absl/strings"
+    ],
 )
 
 cc_test(
diff --git a/common/strings/line_column_map.cc b/common/strings/line_column_map.cc
index bbb8c9d46..ce9e49775 100644
--- a/common/strings/line_column_map.cc
+++ b/common/strings/line_column_map.cc
@@ -22,6 +22,7 @@
 #include <vector>
 
 #include "absl/strings/string_view.h"
+#include "common/strings/utf8.h"
 
 namespace verible {
 
@@ -71,16 +72,23 @@ LineColumnMap::LineColumnMap(const std::vector<absl::string_view>& lines) {
   }
 }
 
-// Translate byte-offset into line-column.
-// Byte offsets beyond the end-of-file will return an unspecified result.
-LineColumn LineColumnMap::operator()(int offset) const {
+LineColumn LineColumnMap::GetLineColAtOffset(absl::string_view base,
+                                             int bytes_offset) const {
   const auto begin = beginning_of_line_offsets_.begin();
   const auto end = beginning_of_line_offsets_.end();
-  const auto base = std::upper_bound(begin, end, offset) - 1;
   // std::upper_bound is a binary search.
-  const int line_number = std::distance(begin, base);
-  const int column = offset - *base;
-  return LineColumn{line_number, column};
+  const auto line_at_offset = std::upper_bound(begin, end, bytes_offset) - 1;
+  const int line_number = std::distance(begin, line_at_offset);
+  const int len_within_line = bytes_offset - *line_at_offset;
+  absl::string_view line = base.substr(*line_at_offset, len_within_line);
+  return LineColumn{line_number, utf8_len(line)};
 }
 
+int LineColumnMap::LineAtOffset(int bytes_offset) const {
+  const auto begin = beginning_of_line_offsets_.begin();
+  const auto end = beginning_of_line_offsets_.end();
+  // std::upper_bound is a binary search.
+  const auto line_at_offset = std::upper_bound(begin, end, bytes_offset) - 1;
+  return std::distance(begin, line_at_offset);
+}
 }  // namespace verible
diff --git a/common/strings/line_column_map.h b/common/strings/line_column_map.h
index db9aa18be..dcf8e8c98 100644
--- a/common/strings/line_column_map.h
+++ b/common/strings/line_column_map.h
@@ -75,8 +75,17 @@ class LineColumnMap {
     return beginning_of_line_offsets_[index];
   }
 
-  // Translate byte-offset into line and column.
-  LineColumn operator()(int bytes_offset) const;
+  // Get line number at the given byte offset.
+  int LineAtOffset(int bytes_offset) const;
+
+  // Get line and column at the given offset. The column takes multi-byte
+  // encodings into account, so the column represents the true character not
+  // simply the byte-offset within the line.
+  //
+  // TODO(hzeller): technically, we don't need the base as we already got it
+  // in the constructor, but change separately after lifetime questions have
+  // been considered.
+  LineColumn GetLineColAtOffset(absl::string_view base, int bytes_offset) const;
 
   const std::vector<int>& GetBeginningOfLineOffsets() const {
     return beginning_of_line_offsets_;
diff --git a/common/strings/line_column_map_test.cc b/common/strings/line_column_map_test.cc
index 7d7d049d7..718c4f66b 100644
--- a/common/strings/line_column_map_test.cc
+++ b/common/strings/line_column_map_test.cc
@@ -49,8 +49,9 @@ struct LineColumnMapTestData {
 
 // Raw line and column are 0-indexed.
 const LineColumnMapTestData map_test_data[] = {
-    {"", {0}, {{0, {0, 0}}, {1, {0, 1}}}},   // empty file
-    {"_", {0}, {{0, {0, 0}}, {1, {0, 1}}}},  // no \n before EOF
+    // Also testing beyond the end of the file - it should return last position
+    {"", {0}, {{0, {0, 0}}, {1, {0, 0}}, {100, {0, 0}}}},  // empty file
+    {"_", {0}, {{0, {0, 0}}, {1, {0, 1}}}},                // no \n before EOF
     {"abc", {0}, {{0, {0, 0}}, {2, {0, 2}}, {3, {0, 3}}}},
     {"\n", {0, 1}, {{0, {0, 0}}, {1, {1, 0}}}},  // one empty line
     {"\n\n", {0, 1, 2}, {{0, {0, 0}}, {1, {1, 0}}, {2, {2, 0}}}},
@@ -60,6 +61,12 @@ const LineColumnMapTestData map_test_data[] = {
     {"hello\ndarkness\nmy old friend\n",
      {0, 6, 15, 29},
      {{0, {0, 0}}, {10, {1, 4}}, {15, {2, 0}}, {20, {2, 5}}}},
+    // Multi-byte characters. Let's use strlen() to count the bytes, the
+    // column should accurately point to the character.
+    {"😀😀😀", {0}, {{static_cast<int>(2 * strlen("😀")), {0, 2}}}},
+    {"Heizölrückstoßabdämpfung",
+     {0},
+     {{static_cast<int>(strlen("Heizölrückstoß")), {0, 14}}}},
 };
 
 // Test test verifies that line-column offset appear to the user correctly.
@@ -163,7 +170,8 @@ TEST(LineColumnMapTest, Lookup) {
   for (const auto& test_case : map_test_data) {
     const LineColumnMap line_map(test_case.text);
     for (const auto& q : test_case.queries) {
-      EXPECT_EQ(q.line_col, line_map(q.offset))
+      EXPECT_EQ(q.line_col,
+                line_map.GetLineColAtOffset(test_case.text, q.offset))
           << "Text: \"" << test_case.text << "\"\n"
           << "Failed testing offset " << q.offset;
     }
diff --git a/common/text/text_structure.h b/common/text/text_structure.h
index a8147e402..7f2a1492f 100644
--- a/common/text/text_structure.h
+++ b/common/text/text_structure.h
@@ -95,6 +95,10 @@ class TextStructureView {
 
   const LineColumnMap& GetLineColumnMap() const { return line_column_map_; }
 
+  LineColumn GetLineColAtOffset(int bytes_offset) const {
+    return line_column_map_.GetLineColAtOffset(contents_, bytes_offset);
+  }
+
   const std::vector<TokenSequence::const_iterator>& GetLineTokenMap() const {
     return line_token_map_;
   }
@@ -201,7 +205,7 @@ class TextStructureView {
   // the contents_ string view.
   absl::Status SyntaxTreeConsistencyCheck() const;
 
-private:
+ private:
   void RecalculateLineColumnMap() {
     line_column_map_ = LineColumnMap(contents_);
   }
diff --git a/verilog/analysis/verilog_linter.cc b/verilog/analysis/verilog_linter.cc
index a14ef968e..bb9e3a7c9 100644
--- a/verilog/analysis/verilog_linter.cc
+++ b/verilog/analysis/verilog_linter.cc
@@ -481,7 +481,7 @@ static void AppendLintRuleStatuses(
           [&](const verible::LintViolation& violation) {
             // Lookup the line number on which the offending token resides.
             const size_t offset = violation.token.left(text_base);
-            const size_t line = line_map(offset).line;
+            const size_t line = line_map.LineAtOffset(offset);
             // Check that line number against the set of waived lines.
             const bool waived =
                 LintWaiver::LineNumberSetContains(*waived_lines, line);
diff --git a/verilog/formatting/formatter.cc b/verilog/formatting/formatter.cc
index 7ad31d225..babc2babc 100644
--- a/verilog/formatting/formatter.cc
+++ b/verilog/formatting/formatter.cc
@@ -484,7 +484,8 @@ static void PrintLargestPartitions(
     stream << hline << "\n[" << partition->Size() << " tokens";
     if (!partition->IsEmpty()) {
       stream << ", starting at line:col "
-             << line_column_map(
+             << line_column_map.GetLineColAtOffset(
+                    base_text,
                     partition->TokensRange().front().token->left(base_text));
     }
     stream << "]: " << *partition << std::endl;
@@ -616,7 +617,9 @@ class ContinuationCommentAligner {
  private:
   int GetTokenColumn(const verible::TokenInfo* token) {
     CHECK_NOTNULL(token);
-    const int column = line_column_map_(token->left(base_text_)).column;
+    const int column =
+        line_column_map_.GetLineColAtOffset(base_text_, token->left(base_text_))
+            .column;
     CHECK_GE(column, 0);
     return column;
   }
diff --git a/verilog/tools/ls/verible-lsp-adapter.cc b/verilog/tools/ls/verible-lsp-adapter.cc
index b02da9d4b..c5e5212f6 100644
--- a/verilog/tools/ls/verible-lsp-adapter.cc
+++ b/verilog/tools/ls/verible-lsp-adapter.cc
@@ -25,11 +25,13 @@
 namespace verilog {
 // Convert our representation of a linter violation to a LSP-Diagnostic
 static verible::lsp::Diagnostic ViolationToDiagnostic(
-    const verilog::LintViolationWithStatus &v, absl::string_view base,
-    const verible::LineColumnMap &lc_map) {
+    const verilog::LintViolationWithStatus &v,
+    const verible::TextStructureView &text) {
   const verible::LintViolation &violation = *v.violation;
-  verible::LineColumn start = lc_map(violation.token.left(base));
-  verible::LineColumn end = lc_map(violation.token.right(base));
+  verible::LineColumn start =
+      text.GetLineColAtOffset(violation.token.left(text.Contents()));
+  verible::LineColumn end =
+      text.GetLineColAtOffset(violation.token.right(text.Contents()));
   const char *fix_msg = violation.autofixes.empty() ? "" : " (fix available)";
   return verible::lsp::Diagnostic{
       .range =
@@ -81,24 +83,24 @@ std::vector<verible::lsp::Diagnostic> CreateDiagnostics(
     if (--remaining <= 0) break;
   }
 
-  const absl::string_view base = current->parser().Data().Contents();
-  const verible::LineColumnMap& line_column_map = current->parser().Data().GetLineColumnMap();
   for (const auto &v : lint_violations) {
-    result.emplace_back(ViolationToDiagnostic(v, base, line_column_map));
+    result.emplace_back(ViolationToDiagnostic(v, current->parser().Data()));
     if (--remaining <= 0) break;
   }
   return result;
 }
 
 static std::vector<verible::lsp::TextEdit> AutofixToTextEdits(
-    const verible::AutoFix &fix, absl::string_view base,
-    const verible::LineColumnMap &lc_map) {
+    const verible::AutoFix &fix, const verible::TextStructureView &text) {
   std::vector<verible::lsp::TextEdit> result;
   // TODO(hzeller): figure out if edits are stacking or are all based
   // on the same start status.
+  const absl::string_view base = text.Contents();
   for (const verible::ReplacementEdit &edit : fix.Edits()) {
-    verible::LineColumn start = lc_map(edit.fragment.begin() - base.begin());
-    verible::LineColumn end = lc_map(edit.fragment.end() - base.begin());
+    verible::LineColumn start =
+        text.GetLineColAtOffset(edit.fragment.begin() - base.begin());
+    verible::LineColumn end =
+        text.GetLineColAtOffset(edit.fragment.end() - base.begin());
     result.emplace_back(verible::lsp::TextEdit{
         .range =
             {
@@ -122,13 +124,12 @@ std::vector<verible::lsp::CodeAction> GenerateLinterCodeActions(
       verilog::GetSortedViolations(current->lint_result());
   if (lint_violations.empty()) return result;
 
-  const absl::string_view base = current->parser().Data().Contents();
-  const verible::LineColumnMap& line_column_map = current->parser().Data().GetLineColumnMap();
+  const verible::TextStructureView &text = current->parser().Data();
 
   for (const auto &v : lint_violations) {
     const verible::LintViolation &violation = *v.violation;
     if (violation.autofixes.empty()) continue;
-    auto diagnostic = ViolationToDiagnostic(v, base, line_column_map);
+    auto diagnostic = ViolationToDiagnostic(v, text);
 
     // The editor usually has the cursor on a line or word, so we
     // only want to output edits that are relevant.
@@ -144,8 +145,8 @@ std::vector<verible::lsp::CodeAction> GenerateLinterCodeActions(
           // The following is translated from json, map uri -> edits.
           // We're only sending changes for one document, the current one.
           .edit = {.changes = {{p.textDocument.uri,
-                                AutofixToTextEdits(fix, base,
-                                                   line_column_map)}}},
+                                AutofixToTextEdits(fix,
+                                                   current->parser().Data())}}},
       });
       preferred_fix = false;  // only the first is preferred.
     }