[lldb] Implement ANSI & Unicode aware string stripping & padding #130878

JDevlieghere · 2025-03-12T02:01:17Z

This PR implements a Unicode- and ANSI-escape-code-aware function to trim and pad strings. It was split out of #121860.

llvmbot · 2025-03-12T02:01:51Z

@llvm/pr-subscribers-lldb

Author: Jonas Devlieghere (JDevlieghere)

Changes

This PR implements a Unicode- and ANSI-escape-code-aware function to trim and pad strings. It was split out of #121860.

Full diff: https://github.com/llvm/llvm-project/pull/130878.diff

2 Files Affected:

(modified) lldb/include/lldb/Utility/AnsiTerminal.h (+87-14)
(modified) lldb/unittests/Utility/AnsiTerminalTest.cpp (+49)

diff --git a/lldb/include/lldb/Utility/AnsiTerminal.h b/lldb/include/lldb/Utility/AnsiTerminal.h
index 1939c49c7b859..b388f9fdf4441 100644
--- a/lldb/include/lldb/Utility/AnsiTerminal.h
+++ b/lldb/include/lldb/Utility/AnsiTerminal.h
@@ -70,9 +70,12 @@
 #define ANSI_1_CTRL(ctrl1) "\033["##ctrl1 ANSI_ESC_END
 #define ANSI_2_CTRL(ctrl1, ctrl2) "\033["##ctrl1 ";"##ctrl2 ANSI_ESC_END
 
+#define ANSI_ESC_START_LEN 2
+
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Locale.h"
 
 #include <string>
 
@@ -172,28 +175,98 @@ inline std::string FormatAnsiTerminalCodes(llvm::StringRef format,
   return fmt;
 }
 
+inline std::tuple<llvm::StringRef, llvm::StringRef, llvm::StringRef>
+FindNextAnsiSequence(llvm::StringRef str) {
+  llvm::StringRef left;
+  llvm::StringRef right = str;
+
+  while (!right.empty()) {
+    const size_t start = right.find(ANSI_ESC_START);
+
+    // ANSI_ESC_START not found.
+    if (start == llvm::StringRef::npos)
+      return {str, {}, {}};
+
+    // Split the string around the current ANSI_ESC_START.
+    left = str.substr(0, left.size() + start);
+    llvm::StringRef escape = right.substr(start);
+    right = right.substr(start + ANSI_ESC_START_LEN + 1);
+
+    const size_t end = right.find_first_not_of("0123456789;");
+
+    // ANSI_ESC_END found.
+    if (end < right.size() && (right[end] == 'm' || right[end] == 'G'))
+      return {left, escape.take_front(ANSI_ESC_START_LEN + 1 + end + 1),
+              right.substr(end + 1)};
+
+    // Maintain the invariant that str == left + right at the start of the loop.
+    left = str.substr(0, left.size() + ANSI_ESC_START_LEN + 1);
+  }
+
+  return {str, {}, {}};
+}
+
 inline std::string StripAnsiTerminalCodes(llvm::StringRef str) {
   std::string stripped;
   while (!str.empty()) {
-    llvm::StringRef left, right;
-
-    std::tie(left, right) = str.split(ANSI_ESC_START);
+    auto [left, escape, right] = FindNextAnsiSequence(str);
     stripped += left;
+    str = right;
+  }
+  return stripped;
+}
 
-    // ANSI_ESC_START not found.
-    if (left == str && right.empty())
-      break;
+inline std::string TrimAndPad(llvm::StringRef str, size_t visible_length,
+                              char padding = ' ') {
+  std::string result;
+  size_t result_visibile_length = 0;
+
+  // Trim the string to the given visible length.
+  while (!str.empty()) {
+    auto [left, escape, right] = FindNextAnsiSequence(str);
+    str = right;
 
-    size_t end = right.find_first_not_of("0123456789;");
-    if (end < right.size() && (right[end] == 'm' || right[end] == 'G')) {
-      str = right.substr(end + 1);
-    } else {
-      // ANSI_ESC_END not found.
-      stripped += ANSI_ESC_START;
-      str = right;
+    // Compute the length of the string without escape codes. If it fits, append
+    // it together with the invisible escape code.
+    size_t column_width = llvm::sys::locale::columnWidth(left);
+    if (result_visibile_length + column_width <= visible_length) {
+      result.append(left).append(escape);
+      result_visibile_length += column_width;
+      continue;
+    }
+
+    // The string doesn't fit but doesn't contain unicode.
+    // Append the substring that fits.
+    if (column_width == left.size()) {
+      llvm::StringRef trimmed =
+          left.take_front(visible_length - result_visibile_length);
+      result.append(trimmed);
+      result_visibile_length += visible_length - result_visibile_length;
+      continue;
+    }
+
+    // The string doesn't fit but contains unicode. Repeatedly trim the string
+    // until it fits.
+    llvm::StringRef trimmed = left;
+    while (!trimmed.empty()) {
+      // This relies on columnWidth returning -2 for invalid/partial unicode
+      // characters, which after conversion to size_t will be larger than the
+      // visible width.
+      column_width = llvm::sys::locale::columnWidth(trimmed);
+      if (result_visibile_length + column_width <= visible_length) {
+        result.append(trimmed);
+        result_visibile_length += column_width;
+        break;
+      }
+      trimmed = trimmed.drop_back();
     }
   }
-  return stripped;
+
+  // Pad the string.
+  if (result_visibile_length < visible_length)
+    result.append(visible_length - result_visibile_length, padding);
+
+  return result;
 }
 
 } // namespace ansi
diff --git a/lldb/unittests/Utility/AnsiTerminalTest.cpp b/lldb/unittests/Utility/AnsiTerminalTest.cpp
index 1ba9565c3f6af..cef73ffaf9136 100644
--- a/lldb/unittests/Utility/AnsiTerminalTest.cpp
+++ b/lldb/unittests/Utility/AnsiTerminalTest.cpp
@@ -67,3 +67,52 @@ TEST(AnsiTerminal, InvalidEscapeCode) {
   EXPECT_EQ("abc\x1B[31kabcabc",
             ansi::StripAnsiTerminalCodes("abc\x1B[31kabc\x1B[0mabc"));
 }
+
+TEST(AnsiTerminal, FindNextAnsiSequenceBasic) {
+  auto [left, escape, right] = ansi::FindNextAnsiSequence("foo\x1B[31mbar");
+  EXPECT_EQ("foo", left);
+  EXPECT_EQ("\x1B[31m", escape);
+  EXPECT_EQ("bar", right);
+}
+
+TEST(AnsiTerminal, FindNextAnsiSequenceIncompleteStart) {
+  auto [left, escape, right] =
+      ansi::FindNextAnsiSequence("foo\x1B[bar\x1B[31mbaz");
+  EXPECT_EQ("foo\x1B[bar", left);
+  EXPECT_EQ("\x1B[31m", escape);
+  EXPECT_EQ("baz", right);
+}
+
+TEST(AnsiTerminal, FindNextAnsiSequenceEscapeStart) {
+  auto [left, escape, right] = ansi::FindNextAnsiSequence("\x1B[31mfoo");
+  EXPECT_EQ("", left);
+  EXPECT_EQ("\x1B[31m", escape);
+  EXPECT_EQ("foo", right);
+}
+
+TEST(AnsiTerminal, TrimAndPad) {
+  // Test basic ASCII.
+  EXPECT_EQ("     ", ansi::TrimAndPad("", 5));
+  EXPECT_EQ("foo  ", ansi::TrimAndPad("foo", 5));
+  EXPECT_EQ("fooba", ansi::TrimAndPad("fooba", 5));
+  EXPECT_EQ("fooba", ansi::TrimAndPad("foobar", 5));
+
+  // Simple test that ANSI escape codes don't contribute to the visible width.
+  EXPECT_EQ("\x1B[30m     ", ansi::TrimAndPad("\x1B[30m", 5));
+  EXPECT_EQ("\x1B[30mfoo  ", ansi::TrimAndPad("\x1B[30mfoo", 5));
+  EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfooba", 5));
+  EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfoobar", 5));
+
+  // Test that we include as many escape codes as we can.
+  EXPECT_EQ("fooba\x1B[30m", ansi::TrimAndPad("fooba\x1B[30m", 5));
+  EXPECT_EQ("fooba\x1B[30m\x1B[34m",
+            ansi::TrimAndPad("fooba\x1B[30m\x1B[34m", 5));
+  EXPECT_EQ("fooba\x1B[30m\x1B[34m",
+            ansi::TrimAndPad("fooba\x1B[30m\x1B[34mr", 5));
+
+  // Test Unicode.
+  EXPECT_EQ("❤️    ", ansi::TrimAndPad("❤️", 5));
+  EXPECT_EQ("    ❤️", ansi::TrimAndPad("    ❤️", 5));
+  EXPECT_EQ("12❤️4❤️", ansi::TrimAndPad("12❤️4❤️", 5));
+  EXPECT_EQ("12❤️45", ansi::TrimAndPad("12❤️45❤️", 5));
+}

This PR implements a unicode and ANSI escape code aware function to trim and pad strings. This is a break-out from llvm#121860.

labath · 2025-03-12T11:54:26Z

lldb/include/lldb/Utility/AnsiTerminal.h

+    // The string doesn't fit but doesn't contain unicode.
+    // Append the substring that fits.
+    if (column_width == left.size()) {
+      llvm::StringRef trimmed =
+          left.take_front(visible_length - result_visibile_length);
+      result.append(trimmed);
+      result_visibile_length += visible_length - result_visibile_length;
+      continue;
+    }


I think this optimization is not correct because of Unicode characters that take up two columns. If such a character is encoded using two bytes, the `column_width == left.size()` check will pass, but it is not legal to split the character in half.

Good point, I didn't consider that.

[lldb] Implement ANSI & Unicode aware string stripping & padding (llvm#130878) This PR implements a unicode and ANSI escape code aware function to trim and pad strings. This is a break-out from llvm#121860.

[lldb] Implement ANSI & Unicode aware string stripping & padding (llvm#130878) This PR implements a unicode and ANSI escape code aware function to trim and pad strings. This is a break-out from llvm#121860. (cherry picked from commit 78c9fa3)

JDevlieghere requested review from DavidSpickett and labath March 12, 2025 02:01

llvmbot added the lldb label Mar 12, 2025

JDevlieghere mentioned this pull request Mar 12, 2025

[lldb] Implement a statusline in LLDB #121860

Merged

[lldb] Implement ANSI & Unicode aware string stripping & padding

e50c6ae

This PR implements a unicode and ANSI escape code aware function to trim and pad strings. This is a break-out from llvm#121860.

JDevlieghere force-pushed the ansi-unicode-fun branch from f628780 to e50c6ae Compare March 12, 2025 04:49

labath approved these changes Mar 12, 2025

View reviewed changes

Remove optimization as it's not safe

9ef0086

JDevlieghere merged commit 78c9fa3 into llvm:main Mar 12, 2025
8 of 9 checks passed

JDevlieghere deleted the ansi-unicode-fun branch March 12, 2025 17:20

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[lldb] Implement ANSI & Unicode aware string stripping & padding #130878

[lldb] Implement ANSI & Unicode aware string stripping & padding #130878

Uh oh!

JDevlieghere commented Mar 12, 2025

Uh oh!

llvmbot commented Mar 12, 2025

Uh oh!

labath Mar 12, 2025

Uh oh!

JDevlieghere Mar 12, 2025

Uh oh!

Uh oh!

Uh oh!

[lldb] Implement ANSI & Unicode aware string stripping & padding #130878

[lldb] Implement ANSI & Unicode aware string stripping & padding #130878

Uh oh!

Conversation

JDevlieghere commented Mar 12, 2025

Uh oh!

llvmbot commented Mar 12, 2025

Uh oh!

labath Mar 12, 2025

Choose a reason for hiding this comment

Uh oh!

JDevlieghere Mar 12, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!