Skip to content

Commit 78c9fa3

Browse files
authored
[lldb] Implement ANSI & Unicode aware string stripping & padding (llvm#130878)
This PR implements a Unicode- and ANSI-escape-code-aware function to trim and pad strings. It is broken out from llvm#121860.
1 parent 6030936 commit 78c9fa3

File tree

2 files changed

+127
-14
lines changed

2 files changed

+127
-14
lines changed

lldb/include/lldb/Utility/AnsiTerminal.h

Lines changed: 78 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,12 @@
7070
#define ANSI_1_CTRL(ctrl1) "\033["##ctrl1 ANSI_ESC_END
7171
#define ANSI_2_CTRL(ctrl1, ctrl2) "\033["##ctrl1 ";"##ctrl2 ANSI_ESC_END
7272

73+
#define ANSI_ESC_START_LEN 2
74+
7375
#include "llvm/ADT/ArrayRef.h"
7476
#include "llvm/ADT/STLExtras.h"
7577
#include "llvm/ADT/StringRef.h"
78+
#include "llvm/Support/Locale.h"
7679

7780
#include <string>
7881

@@ -172,28 +175,89 @@ inline std::string FormatAnsiTerminalCodes(llvm::StringRef format,
172175
return fmt;
173176
}
174177

178+
inline std::tuple<llvm::StringRef, llvm::StringRef, llvm::StringRef>
179+
FindNextAnsiSequence(llvm::StringRef str) {
180+
llvm::StringRef left;
181+
llvm::StringRef right = str;
182+
183+
while (!right.empty()) {
184+
const size_t start = right.find(ANSI_ESC_START);
185+
186+
// ANSI_ESC_START not found.
187+
if (start == llvm::StringRef::npos)
188+
return {str, {}, {}};
189+
190+
// Split the string around the current ANSI_ESC_START.
191+
left = str.take_front(left.size() + start);
192+
llvm::StringRef escape = right.substr(start);
193+
right = right.substr(start + ANSI_ESC_START_LEN + 1);
194+
195+
const size_t end = right.find_first_not_of("0123456789;");
196+
197+
// ANSI_ESC_END found.
198+
if (end < right.size() && (right[end] == 'm' || right[end] == 'G'))
199+
return {left, escape.take_front(ANSI_ESC_START_LEN + 1 + end + 1),
200+
right.substr(end + 1)};
201+
202+
// Maintain the invariant that str == left + right at the start of the loop.
203+
left = str.take_front(left.size() + ANSI_ESC_START_LEN + 1);
204+
}
205+
206+
return {str, {}, {}};
207+
}
208+
175209
inline std::string StripAnsiTerminalCodes(llvm::StringRef str) {
176210
std::string stripped;
177211
while (!str.empty()) {
178-
llvm::StringRef left, right;
179-
180-
std::tie(left, right) = str.split(ANSI_ESC_START);
212+
auto [left, escape, right] = FindNextAnsiSequence(str);
181213
stripped += left;
214+
str = right;
215+
}
216+
return stripped;
217+
}
182218

183-
// ANSI_ESC_START not found.
184-
if (left == str && right.empty())
185-
break;
219+
inline std::string TrimAndPad(llvm::StringRef str, size_t visible_length,
220+
char padding = ' ') {
221+
std::string result;
222+
result.reserve(visible_length);
223+
size_t result_visibile_length = 0;
186224

187-
size_t end = right.find_first_not_of("0123456789;");
188-
if (end < right.size() && (right[end] == 'm' || right[end] == 'G')) {
189-
str = right.substr(end + 1);
190-
} else {
191-
// ANSI_ESC_END not found.
192-
stripped += ANSI_ESC_START;
193-
str = right;
225+
// Trim the string to the given visible length.
226+
while (!str.empty()) {
227+
auto [left, escape, right] = FindNextAnsiSequence(str);
228+
str = right;
229+
230+
// Compute the length of the string without escape codes. If it fits, append
231+
// it together with the invisible escape code.
232+
size_t column_width = llvm::sys::locale::columnWidth(left);
233+
if (result_visibile_length + column_width <= visible_length) {
234+
result.append(left).append(escape);
235+
result_visibile_length += column_width;
236+
continue;
237+
}
238+
239+
// The string might contain Unicode, which means it's not safe to truncate at an arbitrary byte.
240+
// Repeatedly trim the string until it is valid Unicode and fits.
241+
llvm::StringRef trimmed = left;
242+
while (!trimmed.empty()) {
243+
// This relies on columnWidth returning -2 for invalid/partial unicode
244+
// characters, which after conversion to size_t will be larger than the
245+
// visible width.
246+
column_width = llvm::sys::locale::columnWidth(trimmed);
247+
if (result_visibile_length + column_width <= visible_length) {
248+
result.append(trimmed);
249+
result_visibile_length += column_width;
250+
break;
251+
}
252+
trimmed = trimmed.drop_back();
194253
}
195254
}
196-
return stripped;
255+
256+
// Pad the string.
257+
if (result_visibile_length < visible_length)
258+
result.append(visible_length - result_visibile_length, padding);
259+
260+
return result;
197261
}
198262

199263
} // namespace ansi

lldb/unittests/Utility/AnsiTerminalTest.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,52 @@ TEST(AnsiTerminal, InvalidEscapeCode) {
6767
EXPECT_EQ("abc\x1B[31kabcabc",
6868
ansi::StripAnsiTerminalCodes("abc\x1B[31kabc\x1B[0mabc"));
6969
}
70+
71+
TEST(AnsiTerminal, FindNextAnsiSequenceBasic) {
72+
auto [left, escape, right] = ansi::FindNextAnsiSequence("foo\x1B[31mbar");
73+
EXPECT_EQ("foo", left);
74+
EXPECT_EQ("\x1B[31m", escape);
75+
EXPECT_EQ("bar", right);
76+
}
77+
78+
TEST(AnsiTerminal, FindNextAnsiSequenceIncompleteStart) {
79+
auto [left, escape, right] =
80+
ansi::FindNextAnsiSequence("foo\x1B[bar\x1B[31mbaz");
81+
EXPECT_EQ("foo\x1B[bar", left);
82+
EXPECT_EQ("\x1B[31m", escape);
83+
EXPECT_EQ("baz", right);
84+
}
85+
86+
TEST(AnsiTerminal, FindNextAnsiSequenceEscapeStart) {
87+
auto [left, escape, right] = ansi::FindNextAnsiSequence("\x1B[31mfoo");
88+
EXPECT_EQ("", left);
89+
EXPECT_EQ("\x1B[31m", escape);
90+
EXPECT_EQ("foo", right);
91+
}
92+
93+
TEST(AnsiTerminal, TrimAndPad) {
94+
// Test basic ASCII.
95+
EXPECT_EQ(" ", ansi::TrimAndPad("", 5));
96+
EXPECT_EQ("foo ", ansi::TrimAndPad("foo", 5));
97+
EXPECT_EQ("fooba", ansi::TrimAndPad("fooba", 5));
98+
EXPECT_EQ("fooba", ansi::TrimAndPad("foobar", 5));
99+
100+
// Simple test that ANSI escape codes don't contribute to the visible width.
101+
EXPECT_EQ("\x1B[30m ", ansi::TrimAndPad("\x1B[30m", 5));
102+
EXPECT_EQ("\x1B[30mfoo ", ansi::TrimAndPad("\x1B[30mfoo", 5));
103+
EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfooba", 5));
104+
EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfoobar", 5));
105+
106+
// Test that we include as many escape codes as we can.
107+
EXPECT_EQ("fooba\x1B[30m", ansi::TrimAndPad("fooba\x1B[30m", 5));
108+
EXPECT_EQ("fooba\x1B[30m\x1B[34m",
109+
ansi::TrimAndPad("fooba\x1B[30m\x1B[34m", 5));
110+
EXPECT_EQ("fooba\x1B[30m\x1B[34m",
111+
ansi::TrimAndPad("fooba\x1B[30m\x1B[34mr", 5));
112+
113+
// Test Unicode.
114+
EXPECT_EQ("❤️ ", ansi::TrimAndPad("❤️", 5));
115+
EXPECT_EQ(" ❤️", ansi::TrimAndPad(" ❤️", 5));
116+
EXPECT_EQ("12❤️4❤️", ansi::TrimAndPad("12❤️4❤️", 5));
117+
EXPECT_EQ("12❤️45", ansi::TrimAndPad("12❤️45❤️", 5));
118+
}

0 commit comments

Comments
 (0)