@@ -1008,36 +1008,30 @@ impl VerifiedInvoiceRequest {
1008
1008
}
1009
1009
}
1010
1010
1011
- /// `String.truncate(new_len)` panics if you split on a UTF-8 code point. This
1012
- /// function will instead truncate the string to the next smaller code point
1013
- /// boundary.
1011
+ /// `String::truncate(new_len)` panics if you split inside a UTF-8 code point,
1012
+ /// which would leave the `String` containing invalid UTF-8. This function will
1013
+ /// instead truncate the string to the next smaller code point boundary so the
1014
+ /// truncated string always remains valid UTF-8.
1014
1015
///
1015
1016
/// This can still split a grapheme cluster, but that's probably fine.
1016
1017
/// We'd otherwise have to pull in the `unicode-segmentation` crate and its big
1017
1018
/// unicode tables to find the next smaller grapheme cluster boundary.
1018
1019
fn string_truncate_safe ( mut s : String , new_len : usize ) -> String {
1019
- /// Returns true if a byte is the first byte of a UTF-8 code point sequence.
1020
- // TODO(phlip9): remove when std stabilizes `str::floor_char_boundary`.
1021
- #[ inline]
1022
- const fn u8_is_utf8_char_boundary ( b : u8 ) -> bool {
1023
- // This is bit magic equivalent to: b < 128 || b >= 192
1024
- ( b as i8 ) >= -0x40
1025
- }
1026
-
1027
/// Returns the largest byte index `x <= index` for which
/// `s.is_char_boundary(x)` holds. An `index` at or past the end of `s` is
/// clamped to `s.len()`.
// TODO(phlip9): remove when `std::str::floor_char_boundary` stabilizes.
#[inline]
fn str_floor_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        return len;
    }

    // UTF-8 code points are 1-4 bytes long, so the boundary we want lies in
    // [index - 3, index]; never scan below that floor.
    let floor = index.saturating_sub(3);
    let mut candidate = index;
    while candidate > floor && !s.is_char_boundary(candidate) {
        candidate -= 1;
    }
    candidate
}
1043
1037
@@ -1465,6 +1459,7 @@ mod tests {
1465
1459
use crate :: ln:: inbound_payment:: ExpandedKey ;
1466
1460
use crate :: ln:: msgs:: { DecodeError , MAX_VALUE_MSAT } ;
1467
1461
use crate :: offers:: invoice:: { Bolt12Invoice , SIGNATURE_TAG as INVOICE_SIGNATURE_TAG } ;
1462
+ use crate :: offers:: invoice_request:: string_truncate_safe;
1468
1463
use crate :: offers:: merkle:: { self , SignatureTlvStreamRef , TaggedHash , TlvStream } ;
1469
1464
use crate :: offers:: nonce:: Nonce ;
1470
1465
#[ cfg( not( c_bindings) ) ]
@@ -3026,4 +3021,31 @@ mod tests {
3026
3021
Err ( _) => panic ! ( "unexpected error" ) ,
3027
3022
}
3028
3023
}
3024
+
3025
#[test]
fn test_string_truncate_safe() {
    // Truncation must land on a UTF-8 code point boundary. The input is two
    // code points, each three bytes long:
    //   ❤        variation-selector
    //   e29da4   efb88f
    let heart = String::from("❤️");
    for new_len in 0..(heart.len() + 5) {
        let expected = if new_len >= heart.len() {
            // Nothing to cut off: the whole string survives.
            heart.as_str()
        } else if new_len >= 3 {
            // Only the first code point fits.
            "❤"
        } else {
            // Not even the first code point fits.
            ""
        };
        assert_eq!(expected, string_truncate_safe(heart.clone(), new_len));
    }

    // Every byte in an ASCII string is also a full UTF-8 code point, so the
    // result is a plain byte-prefix (or the whole string once `new_len`
    // reaches past the end, where `get` returns `None`).
    let ascii = String::from("my ASCII string!");
    for new_len in 0..(ascii.len() + 5) {
        let expected = ascii.get(..new_len).unwrap_or(&ascii);
        assert_eq!(expected, string_truncate_safe(ascii.clone(), new_len));
    }
}
3029
3051
}
0 commit comments