Skip to content

new lint: char_indices_as_byte_indices #13435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5516,6 +5516,7 @@ Released 2018-09-13
[`cast_slice_different_sizes`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_different_sizes
[`cast_slice_from_raw_parts`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_from_raw_parts
[`cfg_not_test`]: https://rust-lang.github.io/rust-clippy/master/index.html#cfg_not_test
[`char_indices_as_byte_indices`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_indices_as_byte_indices
[`char_lit_as_u8`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_lit_as_u8
[`chars_last_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_last_cmp
[`chars_next_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_next_cmp
Expand Down
1 change: 1 addition & 0 deletions clippy_lints/src/declared_lints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ pub static LINTS: &[&crate::LintInfo] = &[
crate::literal_representation::UNREADABLE_LITERAL_INFO,
crate::literal_representation::UNUSUAL_BYTE_GROUPINGS_INFO,
crate::literal_string_with_formatting_args::LITERAL_STRING_WITH_FORMATTING_ARGS_INFO,
crate::loops::CHAR_INDICES_AS_BYTE_INDICES_INFO,
crate::loops::EMPTY_LOOP_INFO,
crate::loops::EXPLICIT_COUNTER_LOOP_INFO,
crate::loops::EXPLICIT_INTO_ITER_LOOP_INFO,
Expand Down
141 changes: 141 additions & 0 deletions clippy_lints/src/loops/char_indices_as_byte_indices.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
use std::ops::ControlFlow;

use clippy_utils::diagnostics::span_lint_hir_and_then;
use clippy_utils::ty::is_type_lang_item;
use clippy_utils::visitors::for_each_expr;
use clippy_utils::{eq_expr_value, higher, path_to_local_id};
use rustc_errors::{Applicability, MultiSpan};
use rustc_hir::{Expr, ExprKind, LangItem, Node, Pat, PatKind};
use rustc_lint::LateContext;
use rustc_middle::ty::Ty;
use rustc_span::{Span, sym};

use super::CHAR_INDICES_AS_BYTE_INDICES;

// The list of `str` methods we want to lint that have a `usize` argument representing a byte index.
// Note: `String` also has methods that work with byte indices,
// but they all take `&mut self` and aren't worth considering since the user couldn't have called
// them while the chars iterator is live anyway.
const BYTE_INDEX_METHODS: &[&str] = &[
"is_char_boundary",
"floor_char_boundary",
"ceil_char_boundary",
"get",
"index",
"index_mut",
"get_mut",
"get_unchecked",
"get_unchecked_mut",
"slice_unchecked",
"slice_mut_unchecked",
"split_at",
"split_at_mut",
"split_at_checked",
"split_at_mut_checked",
];

const CONTINUE: ControlFlow<!, ()> = ControlFlow::Continue(());

pub(super) fn check<'tcx>(cx: &LateContext<'tcx>, pat: &Pat<'_>, iterable: &Expr<'_>, body: &'tcx Expr<'tcx>) {
if let ExprKind::MethodCall(_, enumerate_recv, _, enumerate_span) = iterable.kind
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(iterable.hir_id)
&& cx.tcx.is_diagnostic_item(sym::enumerate_method, method_id)
&& let ExprKind::MethodCall(_, chars_recv, _, chars_span) = enumerate_recv.kind
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(enumerate_recv.hir_id)
&& cx.tcx.is_diagnostic_item(sym::str_chars, method_id)
{
if let PatKind::Tuple([pat, _], _) = pat.kind
&& let PatKind::Binding(_, binding_id, ..) = pat.kind
{
// Destructured iterator element `(idx, _)`, look for uses of the binding
for_each_expr(cx, body, |expr| {
if path_to_local_id(expr, binding_id) {
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
}
CONTINUE
});
} else if let PatKind::Binding(_, binding_id, ..) = pat.kind {
// Bound as a tuple, look for `tup.0`
for_each_expr(cx, body, |expr| {
if let ExprKind::Field(e, field) = expr.kind
&& path_to_local_id(e, binding_id)
&& field.name == sym::integer(0)
{
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
}
CONTINUE
});
}
}
}

fn check_index_usage<'tcx>(
cx: &LateContext<'tcx>,
expr: &'tcx Expr<'tcx>,
pat: &Pat<'_>,
enumerate_span: Span,
chars_span: Span,
chars_recv: &Expr<'_>,
) {
let Some(parent_expr) = index_consumed_at(cx, expr) else {
return;
};

let is_string_like = |ty: Ty<'_>| ty.is_str() || is_type_lang_item(cx, ty, LangItem::String);
let message = match parent_expr.kind {
ExprKind::MethodCall(segment, recv, ..)
// We currently only lint `str` methods (which `String` can deref to), so a `.is_str()` check is sufficient here
// (contrary to the `ExprKind::Index` case which needs to handle both with `is_string_like` because `String` implements
// `Index` directly and no deref to `str` would happen in that case).
if cx.typeck_results().expr_ty_adjusted(recv).peel_refs().is_str()
&& BYTE_INDEX_METHODS.contains(&segment.ident.name.as_str())
&& eq_expr_value(cx, chars_recv, recv) =>
{
"passing a character position to a method that expects a byte index"
},
ExprKind::Index(target, ..)
if is_string_like(cx.typeck_results().expr_ty_adjusted(target).peel_refs())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I think the reason this isn't used in both arms is a bit non-obvious at first, especially if the comment on BYTE_INDEX_METHODS isn't seen. It could perhaps just point to what's already written there. That's about all.

&& eq_expr_value(cx, chars_recv, target) =>
{
"indexing into a string with a character position where a byte index is expected"
},
_ => return,
};

span_lint_hir_and_then(
cx,
CHAR_INDICES_AS_BYTE_INDICES,
expr.hir_id,
expr.span,
message,
|diag| {
diag.note("a character can take up more than one byte, so they are not interchangeable")
.span_note(
MultiSpan::from_spans(vec![pat.span, enumerate_span]),
"position comes from the enumerate iterator",
)
.span_suggestion_verbose(
chars_span.to(enumerate_span),
"consider using `.char_indices()` instead",
"char_indices()",
Applicability::MaybeIncorrect,
);
},
);
}

/// Returns the expression which ultimately consumes the index.
/// This is usually the parent expression, i.e. `.split_at(idx)` for `idx`,
/// but for `.get(..idx)` we want to consider the method call the consuming expression,
/// which requires skipping past the range expression.
fn index_consumed_at<'tcx>(cx: &LateContext<'tcx>, expr: &'tcx Expr<'tcx>) -> Option<&'tcx Expr<'tcx>> {
for (_, node) in cx.tcx.hir_parent_iter(expr.hir_id) {
match node {
Node::Expr(expr) if higher::Range::hir(expr).is_some() => {},
Node::ExprField(_) => {},
Node::Expr(expr) => return Some(expr),
_ => break,
}
}
None
}
46 changes: 46 additions & 0 deletions clippy_lints/src/loops/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod char_indices_as_byte_indices;
mod empty_loop;
mod explicit_counter_loop;
mod explicit_into_iter_loop;
Expand Down Expand Up @@ -740,6 +741,49 @@ declare_clippy_lint! {
"manually filling a slice with a value"
}

declare_clippy_lint! {
/// ### What it does
/// Checks for usage of a character position yielded by `.chars().enumerate()` in a context where a **byte index** is expected,
/// such as an argument to a specific `str` method or indexing into a `str` or `String`.
///
/// ### Why is this bad?
/// A character (more specifically, a Unicode scalar value) that is yielded by `str::chars` can take up multiple bytes,
/// so a character position does not necessarily have the same byte index at which the character is stored.
/// Thus, using the character position where a byte index is expected can unexpectedly return wrong values
/// or panic when the string consists of multibyte characters.
///
/// For example, the character `a` in `äa` is stored at byte index 2 but has the character position 1.
/// Using the character position 1 to index into the string will lead to a panic as it is in the middle of the first character.
///
/// Instead of `.chars().enumerate()`, the correct iterator to use is `.char_indices()`, which yields byte indices.
///
/// This pattern is technically fine if the strings are known to only use the ASCII subset,
/// though in those cases it would be better to use `bytes()` directly to make the intent clearer,
/// but there is also no downside to just using `.char_indices()` directly and supporting non-ASCII strings.
///
/// You may also want to read the [chapter on strings in the Rust Book](https://doc.rust-lang.org/book/ch08-02-strings.html)
/// which goes into this in more detail.
///
/// ### Example
/// ```no_run
/// # let s = "...";
/// for (idx, c) in s.chars().enumerate() {
/// let _ = s[idx..]; // ⚠️ Panics for strings consisting of multibyte characters
/// }
/// ```
/// Use instead:
/// ```no_run
/// # let s = "...";
/// for (idx, c) in s.char_indices() {
/// let _ = s[idx..];
/// }
/// ```
#[clippy::version = "1.83.0"]
pub CHAR_INDICES_AS_BYTE_INDICES,
correctness,
Copy link
Member Author

@y21 y21 Sep 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pattern is technically fine if you know what your strings are (like the description mentions) so it's not always 'outright wrong' like the usual correctness lints, but the fix is also really simple and always applicable so 🤷‍♂️

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As said on Zulip, we could say that bytes should always be used as a replacement if it expects ASCII. I don't think there are any downsides to that. I think blocking compilation on those cases is better than not doing so for where it could actually be a problem.

"using the character position yielded by `.chars().enumerate()` in a context where a byte index is expected"
}

pub struct Loops {
msrv: Msrv,
enforce_iter_loop_reborrow: bool,
Expand Down Expand Up @@ -777,6 +821,7 @@ impl_lint_pass!(Loops => [
UNUSED_ENUMERATE_INDEX,
INFINITE_LOOP,
MANUAL_SLICE_FILL,
CHAR_INDICES_AS_BYTE_INDICES,
]);

impl<'tcx> LateLintPass<'tcx> for Loops {
Expand Down Expand Up @@ -860,6 +905,7 @@ impl Loops {
manual_flatten::check(cx, pat, arg, body, span, self.msrv);
manual_find::check(cx, pat, arg, body, span, expr);
unused_enumerate_index::check(cx, pat, arg, body);
char_indices_as_byte_indices::check(cx, pat, arg, body);
}

fn check_for_loop_arg(&self, cx: &LateContext<'_>, _: &Pat<'_>, arg: &Expr<'_>) {
Expand Down
65 changes: 65 additions & 0 deletions tests/ui/char_indices_as_byte_indices.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#![feature(round_char_boundary)]
#![warn(clippy::char_indices_as_byte_indices)]

trait StrExt {
fn use_index(&self, _: usize);
}
impl StrExt for str {
fn use_index(&self, _: usize) {}
}

fn bad(prim: &str, string: String) {
for (idx, _) in prim.char_indices() {
let _ = prim[..idx];
//~^ char_indices_as_byte_indices
prim.split_at(idx);
//~^ char_indices_as_byte_indices

// This won't panic, but it can still return a wrong substring
let _ = prim[..prim.floor_char_boundary(idx)];
//~^ char_indices_as_byte_indices

// can't use #[expect] here because the .fixed file will still have the attribute and create an
// unfulfilled expectation, but make sure lint level attributes work on the use expression:
#[allow(clippy::char_indices_as_byte_indices)]
let _ = prim[..idx];
}

for c in prim.char_indices() {
let _ = prim[..c.0];
//~^ char_indices_as_byte_indices
prim.split_at(c.0);
//~^ char_indices_as_byte_indices
}

for (idx, _) in string.char_indices() {
let _ = string[..idx];
//~^ char_indices_as_byte_indices
string.split_at(idx);
//~^ char_indices_as_byte_indices
}
}

fn good(prim: &str, prim2: &str) {
for (idx, _) in prim.chars().enumerate() {
// Indexing into a different string
let _ = prim2[..idx];

// Unknown use
std::hint::black_box(idx);

// Method call to user defined extension trait
prim.use_index(idx);

// str method taking a usize that doesn't represent a byte index
prim.splitn(idx, prim2);
}

let mut string = "äa".to_owned();
for (idx, _) in string.clone().chars().enumerate() {
// Even though the receiver is the same expression, it should not be treated as the same value.
string.clone().remove(idx);
}
}

fn main() {}
65 changes: 65 additions & 0 deletions tests/ui/char_indices_as_byte_indices.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#![feature(round_char_boundary)]
#![warn(clippy::char_indices_as_byte_indices)]

trait StrExt {
fn use_index(&self, _: usize);
}
impl StrExt for str {
fn use_index(&self, _: usize) {}
}

fn bad(prim: &str, string: String) {
for (idx, _) in prim.chars().enumerate() {
let _ = prim[..idx];
//~^ char_indices_as_byte_indices
prim.split_at(idx);
//~^ char_indices_as_byte_indices

// This won't panic, but it can still return a wrong substring
let _ = prim[..prim.floor_char_boundary(idx)];
//~^ char_indices_as_byte_indices

// can't use #[expect] here because the .fixed file will still have the attribute and create an
// unfulfilled expectation, but make sure lint level attributes work on the use expression:
#[allow(clippy::char_indices_as_byte_indices)]
let _ = prim[..idx];
}

for c in prim.chars().enumerate() {
let _ = prim[..c.0];
//~^ char_indices_as_byte_indices
prim.split_at(c.0);
//~^ char_indices_as_byte_indices
}

for (idx, _) in string.chars().enumerate() {
let _ = string[..idx];
//~^ char_indices_as_byte_indices
string.split_at(idx);
//~^ char_indices_as_byte_indices
}
}

fn good(prim: &str, prim2: &str) {
for (idx, _) in prim.chars().enumerate() {
// Indexing into a different string
let _ = prim2[..idx];

// Unknown use
std::hint::black_box(idx);

// Method call to user defined extension trait
prim.use_index(idx);

// str method taking a usize that doesn't represent a byte index
prim.splitn(idx, prim2);
}

let mut string = "äa".to_owned();
for (idx, _) in string.clone().chars().enumerate() {
// Even though the receiver is the same expression, it should not be treated as the same value.
string.clone().remove(idx);
}
}

fn main() {}
Loading