-
Notifications
You must be signed in to change notification settings - Fork 1.7k
new lint: char_indices_as_byte_indices
#13435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
use std::ops::ControlFlow; | ||
|
||
use clippy_utils::diagnostics::span_lint_hir_and_then; | ||
use clippy_utils::ty::is_type_lang_item; | ||
use clippy_utils::visitors::for_each_expr; | ||
use clippy_utils::{eq_expr_value, higher, path_to_local_id}; | ||
use rustc_errors::{Applicability, MultiSpan}; | ||
use rustc_hir::{Expr, ExprKind, LangItem, Node, Pat, PatKind}; | ||
use rustc_lint::LateContext; | ||
use rustc_middle::ty::Ty; | ||
use rustc_span::{Span, sym}; | ||
|
||
use super::CHAR_INDICES_AS_BYTE_INDICES; | ||
|
||
/// Names of the `str` methods this lint recognises as taking a `usize` **byte index**.
///
/// `String` has byte-index methods of its own, but every one of them takes `&mut self`.
/// They cannot be called while the `chars()` iterator still borrows the string, so they are
/// deliberately left out of this list.
const BYTE_INDEX_METHODS: &[&str] = &[
    // Char-boundary queries
    "is_char_boundary",
    "floor_char_boundary",
    "ceil_char_boundary",
    // Checked / unchecked element and range access
    "get",
    "index",
    "index_mut",
    "get_mut",
    "get_unchecked",
    "get_unchecked_mut",
    "slice_unchecked",
    "slice_mut_unchecked",
    // Splitting at a byte offset
    "split_at",
    "split_at_mut",
    "split_at_checked",
    "split_at_mut_checked",
];
|
||
const CONTINUE: ControlFlow<!, ()> = ControlFlow::Continue(()); | ||
|
||
pub(super) fn check<'tcx>(cx: &LateContext<'tcx>, pat: &Pat<'_>, iterable: &Expr<'_>, body: &'tcx Expr<'tcx>) { | ||
if let ExprKind::MethodCall(_, enumerate_recv, _, enumerate_span) = iterable.kind | ||
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(iterable.hir_id) | ||
&& cx.tcx.is_diagnostic_item(sym::enumerate_method, method_id) | ||
&& let ExprKind::MethodCall(_, chars_recv, _, chars_span) = enumerate_recv.kind | ||
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(enumerate_recv.hir_id) | ||
&& cx.tcx.is_diagnostic_item(sym::str_chars, method_id) | ||
{ | ||
if let PatKind::Tuple([pat, _], _) = pat.kind | ||
&& let PatKind::Binding(_, binding_id, ..) = pat.kind | ||
{ | ||
// Destructured iterator element `(idx, _)`, look for uses of the binding | ||
for_each_expr(cx, body, |expr| { | ||
if path_to_local_id(expr, binding_id) { | ||
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv); | ||
} | ||
CONTINUE | ||
}); | ||
} else if let PatKind::Binding(_, binding_id, ..) = pat.kind { | ||
// Bound as a tuple, look for `tup.0` | ||
for_each_expr(cx, body, |expr| { | ||
if let ExprKind::Field(e, field) = expr.kind | ||
&& path_to_local_id(e, binding_id) | ||
&& field.name == sym::integer(0) | ||
{ | ||
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv); | ||
} | ||
CONTINUE | ||
}); | ||
} | ||
} | ||
} | ||
|
||
fn check_index_usage<'tcx>( | ||
cx: &LateContext<'tcx>, | ||
expr: &'tcx Expr<'tcx>, | ||
pat: &Pat<'_>, | ||
enumerate_span: Span, | ||
chars_span: Span, | ||
chars_recv: &Expr<'_>, | ||
) { | ||
let Some(parent_expr) = index_consumed_at(cx, expr) else { | ||
return; | ||
}; | ||
|
||
let is_string_like = |ty: Ty<'_>| ty.is_str() || is_type_lang_item(cx, ty, LangItem::String); | ||
let message = match parent_expr.kind { | ||
ExprKind::MethodCall(segment, recv, ..) | ||
// We currently only lint `str` methods (which `String` can deref to), so a `.is_str()` check is sufficient here | ||
// (contrary to the `ExprKind::Index` case which needs to handle both with `is_string_like` because `String` implements | ||
// `Index` directly and no deref to `str` would happen in that case). | ||
if cx.typeck_results().expr_ty_adjusted(recv).peel_refs().is_str() | ||
&& BYTE_INDEX_METHODS.contains(&segment.ident.name.as_str()) | ||
&& eq_expr_value(cx, chars_recv, recv) => | ||
{ | ||
"passing a character position to a method that expects a byte index" | ||
}, | ||
ExprKind::Index(target, ..) | ||
if is_string_like(cx.typeck_results().expr_ty_adjusted(target).peel_refs()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I think the reason this isn't used in both arms is a bit non-obvious at first, especially if the comment on |
||
&& eq_expr_value(cx, chars_recv, target) => | ||
{ | ||
"indexing into a string with a character position where a byte index is expected" | ||
}, | ||
_ => return, | ||
}; | ||
|
||
span_lint_hir_and_then( | ||
cx, | ||
CHAR_INDICES_AS_BYTE_INDICES, | ||
expr.hir_id, | ||
expr.span, | ||
message, | ||
|diag| { | ||
diag.note("a character can take up more than one byte, so they are not interchangeable") | ||
.span_note( | ||
MultiSpan::from_spans(vec![pat.span, enumerate_span]), | ||
"position comes from the enumerate iterator", | ||
) | ||
.span_suggestion_verbose( | ||
chars_span.to(enumerate_span), | ||
"consider using `.char_indices()` instead", | ||
"char_indices()", | ||
Applicability::MaybeIncorrect, | ||
); | ||
}, | ||
); | ||
} | ||
|
||
/// Returns the expression which ultimately consumes the index. | ||
/// This is usually the parent expression, i.e. `.split_at(idx)` for `idx`, | ||
/// but for `.get(..idx)` we want to consider the method call the consuming expression, | ||
/// which requires skipping past the range expression. | ||
fn index_consumed_at<'tcx>(cx: &LateContext<'tcx>, expr: &'tcx Expr<'tcx>) -> Option<&'tcx Expr<'tcx>> { | ||
for (_, node) in cx.tcx.hir_parent_iter(expr.hir_id) { | ||
match node { | ||
Node::Expr(expr) if higher::Range::hir(expr).is_some() => {}, | ||
Node::ExprField(_) => {}, | ||
Node::Expr(expr) => return Some(expr), | ||
_ => break, | ||
} | ||
} | ||
None | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
mod char_indices_as_byte_indices; | ||
mod empty_loop; | ||
mod explicit_counter_loop; | ||
mod explicit_into_iter_loop; | ||
|
@@ -740,6 +741,49 @@ declare_clippy_lint! { | |
"manually filling a slice with a value" | ||
} | ||
|
||
declare_clippy_lint! { | ||
/// ### What it does | ||
/// Checks for usage of a character position yielded by `.chars().enumerate()` in a context where a **byte index** is expected, | ||
/// such as an argument to a specific `str` method or indexing into a `str` or `String`. | ||
/// | ||
/// ### Why is this bad? | ||
/// A character (more specifically, a Unicode scalar value) that is yielded by `str::chars` can take up multiple bytes, | ||
/// so a character position does not necessarily have the same byte index at which the character is stored. | ||
/// Thus, using the character position where a byte index is expected can unexpectedly return wrong values | ||
/// or panic when the string consists of multibyte characters. | ||
/// | ||
/// For example, the character `a` in `äa` is stored at byte index 2 but has the character position 1. | ||
/// Using the character position 1 to index into the string will lead to a panic as it is in the middle of the first character. | ||
/// | ||
/// Instead of `.chars().enumerate()`, the correct iterator to use is `.char_indices()`, which yields byte indices. | ||
/// | ||
/// This pattern is technically fine if the strings are known to only use the ASCII subset, | ||
/// though in those cases it would be better to use `bytes()` directly to make the intent clearer, | ||
/// but there is also no downside to just using `.char_indices()` directly and supporting non-ASCII strings. | ||
/// | ||
/// You may also want to read the [chapter on strings in the Rust Book](https://doc.rust-lang.org/book/ch08-02-strings.html) | ||
/// which goes into this in more detail. | ||
/// | ||
/// ### Example | ||
/// ```no_run | ||
/// # let s = "..."; | ||
/// for (idx, c) in s.chars().enumerate() { | ||
/// let _ = s[idx..]; // ⚠️ Panics for strings consisting of multibyte characters | ||
/// } | ||
/// ``` | ||
/// Use instead: | ||
/// ```no_run | ||
/// # let s = "..."; | ||
/// for (idx, c) in s.char_indices() { | ||
/// let _ = s[idx..]; | ||
/// } | ||
/// ``` | ||
#[clippy::version = "1.83.0"] | ||
pub CHAR_INDICES_AS_BYTE_INDICES, | ||
correctness, | ||
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The pattern is technically fine if you know what your strings are (like the description mentions) so it's not always 'outright wrong' like the usual correctness lints, but the fix is also really simple and always applicable so 🤷♂️ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As said on Zulip, we could say that |
||
"using the character position yielded by `.chars().enumerate()` in a context where a byte index is expected" | ||
} | ||
|
||
pub struct Loops { | ||
msrv: Msrv, | ||
enforce_iter_loop_reborrow: bool, | ||
|
@@ -777,6 +821,7 @@ impl_lint_pass!(Loops => [ | |
UNUSED_ENUMERATE_INDEX, | ||
INFINITE_LOOP, | ||
MANUAL_SLICE_FILL, | ||
CHAR_INDICES_AS_BYTE_INDICES, | ||
]); | ||
|
||
impl<'tcx> LateLintPass<'tcx> for Loops { | ||
|
@@ -860,6 +905,7 @@ impl Loops { | |
manual_flatten::check(cx, pat, arg, body, span, self.msrv); | ||
manual_find::check(cx, pat, arg, body, span, expr); | ||
unused_enumerate_index::check(cx, pat, arg, body); | ||
char_indices_as_byte_indices::check(cx, pat, arg, body); | ||
} | ||
|
||
fn check_for_loop_arg(&self, cx: &LateContext<'_>, _: &Pat<'_>, arg: &Expr<'_>) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#![feature(round_char_boundary)] | ||
#![warn(clippy::char_indices_as_byte_indices)] | ||
|
||
trait StrExt { | ||
fn use_index(&self, _: usize); | ||
} | ||
impl StrExt for str { | ||
fn use_index(&self, _: usize) {} | ||
} | ||
|
||
fn bad(prim: &str, string: String) { | ||
for (idx, _) in prim.char_indices() { | ||
let _ = prim[..idx]; | ||
//~^ char_indices_as_byte_indices | ||
prim.split_at(idx); | ||
//~^ char_indices_as_byte_indices | ||
|
||
// This won't panic, but it can still return a wrong substring | ||
let _ = prim[..prim.floor_char_boundary(idx)]; | ||
//~^ char_indices_as_byte_indices | ||
|
||
// can't use #[expect] here because the .fixed file will still have the attribute and create an | ||
// unfulfilled expectation, but make sure lint level attributes work on the use expression: | ||
#[allow(clippy::char_indices_as_byte_indices)] | ||
let _ = prim[..idx]; | ||
} | ||
|
||
for c in prim.char_indices() { | ||
let _ = prim[..c.0]; | ||
//~^ char_indices_as_byte_indices | ||
prim.split_at(c.0); | ||
//~^ char_indices_as_byte_indices | ||
} | ||
|
||
for (idx, _) in string.char_indices() { | ||
let _ = string[..idx]; | ||
//~^ char_indices_as_byte_indices | ||
string.split_at(idx); | ||
//~^ char_indices_as_byte_indices | ||
} | ||
} | ||
|
||
fn good(prim: &str, prim2: &str) { | ||
for (idx, _) in prim.chars().enumerate() { | ||
// Indexing into a different string | ||
let _ = prim2[..idx]; | ||
|
||
// Unknown use | ||
std::hint::black_box(idx); | ||
|
||
// Method call to user defined extension trait | ||
prim.use_index(idx); | ||
|
||
// str method taking a usize that doesn't represent a byte index | ||
prim.splitn(idx, prim2); | ||
} | ||
|
||
let mut string = "äa".to_owned(); | ||
for (idx, _) in string.clone().chars().enumerate() { | ||
// Even though the receiver is the same expression, it should not be treated as the same value. | ||
string.clone().remove(idx); | ||
} | ||
} | ||
|
||
fn main() {} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#![feature(round_char_boundary)] | ||
#![warn(clippy::char_indices_as_byte_indices)] | ||
|
||
trait StrExt { | ||
fn use_index(&self, _: usize); | ||
} | ||
impl StrExt for str { | ||
fn use_index(&self, _: usize) {} | ||
} | ||
|
||
fn bad(prim: &str, string: String) { | ||
for (idx, _) in prim.chars().enumerate() { | ||
let _ = prim[..idx]; | ||
//~^ char_indices_as_byte_indices | ||
prim.split_at(idx); | ||
//~^ char_indices_as_byte_indices | ||
|
||
// This won't panic, but it can still return a wrong substring | ||
let _ = prim[..prim.floor_char_boundary(idx)]; | ||
//~^ char_indices_as_byte_indices | ||
|
||
// can't use #[expect] here because the .fixed file will still have the attribute and create an | ||
// unfulfilled expectation, but make sure lint level attributes work on the use expression: | ||
#[allow(clippy::char_indices_as_byte_indices)] | ||
let _ = prim[..idx]; | ||
} | ||
|
||
for c in prim.chars().enumerate() { | ||
let _ = prim[..c.0]; | ||
//~^ char_indices_as_byte_indices | ||
prim.split_at(c.0); | ||
//~^ char_indices_as_byte_indices | ||
} | ||
|
||
for (idx, _) in string.chars().enumerate() { | ||
let _ = string[..idx]; | ||
//~^ char_indices_as_byte_indices | ||
string.split_at(idx); | ||
//~^ char_indices_as_byte_indices | ||
} | ||
} | ||
|
||
fn good(prim: &str, prim2: &str) { | ||
for (idx, _) in prim.chars().enumerate() { | ||
// Indexing into a different string | ||
let _ = prim2[..idx]; | ||
|
||
// Unknown use | ||
std::hint::black_box(idx); | ||
|
||
// Method call to user defined extension trait | ||
prim.use_index(idx); | ||
|
||
// str method taking a usize that doesn't represent a byte index | ||
prim.splitn(idx, prim2); | ||
} | ||
|
||
let mut string = "äa".to_owned(); | ||
for (idx, _) in string.clone().chars().enumerate() { | ||
// Even though the receiver is the same expression, it should not be treated as the same value. | ||
string.clone().remove(idx); | ||
} | ||
} | ||
|
||
fn main() {} |
Uh oh!
There was an error while loading. Please reload this page.