Skip to content

Commit 0fbdeef

Browse files
RUST-2023 Add wrapper type for utf-8 lossy deserialization (#497)
1 parent 8e0fb3b commit 0fbdeef

File tree

7 files changed

+111
-4
lines changed

7 files changed

+111
-4
lines changed

src/de/raw.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use crate::{
1818
RAW_BSON_NEWTYPE,
1919
RAW_DOCUMENT_NEWTYPE,
2020
},
21-
serde_helpers::HUMAN_READABLE_NEWTYPE,
21+
serde_helpers::{HUMAN_READABLE_NEWTYPE, UTF8_LOSSY_NEWTYPE},
2222
spec::{BinarySubtype, ElementType},
2323
uuid::UUID_NEWTYPE_NAME,
2424
DateTime,
@@ -297,6 +297,11 @@ impl<'de> serde::de::Deserializer<'de> for Deserializer<'de> {
297297
inner.options.human_readable = true;
298298
visitor.visit_newtype_struct(inner)
299299
}
300+
UTF8_LOSSY_NEWTYPE => {
301+
let mut inner = self;
302+
inner.options.utf8_lossy = true;
303+
visitor.visit_newtype_struct(inner)
304+
}
300305
_ => visitor.visit_newtype_struct(self),
301306
}
302307
}

src/document.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,7 @@ impl Document {
599599
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
600600
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
601601
/// For most use cases, `Document::from_reader` can be used instead.
602+
#[deprecated = "use bson::serde_helpers::Utf8LossyDeserialization"]
602603
pub fn from_reader_utf8_lossy<R: Read>(mut reader: R) -> crate::de::Result<Document> {
603604
Self::decode(&mut reader, true)
604605
}

src/lib.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,9 +295,7 @@ pub use self::{
295295
from_document,
296296
from_document_with_options,
297297
from_reader,
298-
from_reader_utf8_lossy,
299298
from_slice,
300-
from_slice_utf8_lossy,
301299
Deserializer,
302300
DeserializerOptions,
303301
},
@@ -328,6 +326,9 @@ pub use self::{
328326
uuid::{Uuid, UuidRepresentation},
329327
};
330328

329+
#[allow(deprecated)]
330+
pub use self::de::{from_reader_utf8_lossy, from_slice_utf8_lossy,};
331+
331332
#[macro_use]
332333
mod macros;
333334
pub mod binary;

src/serde_helpers.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -886,3 +886,46 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for HumanReadable<T> {
886886
deserializer.deserialize_newtype_struct(HUMAN_READABLE_NEWTYPE, V(PhantomData))
887887
}
888888
}
889+
890+
/// Wrapper type for deserializing BSON bytes with invalid UTF-8 sequences.
891+
///
892+
/// Any invalid UTF-8 sequences in strings within the wrapped type will be replaced with the Unicode
893+
/// replacement character. This wrapper type only has an effect when deserializing from BSON bytes.
894+
///
895+
/// This wrapper type has no impact on serialization. Serializing a `Utf8LossyDeserialization<T>`
896+
/// will call the `serialize` method for the wrapped `T`.
897+
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
898+
pub struct Utf8LossyDeserialization<T>(pub T);
899+
900+
pub(crate) const UTF8_LOSSY_NEWTYPE: &str = "$__bson_private_utf8_lossy";
901+
902+
impl<T: Serialize> Serialize for Utf8LossyDeserialization<T> {
903+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
904+
where
905+
S: Serializer,
906+
{
907+
self.0.serialize(serializer)
908+
}
909+
}
910+
911+
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Utf8LossyDeserialization<T> {
912+
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
913+
where
914+
D: serde::Deserializer<'de>,
915+
{
916+
struct V<T>(PhantomData<fn() -> T>);
917+
impl<'de, T: Deserialize<'de>> Visitor<'de> for V<T> {
918+
type Value = Utf8LossyDeserialization<T>;
919+
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
920+
formatter.write_str("Utf8Lossy wrapper")
921+
}
922+
fn visit_newtype_struct<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
923+
where
924+
D: serde::Deserializer<'de>,
925+
{
926+
T::deserialize(deserializer).map(Utf8LossyDeserialization)
927+
}
928+
}
929+
deserializer.deserialize_newtype_struct(UTF8_LOSSY_NEWTYPE, V(PhantomData))
930+
}
931+
}

src/tests/modules/serializer_deserializer.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ fn test_encode_decode_utf8_string_invalid() {
7373
doc.to_writer(&mut buf).unwrap();
7474

7575
let expected = doc! { "key": "��" };
76+
#[allow(deprecated)]
7677
let decoded = Document::from_reader_utf8_lossy(&mut Cursor::new(buf)).unwrap();
7778
assert_eq!(decoded, expected);
7879
}

src/tests/serde_helpers.rs

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1+
use core::str;
2+
13
use serde::{de::Visitor, Deserialize, Serialize};
24

3-
use crate::serde_helpers::HumanReadable;
5+
use crate::{
6+
from_slice,
7+
serde_helpers::{HumanReadable, Utf8LossyDeserialization},
8+
};
49

510
#[test]
611
fn human_readable_wrapper() {
@@ -135,3 +140,50 @@ fn human_readable_wrapper() {
135140
let raw_tripped: Data = crate::from_slice(&bytes).unwrap();
136141
assert_eq!(&raw_tripped, &expected);
137142
}
143+
144+
#[test]
145+
#[allow(dead_code)] // suppress warning for unread fields
146+
fn utf8_lossy_wrapper() {
147+
let invalid_bytes = b"\x80\xae".to_vec();
148+
let invalid_string = unsafe { String::from_utf8_unchecked(invalid_bytes) };
149+
150+
let both_strings_invalid_bytes =
151+
rawdoc! { "s1": invalid_string.clone(), "s2": invalid_string.clone() }.into_bytes();
152+
let first_string_invalid_bytes =
153+
rawdoc! { "s1": invalid_string.clone(), "s2": ":)" }.into_bytes();
154+
155+
let expected_replacement = "��".to_string();
156+
157+
#[derive(Debug, Deserialize)]
158+
struct NoUtf8Lossy {
159+
s1: String,
160+
s2: String,
161+
}
162+
163+
from_slice::<NoUtf8Lossy>(&both_strings_invalid_bytes).unwrap_err();
164+
165+
let s = from_slice::<Utf8LossyDeserialization<NoUtf8Lossy>>(&both_strings_invalid_bytes)
166+
.unwrap()
167+
.0;
168+
assert_eq!(s.s1, expected_replacement);
169+
assert_eq!(s.s2, expected_replacement);
170+
171+
#[derive(Debug, Deserialize)]
172+
struct FirstStringUtf8Lossy {
173+
s1: Utf8LossyDeserialization<String>,
174+
s2: String,
175+
}
176+
177+
let s = from_slice::<FirstStringUtf8Lossy>(&first_string_invalid_bytes).unwrap();
178+
assert_eq!(s.s1.0, expected_replacement);
179+
assert_eq!(&s.s2, ":)");
180+
181+
from_slice::<FirstStringUtf8Lossy>(&both_strings_invalid_bytes).unwrap_err();
182+
183+
let s =
184+
from_slice::<Utf8LossyDeserialization<FirstStringUtf8Lossy>>(&both_strings_invalid_bytes)
185+
.unwrap()
186+
.0;
187+
assert_eq!(s.s1.0, expected_replacement);
188+
assert_eq!(s.s2, expected_replacement);
189+
}

src/tests/spec/corpus.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::{
77

88
use crate::{
99
raw::{RawBsonRef, RawDocument},
10+
serde_helpers::Utf8LossyDeserialization,
1011
tests::LOCK,
1112
Bson,
1213
Document,
@@ -549,12 +550,15 @@ fn run_test(test: TestFile) {
549550
crate::from_reader::<_, Document>(bson.as_slice()).expect_err(description.as_str());
550551

551552
if decode_error.description.contains("invalid UTF-8") {
553+
#[allow(deprecated)]
552554
crate::from_reader_utf8_lossy::<_, Document>(bson.as_slice()).unwrap_or_else(|err| {
553555
panic!(
554556
"{}: utf8_lossy should not fail (failed with {:?})",
555557
description, err
556558
)
557559
});
560+
crate::from_slice::<Utf8LossyDeserialization<Document>>(bson.as_slice())
561+
.expect(&description);
558562
}
559563
}
560564

0 commit comments

Comments
 (0)