Skip to content

Correctly handle negative code point values #317

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions core/common/src/Utf8.kt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
/**
* Encodes [codePoint] in UTF-8 and writes it to this sink.
*
* [codePoint] should represent a valid Unicode code point, meaning that its value should be within the Unicode codespace
* (`U+000000` .. `U+10ffff`); otherwise, [IllegalArgumentException] will be thrown.
*
* Note that in general, a value retrieved from [Char.code] cannot be written directly,
* as it may be a part of a [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2) (that could be
* detected using [Char.isSurrogate], or [Char.isHighSurrogate] and [Char.isLowSurrogate]).
Expand All @@ -136,6 +139,7 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
* @param codePoint the codePoint to be written.
*
* @throws IllegalStateException when the sink is closed.
* @throws IllegalArgumentException when the [codePoint] value is negative or greater than `U+10ffff`.
*
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8CodePointSample
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeSurrogatePair
Expand Down Expand Up @@ -510,6 +514,12 @@ private fun Buffer.commonWriteUtf8(string: String, beginIndex: Int, endIndex: In

private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
when {
codePoint < 0 || codePoint > 0x10ffff -> {
throw IllegalArgumentException(
"Code point value is out of Unicode codespace 0..0x10ffff: 0x${codePoint.toHexString()} ($codePoint)"
)
}

codePoint < 0x80 -> {
// Emit a 7-bit code point with 1 byte.
writeByte(codePoint.toByte())
Expand Down Expand Up @@ -539,7 +549,7 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
size += 3L
}

codePoint <= 0x10ffff -> {
else -> { // [0x10000, 0x10ffff]
// Emit a 21-bit code point with 4 bytes.
val tail = writableSegment(4)
tail.data[tail.limit] = (codePoint shr 18 or 0xf0).toByte() // 11110xxx
Expand All @@ -549,10 +559,6 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
tail.limit += 4
size += 4L
}

else -> {
throw IllegalArgumentException("Unexpected code point: 0x${codePoint.toHexString()}")
}
}
}

Expand Down
14 changes: 13 additions & 1 deletion core/common/test/Utf8Test.kt
Original file line number Diff line number Diff line change
Expand Up @@ -369,9 +369,21 @@ class Utf8Test {
@Test
fun writeCodePointBeyondUnicodeMaximum() {
val buffer = Buffer()
assertFailsWith<IllegalArgumentException>("Unexpected code point: 0x110000") {
val ex = assertFailsWith<IllegalArgumentException> {
buffer.writeCodePointValue(0x110000)
}
assertEquals("Code point value is out of Unicode codespace 0..0x10ffff: 0x110000 (1114112)",
ex.message)
}

/**
 * Verifies that writing the negative code point value `-1` fails with an
 * [IllegalArgumentException] whose message reports the value both in
 * hexadecimal (`0xffffffff`) and in decimal (`-1`).
 */
@Test
fun writeCodePointBelowUnicodeMinimum() {
    val sink = Buffer()
    val expectedMessage = "Code point value is out of Unicode codespace 0..0x10ffff: 0xffffffff (-1)"

    // Only the write itself is expected to throw; the buffer is created outside the asserted block.
    val failure = assertFailsWith<IllegalArgumentException> { sink.writeCodePointValue(-1) }

    assertEquals(expectedMessage, failure.message)
}

@Test
Expand Down