Skip to content

Commit dbac7e9

Browse files
authored
Correctly handle negative code point values (#317)
1 parent 204aa40 commit dbac7e9

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

core/common/src/Utf8.kt

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,9 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
122122
/**
123123
* Encodes [codePoint] in UTF-8 and writes it to this sink.
124124
*
125+
* [codePoint] should represent a valid Unicode code point, meaning that its value should be within the Unicode codespace
126+
* (`U+000000` .. `U+10ffff`), otherwise [IllegalArgumentException] will be thrown.
127+
*
125128
* Note that in general, a value retrieved from [Char.code] cannot be written directly
126129
* as it may be a part of a [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2) (that could be
127130
* detected using [Char.isSurrogate], or [Char.isHighSurrogate] and [Char.isLowSurrogate]).
@@ -136,6 +139,7 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
136139
* @param codePoint the codePoint to be written.
137140
*
138141
* @throws IllegalStateException when the sink is closed.
142+
* @throws IllegalArgumentException when [codePoint] value is negative, or greater than `U+10ffff`.
139143
*
140144
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8CodePointSample
141145
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeSurrogatePair
@@ -510,6 +514,12 @@ private fun Buffer.commonWriteUtf8(string: String, beginIndex: Int, endIndex: In
510514

511515
private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
512516
when {
517+
codePoint < 0 || codePoint > 0x10ffff -> {
518+
throw IllegalArgumentException(
519+
"Code point value is out of Unicode codespace 0..0x10ffff: 0x${codePoint.toHexString()} ($codePoint)"
520+
)
521+
}
522+
513523
codePoint < 0x80 -> {
514524
// Emit a 7-bit code point with 1 byte.
515525
writeByte(codePoint.toByte())
@@ -539,7 +549,7 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
539549
size += 3L
540550
}
541551

542-
codePoint <= 0x10ffff -> {
552+
else -> { // [0x10000, 0x10ffff]
543553
// Emit a 21-bit code point with 4 bytes.
544554
val tail = writableSegment(4)
545555
tail.data[tail.limit] = (codePoint shr 18 or 0xf0).toByte() // 11110xxx
@@ -549,10 +559,6 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
549559
tail.limit += 4
550560
size += 4L
551561
}
552-
553-
else -> {
554-
throw IllegalArgumentException("Unexpected code point: 0x${codePoint.toHexString()}")
555-
}
556562
}
557563
}
558564

core/common/test/Utf8Test.kt

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,9 +369,21 @@ class Utf8Test {
369369
@Test
370370
fun writeCodePointBeyondUnicodeMaximum() {
371371
val buffer = Buffer()
372-
assertFailsWith<IllegalArgumentException>("Unexpected code point: 0x110000") {
372+
val ex = assertFailsWith<IllegalArgumentException> {
373373
buffer.writeCodePointValue(0x110000)
374374
}
375+
assertEquals("Code point value is out of Unicode codespace 0..0x10ffff: 0x110000 (1114112)",
376+
ex.message)
377+
}
378+
379+
@Test
380+
fun writeCodePointBelowUnicodeMinimum() {
381+
val buffer = Buffer()
382+
val ex = assertFailsWith<IllegalArgumentException> {
383+
buffer.writeCodePointValue(-1)
384+
}
385+
assertEquals("Code point value is out of Unicode codespace 0..0x10ffff: 0xffffffff (-1)",
386+
ex.message)
375387
}
376388

377389
@Test

0 commit comments

Comments
 (0)