Skip to content

Commit e9ef837

Browse files
committed
Reimplement UTF-8-related functions using Unsafe API
1 parent 3dc4e6f commit e9ef837

File tree

1 file changed

+78
-68
lines changed

1 file changed

+78
-68
lines changed

core/common/src/Utf8.kt

Lines changed: 78 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@
7070
package kotlinx.io
7171

7272
import kotlinx.io.internal.*
73+
import kotlinx.io.unsafe.UnsafeBufferOperations
74+
import kotlinx.io.unsafe.withData
75+
import kotlin.math.min
7376

7477
/**
7578
* Returns the number of bytes used to encode the slice of `string` as UTF-8 when using [Sink.writeString].
@@ -454,6 +457,7 @@ private fun Buffer.commonReadUtf8CodePoint(): Int {
454457
}
455458
}
456459

460+
@OptIn(UnsafeIoApi::class)
457461
private inline fun Buffer.commonWriteUtf8(beginIndex: Int, endIndex: Int, charAt: (Int) -> Char) {
458462
// Transcode UTF-16 chars to UTF-8 bytes.
459463
var i = beginIndex
@@ -462,45 +466,49 @@ private inline fun Buffer.commonWriteUtf8(beginIndex: Int, endIndex: Int, charAt
462466

463467
when {
464468
c < 0x80 -> {
465-
val tail = writableSegment(1)
466-
val data = tail.data
467-
val segmentOffset = tail.limit - i
468-
val runLimit = minOf(endIndex, Segment.SIZE - segmentOffset)
469-
470-
// Emit a 7-bit character with 1 byte.
471-
data[segmentOffset + i++] = c.toByte() // 0xxxxxxx
472-
473-
// Fast-path contiguous runs of ASCII characters. This is ugly, but yields a ~4x performance
474-
// improvement over independent calls to writeByte().
475-
while (i < runLimit) {
476-
c = charAt(i).code
477-
if (c >= 0x80) break
478-
data[segmentOffset + i++] = c.toByte() // 0xxxxxxx
469+
UnsafeBufferOperations.writeToTail(this, 1) { ctx, segment ->
470+
val segmentOffset = -i
471+
val runLimit = minOf(endIndex, i + segment.remainingCapacity)
472+
473+
// Emit a 7-bit character with 1 byte.
474+
ctx.setUnchecked(segment, segmentOffset + i++, c.toByte()) // 0xxxxxxx
475+
476+
// Fast-path contiguous runs of ASCII characters. This is ugly, but yields a ~4x performance
477+
// improvement over independent calls to writeByte().
478+
while (i < runLimit) {
479+
c = charAt(i).code
480+
if (c >= 0x80) break
481+
ctx.setUnchecked(segment, segmentOffset + i++, c.toByte()) // 0xxxxxxx
482+
}
483+
484+
i + segmentOffset // Equivalent to i - (previous i).
479485
}
480-
481-
val runSize = i + segmentOffset - tail.limit // Equivalent to i - (previous i).
482-
tail.limit += runSize
483-
sizeMut += runSize.toLong()
484486
}
485487

486488
c < 0x800 -> {
487489
// Emit an 11-bit character with 2 bytes.
488-
val tail = writableSegment(2)
489-
tail.data[tail.limit] = (c shr 6 or 0xc0).toByte() // 110xxxxx
490-
tail.data[tail.limit + 1] = (c and 0x3f or 0x80).toByte() // 10xxxxxx
491-
tail.limit += 2
492-
sizeMut += 2L
490+
UnsafeBufferOperations.writeToTail(this, 2) { ctx, segment ->
491+
ctx.setUnchecked(
492+
segment, 0,
493+
(c shr 6 or 0xc0).toByte(), // 110xxxxx
494+
(c and 0x3f or 0x80).toByte() // 10xxxxxx
495+
)
496+
2
497+
}
493498
i++
494499
}
495500

496501
c < 0xd800 || c > 0xdfff -> {
497502
// Emit a 16-bit character with 3 bytes.
498-
val tail = writableSegment(3)
499-
tail.data[tail.limit] = (c shr 12 or 0xe0).toByte() // 1110xxxx
500-
tail.data[tail.limit + 1] = (c shr 6 and 0x3f or 0x80).toByte() // 10xxxxxx
501-
tail.data[tail.limit + 2] = (c and 0x3f or 0x80).toByte() // 10xxxxxx
502-
tail.limit += 3
503-
sizeMut += 3L
503+
UnsafeBufferOperations.writeToTail(this, 3) { ctx, segment ->
504+
ctx.setUnchecked(
505+
segment, 0,
506+
(c shr 12 or 0xe0).toByte(), // 1110xxxx
507+
(c shr 6 and 0x3f or 0x80).toByte(), // 10xxxxxx
508+
(c and 0x3f or 0x80).toByte() // 10xxxxxx
509+
)
510+
3
511+
}
504512
i++
505513
}
506514

@@ -519,20 +527,23 @@ private inline fun Buffer.commonWriteUtf8(beginIndex: Int, endIndex: Int, charAt
519527
val codePoint = 0x010000 + (c and 0x03ff shl 10 or (low and 0x03ff))
520528

521529
// Emit a 21-bit character with 4 bytes.
522-
val tail = writableSegment(4)
523-
tail.data[tail.limit] = (codePoint shr 18 or 0xf0).toByte() // 11110xxx
524-
tail.data[tail.limit + 1] = (codePoint shr 12 and 0x3f or 0x80).toByte() // 10xxxxxx
525-
tail.data[tail.limit + 2] = (codePoint shr 6 and 0x3f or 0x80).toByte() // 10xxyyyy
526-
tail.data[tail.limit + 3] = (codePoint and 0x3f or 0x80).toByte() // 10yyyyyy
527-
tail.limit += 4
528-
sizeMut += 4L
530+
UnsafeBufferOperations.writeToTail(this, 4) { ctx, segment ->
531+
ctx.setUnchecked(segment, 0,
532+
(codePoint shr 18 or 0xf0).toByte(), // 11110xxx
533+
(codePoint shr 12 and 0x3f or 0x80).toByte(), // 10xxxxxx
534+
(codePoint shr 6 and 0x3f or 0x80).toByte(), // 10xxyyyy
535+
(codePoint and 0x3f or 0x80).toByte() // 10yyyyyy
536+
)
537+
4
538+
}
529539
i += 2
530540
}
531541
}
532542
}
533543
}
534544
}
535545

546+
@OptIn(UnsafeIoApi::class)
536547
private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
537548
when {
538549
codePoint < 0 || codePoint > 0x10ffff -> {
@@ -548,11 +559,11 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
548559

549560
codePoint < 0x800 -> {
550561
// Emit an 11-bit code point with 2 bytes.
551-
val tail = writableSegment(2)
552-
tail.data[tail.limit] = (codePoint shr 6 or 0xc0).toByte() // 110xxxxx
553-
tail.data[tail.limit + 1] = (codePoint and 0x3f or 0x80).toByte() // 10xxxxxx
554-
tail.limit += 2
555-
sizeMut += 2L
562+
UnsafeBufferOperations.writeToTail(this, 2) { ctx, segment ->
563+
ctx.setUnchecked(segment, 0, (codePoint shr 6 or 0xc0).toByte()) // 110xxxxx
564+
ctx.setUnchecked(segment, 1, (codePoint and 0x3f or 0x80).toByte()) // 10xxxxxx
565+
2
566+
}
556567
}
557568

558569
codePoint in 0xd800..0xdfff -> {
@@ -562,48 +573,47 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
562573

563574
codePoint < 0x10000 -> {
564575
// Emit a 16-bit code point with 3 bytes.
565-
val tail = writableSegment(3)
566-
tail.data[tail.limit] = (codePoint shr 12 or 0xe0).toByte() // 1110xxxx
567-
tail.data[tail.limit + 1] = (codePoint shr 6 and 0x3f or 0x80).toByte() // 10xxxxxx
568-
tail.data[tail.limit + 2] = (codePoint and 0x3f or 0x80).toByte() // 10xxxxxx
569-
tail.limit += 3
570-
sizeMut += 3L
576+
UnsafeBufferOperations.writeToTail(this, 3) { ctx, segment ->
577+
ctx.setUnchecked(segment, 0, (codePoint shr 12 or 0xe0).toByte()) // 1110xxxx
578+
ctx.setUnchecked(segment, 1, (codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxxxxx
579+
ctx.setUnchecked(segment, 2, (codePoint and 0x3f or 0x80).toByte()) // 10xxxxxx
580+
3
581+
}
571582
}
572583

573584
else -> { // [0x10000, 0x10ffff]
574585
// Emit a 21-bit code point with 4 bytes.
575-
val tail = writableSegment(4)
576-
tail.data[tail.limit] = (codePoint shr 18 or 0xf0).toByte() // 11110xxx
577-
tail.data[tail.limit + 1] = (codePoint shr 12 and 0x3f or 0x80).toByte() // 10xxxxxx
578-
tail.data[tail.limit + 2] = (codePoint shr 6 and 0x3f or 0x80).toByte() // 10xxyyyy
579-
tail.data[tail.limit + 3] = (codePoint and 0x3f or 0x80).toByte() // 10yyyyyy
580-
tail.limit += 4
581-
sizeMut += 4L
586+
UnsafeBufferOperations.writeToTail(this, 4) { ctx, segment ->
587+
ctx.setUnchecked(segment,0, (codePoint shr 18 or 0xf0).toByte()) // 11110xxx
588+
ctx.setUnchecked(segment,1, (codePoint shr 12 and 0x3f or 0x80).toByte()) // 10xxxxxx
589+
ctx.setUnchecked(segment,2, (codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxyyyy
590+
ctx.setUnchecked(segment,3, (codePoint and 0x3f or 0x80).toByte()) // 10yyyyyy
591+
4
592+
}
582593
}
583594
}
584595
}
585596

597+
@OptIn(UnsafeIoApi::class)
586598
private fun Buffer.commonReadUtf8(byteCount: Long): String {
587599
require(byteCount >= 0 && byteCount <= Int.MAX_VALUE) {
588600
"byteCount ($byteCount) is not within the range [0..${Int.MAX_VALUE})"
589601
}
590602
require(byteCount)
591603
if (byteCount == 0L) return ""
592604

593-
val s = head!!
594-
if (s.pos + byteCount > s.limit) {
595-
// If the string spans multiple segments, delegate to readBytes().
596-
597-
return readByteArray(byteCount.toInt()).commonToUtf8String()
598-
}
599-
600-
val result = s.data.commonToUtf8String(s.pos, s.pos + byteCount.toInt())
601-
s.pos += byteCount.toInt()
602-
sizeMut -= byteCount
603-
604-
if (s.pos == s.limit) {
605-
recycleHead()
605+
UnsafeBufferOperations.iterate(this) { ctx, head ->
606+
head!!
607+
if (head.size >= byteCount) {
608+
var result = ""
609+
ctx.withData(head) { data, pos, limit ->
610+
result = data.commonToUtf8String(pos, min(limit, pos + byteCount.toInt()))
611+
skip(byteCount)
612+
return result
613+
}
614+
}
606615
}
607616

608-
return result
617+
// If the string spans multiple segments, delegate to readByteArray().
618+
return readByteArray(byteCount.toInt()).commonToUtf8String()
609619
}

0 commit comments

Comments
 (0)