Skip to content

Commit c779885

Browse files
committed
Bug 36572135 - Add support for deserialization of 4-byte UTF-8 sequences to C++ client
[git-p4: depot-paths = "//dev/main.cpp/": change = 110064]
1 parent 9b7c1aa commit c779885

File tree

3 files changed

+168
-21
lines changed

3 files changed

+168
-21
lines changed

src/coherence/lang/String.cpp

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/*
2-
* Copyright (c) 2000, 2020, Oracle and/or its affiliates.
2+
* Copyright (c) 2000, 2024, Oracle and/or its affiliates.
33
*
44
* Licensed under the Universal Permissive License v 1.0 as shown at
5-
* http://oss.oracle.com/licenses/upl.
5+
* https://oss.oracle.com/licenses/upl.
66
*/
77
#include "coherence/lang/String.hpp"
88

@@ -227,11 +227,29 @@ namespace
227227
switch (ch & 0xF0)
228228
{
229229
case 0xF0:
230-
// 4-octet format: 1111 0xxx, 10 xx xxxx, 10xx xxxx, 10xx xxxx
231-
COH_THROW_STREAM (IllegalArgumentException,
232-
"encountered non-BMP Unicode character 0x" <<
233-
std::hex << uint32_t(ch) << std::dec <<
234-
" at index: " << cb);
230+
// 4-octet format: 1111 0xxx, 10xx xxxx, 10xx xxxx, 10xx xxxx (supplementary planes)
231+
// ensure there is at least one more byte
232+
++cb;
233+
if (cb + 1 >= c)
234+
{
235+
// Octet indicates more bytes in character,
236+
// but cb will go past the end of the string
237+
COH_THROW_STREAM (IllegalArgumentException,
238+
"encountered incomplete UTF-8 character 0x" <<
239+
std::hex << uint32_t(ch) << std::dec <<
240+
" at the end of the string");
241+
}
242+
243+
// validate that the next byte is a continuation
244+
if ( (a[cb] & 0xC0) != 0x80)
245+
{
246+
// The octet following the lead byte is not a continuation byte
247+
COH_THROW_STREAM (IllegalArgumentException,
248+
"encountered invalid UTF-8 character 0x" <<
249+
std::hex << uint32_t(ch) << std::dec <<
250+
" at index " << cb);
251+
}
252+
// fall through
235253

236254
case 0xE0:
237255
// 3-octet format: 1110 xxxx, 10xx xxxx, 10xx xxxx
@@ -292,7 +310,7 @@ namespace
292310
// 1-octet format: 0xxx xxxx
293311
++cb;
294312
break;
295-
313+
296314
case 0x00:
297315
// 1-octet format. Possibly NUL
298316
if (0 == ch && String::npos == c)

tests/unit/coherence/io/OctetArrayReadBufferTest.hpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/*
2-
* Copyright (c) 2000, 2020, Oracle and/or its affiliates.
2+
* Copyright (c) 2000, 2024, Oracle and/or its affiliates.
33
*
44
* Licensed under the Universal Permissive License v 1.0 as shown at
5-
* http://oss.oracle.com/licenses/upl.
5+
* https://oss.oracle.com/licenses/upl.
66
*/
77
#include "coherence/lang.ns"
88

@@ -761,6 +761,38 @@ class OctetArrayReadBufferSuite : public CxxTest::TestSuite
761761
TS_ASSERT(hbi->readString() == NULL);
762762
TS_ASSERT(hbi->readString()->equals(vs));
763763
TS_ASSERT(hbi->available() == 0);
764+
765+
// 2 character 4 byte UTF-8 string - 0xf0938080, 0xf09f8ebf
766+
char four[9];
767+
// 0xf0938080
768+
four[0] = (char) 0xf0;
769+
four[1] = (char) 0x93;
770+
four[2] = (char) 0x80;
771+
four[3] = (char) 0x80;
772+
// 0xf09f8ebf
773+
four[4] = (char) 0xf0;
774+
four[5] = (char) 0x9f;
775+
four[6] = (char) 0x8e;
776+
four[7] = (char) 0xbf;
777+
four[8] = '\0';
778+
std::string data(four);
779+
780+
vs = String::create(data);
781+
TS_ASSERT_EQUALS(size32_t(2), vs->length());
782+
hwb = OctetArrayWriteBuffer::create(10);
783+
hbo = hwb->getBufferOutput();
784+
hbo->writeString(vs);
785+
vab = hwb->toOctetArray();
786+
787+
TS_ASSERT_EQUALS(size32_t(9), vab->length);
788+
hrb = OctetArrayReadBuffer::create(vab, 0, 9);
789+
TS_ASSERT_EQUALS(size32_t(9), hrb->length());
790+
hbi = hrb->getBufferInput();
791+
792+
String::View vsOut = hbi->readString();
793+
TS_ASSERT(vsOut->equals(vs));
794+
std::string conv = vsOut;
795+
TS_ASSERT_EQUALS(data, conv);
764796
}
765797

766798
/**

tests/unit/coherence/lang/StringTest.hpp

Lines changed: 108 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/*
2-
* Copyright (c) 2000, 2020, Oracle and/or its affiliates.
2+
* Copyright (c) 2000, 2024, Oracle and/or its affiliates.
33
*
44
* Licensed under the Universal Permissive License v 1.0 as shown at
5-
* http://oss.oracle.com/licenses/upl.
5+
* https://oss.oracle.com/licenses/upl.
66
*/
77
#include "cxxtest/TestSuite.h"
88

@@ -479,7 +479,7 @@ class StringTest : public CxxTest::TestSuite
479479
String::View vString = String::create(data);
480480
std::string conv = vString;
481481
TS_ASSERT(data == conv);
482-
482+
483483
// Test fix for COH-3709
484484
car[0] = (char) 0x01;
485485
car[1] = (char) 0xDD;
@@ -499,7 +499,7 @@ class StringTest : public CxxTest::TestSuite
499499

500500
void testUTF8Validation()
501501
{
502-
// Test Euro character U+20AC; good 3 byte
502+
// test Euro character U+20AC; good 3 byte
503503
char euro[4];
504504
euro[0] = (char) 0xe2;
505505
euro[1] = (char) 0x82;
@@ -509,16 +509,17 @@ class StringTest : public CxxTest::TestSuite
509509
String::View vString = String::create(data);
510510
std::string conv = vString;
511511
TS_ASSERT_EQUALS(data, conv);
512-
513-
// Test bad 3 byte, bad second byte
512+
513+
// test bad 3 byte, bad second byte
514514
char three2[4];
515515
three2[0] = (char) 0xe2;
516516
three2[1] = (char) 0x22; // bad - should have upper bits '10'
517517
three2[2] = (char) 0xa9;
518518
three2[3] = '\0';
519519
std::string data2(three2);
520520
TS_ASSERT_THROWS(String::create(data2), IllegalArgumentException::View);
521-
// Test bad 3 byte, bad third byte
521+
522+
// test bad 3 byte, bad third byte
522523
char three3[4];
523524
three3[0] = (char) 0xe2;
524525
three3[1] = (char) 0x82;
@@ -527,7 +528,7 @@ class StringTest : public CxxTest::TestSuite
527528
std::string data3(three3);
528529
TS_ASSERT_THROWS(String::create(data3), IllegalArgumentException::View);
529530

530-
// Test good 2 byte
531+
// test good 2 byte
531532
char two[3];
532533
two[0] = (char) 0xc2; // c2 a2 is cent character
533534
two[1] = (char) 0xa2;
@@ -537,12 +538,108 @@ class StringTest : public CxxTest::TestSuite
537538
std::string conv4 = vString2;
538539
TS_ASSERT_EQUALS(data4, conv4);
539540

540-
// Test bad 2 byte
541+
// test bad 2 byte
541542
char two2[3];
542-
two2[0] = (char) 0xc2;
543-
two2[1] = (char) 0x22; // bad -should have upper bits '10'
543+
two2[0] = (char) 0xc2;
544+
two2[1] = (char) 0x22; // bad -should have upper bits '10'
544545
two2[2] = '\0';
545546
std::string data5(two2);
546547
TS_ASSERT_THROWS(String::create(data5), IllegalArgumentException::View);
548+
549+
// test good 4 byte single character: 0xf0938080
550+
char four[5];
551+
four[0] = (char) 0xf0;
552+
four[1] = (char) 0x93;
553+
four[2] = (char) 0x80;
554+
four[3] = (char) 0x80;
555+
four[4] = '\0';
556+
std::string data6(four);
557+
String::View vString3 = String::create(data6);
558+
std::string conv5 = vString3;
559+
TS_ASSERT_EQUALS(data6, conv5);
560+
TS_ASSERT_EQUALS(size32_t(1), vString3->length());
561+
562+
// test good 4 byte multiple characters: 0xf09f8ebf, 0xf09f8f80, 0xf09f8e89, 0xf09f9294
563+
char four2[17];
564+
// 0xf09f8ebf
565+
four2[0] = (char) 0xf0;
566+
four2[1] = (char) 0x9f;
567+
four2[2] = (char) 0x8e;
568+
four2[3] = (char) 0xbf;
569+
// 0xf09f8f80
570+
four2[4] = (char) 0xf0;
571+
four2[5] = (char) 0x9f;
572+
four2[6] = (char) 0x8f;
573+
four2[7] = (char) 0x80;
574+
// 0xf09f8e89
575+
four2[8] = (char) 0xf0;
576+
four2[9] = (char) 0x9f;
577+
four2[10] = (char) 0x8e;
578+
four2[11] = (char) 0x89;
579+
// 0xf09f9294
580+
four2[12] = (char) 0xf0;
581+
four2[13] = (char) 0x9f;
582+
four2[14] = (char) 0x92;
583+
four2[15] = (char) 0x94;
584+
four2[16] = '\0';
585+
std::string data7(four2);
586+
vString3 = String::create(data7);
587+
conv5 = vString3;
588+
TS_ASSERT_EQUALS(data7, conv5);
589+
TS_ASSERT_EQUALS(size32_t(4), vString3->length());
590+
591+
// test bad 4 byte, bad second byte
592+
char four3[5];
593+
four3[0] = (char) 0xf0;
594+
four3[1] = (char) 0x22; // bad - should have upper bits '10'
595+
four3[2] = (char) 0x8e;
596+
four3[3] = (char) 0xbf;
597+
four3[4] = '\0';
598+
std::string data8(four3);
599+
TS_ASSERT_THROWS(String::create(data8), IllegalArgumentException::View);
600+
601+
// test bad 4 byte, bad third byte
602+
char four4[5];
603+
four4[0] = (char) 0xf0;
604+
four4[1] = (char) 0x9f;
605+
four4[2] = (char) 0x22; // bad - should have upper bits '10'
606+
four4[3] = (char) 0xbf;
607+
four4[4] = '\0';
608+
std::string data9(four4);
609+
TS_ASSERT_THROWS(String::create(data9), IllegalArgumentException::View);
610+
611+
// test bad 4 byte, bad fourth byte
612+
char four5[5];
613+
four5[0] = (char) 0xf0;
614+
four5[1] = (char) 0x9f;
615+
four5[2] = (char) 0x8e;
616+
four5[3] = (char) 0x29; // bad - should have upper bits '10'
617+
four5[4] = '\0';
618+
std::string data10(four5);
619+
TS_ASSERT_THROWS(String::create(data10), IllegalArgumentException::View);
620+
621+
// test bad 4 byte, incomplete - just one byte
622+
char four6[2];
623+
four6[0] = (char) 0xf0;
624+
four6[1] = '\0';
625+
std::string data11(four6);
626+
TS_ASSERT_THROWS(String::create(data11), IllegalArgumentException::View);
627+
628+
// test bad 4 byte, incomplete - just two bytes
629+
char four7[3];
630+
four7[0] = (char) 0xf0;
631+
four7[1] = (char) 0x93;
632+
four7[2] = '\0';
633+
std::string data12(four7);
634+
TS_ASSERT_THROWS(String::create(data12), IllegalArgumentException::View);
635+
636+
// test bad 4 byte, incomplete - just three bytes
637+
char four8[4];
638+
four8[0] = (char) 0xf0;
639+
four8[1] = (char) 0x93;
640+
four8[2] = (char) 0x80;
641+
four8[3] = '\0';
642+
std::string data13(four8);
643+
TS_ASSERT_THROWS(String::create(data13), IllegalArgumentException::View);
547644
}
548645
};

0 commit comments

Comments
 (0)