|
80 | 80 | \end{note}
|
81 | 81 | If an input file is determined to be a UTF-8 file,
|
82 | 82 | then it shall be a well-formed UTF-8 code unit sequence and
|
83 |
| -it is decoded to produce a sequence of UCS scalar values |
84 |
| -that constitutes the sequence of elements of the translation character set. |
| 83 | +it is decoded to produce a sequence of Unicode scalar values. |
| 84 | +A sequence of translation character set elements is then formed |
| 85 | +by mapping each Unicode scalar value |
| 86 | +to the corresponding translation character set element. |
85 | 87 | In the resulting sequence,
|
86 | 88 | each pair of characters in the input sequence consisting of
|
87 | 89 | \unicode{000d}{carriage return} followed by \unicode{000a}{line feed},
|
|
244 | 246 | The \defnadj{translation}{character set} consists of the following elements:
|
245 | 247 | \begin{itemize}
|
246 | 248 | \item
|
247 |
| -each character named by ISO/IEC 10646, |
248 |
| -as identified by its unique UCS scalar value, and |
| 249 | +each abstract character assigned a code point in the Unicode codespace, and |
249 | 250 | \item
|
250 |
| -a distinct character for each UCS scalar value |
251 |
| -where no named character is assigned. |
| 251 | +a distinct character for each Unicode scalar value |
| 252 | +not assigned to an abstract character. |
252 | 253 | \end{itemize}
|
253 | 254 | \begin{note}
|
254 |
| -ISO/IEC 10646 code points are integers |
| 255 | +Unicode code points are integers |
255 | 256 | in the range $[0, \mathrm{10FFFF}]$ (hexadecimal).
|
256 | 257 | A surrogate code point is a value
|
257 | 258 | in the range $[\mathrm{D800}, \mathrm{DFFF}]$ (hexadecimal).
|
258 |
| -A UCS scalar value is any code point that is not a surrogate code point. |
| 259 | +A Unicode scalar value is any code point that is not a surrogate code point. |
259 | 260 | \end{note}
|
260 | 261 |
|
261 | 262 | \pnum
|
|
355 | 356 | \tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad}, or
|
356 | 357 | \tcode{\textbackslash u\{\grammarterm{simple-hexadecimal-digit-sequence}\}}
|
357 | 358 | designates the character in the translation character set
|
358 |
| -whose UCS scalar value is the hexadecimal number represented by |
| 359 | +whose Unicode scalar value is the hexadecimal number represented by |
359 | 360 | the sequence of \grammarterm{hexadecimal-digit}s
|
360 | 361 | in the \grammarterm{universal-character-name}.
|
361 |
| -The program is ill-formed if that number is not a UCS scalar value. |
| 362 | +The program is ill-formed if that number is not a Unicode scalar value. |
362 | 363 |
|
363 | 364 | \pnum
|
364 | 365 | A \grammarterm{universal-character-name}
|
365 | 366 | that is a \grammarterm{named-universal-character}
|
366 |
| -designates the character named by its \grammarterm{n-char-sequence}. |
367 |
| -A character is so named if the \grammarterm{n-char-sequence} is equal to |
368 |
| -\begin{itemize} |
369 |
| -\item |
370 |
| -the associated character name or associated character name alias |
371 |
| -specified in ISO/IEC 10646 subclause ``Code charts and lists of character names'' |
372 |
| -or |
373 |
| -\item |
374 |
| -the control code alias given in \tref{lex.charset.ucn}. |
| 367 | +designates the corresponding character |
| 368 | +in the Unicode Standard (chapter 4.8, ``Name'') |
| 369 | +if the \grammarterm{n-char-sequence} is equal |
| 370 | +to its character name or |
| 371 | +to one of its character name aliases of |
| 372 | +type ``control'', ``correction'', or ``alternate''; |
| 373 | +otherwise, the program is ill-formed. |
375 | 374 | \begin{note}
|
376 |
| -The aliases in \tref{lex.charset.ucn} are provided for control characters |
377 |
| -which otherwise have no associated character name or character name alias. |
378 |
| -These names are derived from |
| 375 | +These aliases are listed in |
379 | 376 | the Unicode Character Database's \tcode{NameAliases.txt}.
|
380 |
| -For historical reasons, control characters are formally unnamed. |
381 |
| -\end{note} |
382 |
| -\end{itemize} |
383 |
| -\begin{note} |
384 |
| -None of the associated character names, |
385 |
| -associated character name aliases, or |
386 |
| -control code aliases |
387 |
| -have leading or trailing spaces. |
| 377 | +None of these names or aliases have leading or trailing spaces. |
388 | 378 | \end{note}
|
389 | 379 |
|
390 |
| -\begin{multicolfloattable}{Control code aliases}{lex.charset.ucn}{ll} |
391 |
| -\unicode{0000}{null} \\ |
392 |
| -\unicode{0001}{start of heading} \\ |
393 |
| -\unicode{0002}{start of text} \\ |
394 |
| -\unicode{0003}{end of text} \\ |
395 |
| -\unicode{0004}{end of transmission} \\ |
396 |
| -\unicode{0005}{enquiry} \\ |
397 |
| -\unicode{0006}{acknowledge} \\ |
398 |
| -\unicode{0007}{alert} \\ |
399 |
| -\unicode{0008}{backspace} \\ |
400 |
| -\unicode{0009}{character tabulation} \\ |
401 |
| -\unicode{0009}{horizontal tabulation} \\ |
402 |
| -\unicode{000a}{line feed} \\ |
403 |
| -\unicode{000a}{new line} \\ |
404 |
| -\unicode{000a}{end of line} \\ |
405 |
| -\unicode{000b}{line tabulation} \\ |
406 |
| -\unicode{000b}{vertical tabulation} \\ |
407 |
| -\unicode{000c}{form feed} \\ |
408 |
| -\unicode{000d}{carriage return} \\ |
409 |
| -\unicode{000e}{shift out} \\ |
410 |
| -\unicode{000e}{locking-shift one} \\ |
411 |
| -\unicode{000f}{shift in} \\ |
412 |
| -\unicode{000f}{locking-shift zero} \\ |
413 |
| -\unicode{0010}{data link escape} \\ |
414 |
| -\unicode{0011}{device control one} \\ |
415 |
| -\unicode{0012}{device control two} \\ |
416 |
| -\unicode{0013}{device control three} \\ |
417 |
| -\unicode{0014}{device control four} \\ |
418 |
| -\unicode{0015}{negative acknowledge} \\ |
419 |
| -\unicode{0016}{synchronous idle} \\ |
420 |
| -\unicode{0017}{end of transmission block} \\ |
421 |
| -\unicode{0018}{cancel} \\ |
422 |
| -\unicode{0019}{end of medium} \\ |
423 |
| -\unicode{001a}{substitute} \\ |
424 |
| -\unicode{001b}{escape} \\ |
425 |
| -\unicode{001c}{information separator four} \\ |
426 |
| -\unicode{001c}{file separator} \\ |
427 |
| -\unicode{001d}{information separator three} \\ |
428 |
| -\unicode{001d}{group separator} \\ |
429 |
| -\unicode{001e}{information separator two} \\ |
430 |
| -\unicode{001e}{record separator} \\ |
431 |
| -\unicode{001f}{information separator one} \\ |
432 |
| -\unicode{001f}{unit separator} \\ |
433 |
| -\columnbreak |
434 |
| -\unicode{007f}{delete} \\ |
435 |
| -\unicode{0082}{break permitted here} \\ |
436 |
| -\unicode{0083}{no break here} \\ |
437 |
| -\unicode{0084}{index} \\ |
438 |
| -\unicode{0085}{next line} \\ |
439 |
| -\unicode{0086}{start of selected area} \\ |
440 |
| -\unicode{0087}{end of selected area} \\ |
441 |
| -\unicode{0088}{character tabulation set} \\ |
442 |
| -\unicode{0088}{horizontal tabulation set} \\ |
443 |
| -\unicode{0089}{character tabulation with justification} \\ |
444 |
| -\unicode{0089}{horizontal tabulation with justification} \\ |
445 |
| -\unicode{008a}{line tabulation set} \\ |
446 |
| -\unicode{008a}{vertical tabulation set} \\ |
447 |
| -\unicode{008b}{partial line forward} \\ |
448 |
| -\unicode{008b}{partial line down} \\ |
449 |
| -\unicode{008c}{partial line backward} \\ |
450 |
| -\unicode{008c}{partial line up} \\ |
451 |
| -\unicode{008d}{reverse line feed} \\ |
452 |
| -\unicode{008d}{reverse index} \\ |
453 |
| -\unicode{008e}{single shift two} \\ |
454 |
| -\unicode{008e}{single-shift-2} \\ |
455 |
| -\unicode{008f}{single shift three} \\ |
456 |
| -\unicode{008f}{single-shift-3} \\ |
457 |
| -\unicode{0090}{device control string} \\ |
458 |
| -\unicode{0091}{private use one} \\ |
459 |
| -\unicode{0091}{private use-1} \\ |
460 |
| -\unicode{0092}{private use two} \\ |
461 |
| -\unicode{0092}{private use-2} \\ |
462 |
| -\unicode{0093}{set transmit state} \\ |
463 |
| -\unicode{0094}{cancel character} \\ |
464 |
| -\unicode{0095}{message waiting} \\ |
465 |
| -\unicode{0096}{start of guarded area} \\ |
466 |
| -\unicode{0096}{start of protected area} \\ |
467 |
| -\unicode{0097}{end of guarded area} \\ |
468 |
| -\unicode{0097}{end of protected area} \\ |
469 |
| -\unicode{0098}{start of string} \\ |
470 |
| -\unicode{009a}{single character introducer} \\ |
471 |
| -\unicode{009b}{control sequence introducer} \\ |
472 |
| -\unicode{009c}{string terminator} \\ |
473 |
| -\unicode{009d}{operating system command} \\ |
474 |
| -\unicode{009e}{privacy message} \\ |
475 |
| -\unicode{009f}{application program command} \\ |
476 |
| -\end{multicolfloattable} |
477 |
| - |
478 | 380 | \pnum
|
479 | 381 | If a \grammarterm{universal-character-name} outside
|
480 | 382 | the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or
|
|
493 | 395 | The \defnadj{basic literal}{character set} consists of
|
494 | 396 | all characters of the basic character set,
|
495 | 397 | plus the control characters specified in \tref{lex.charset.literal}.
|
496 |
| -\begin{note} |
497 |
| -The alias \uname{bell} for \ucode{0007} shown in ISO 10646 |
498 |
| -is ambiguous with \unicode{1f514}{bell}. |
499 |
| -\end{note} |
500 | 398 |
|
501 | 399 | \begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll}
|
502 | 400 | \topline
|
|
546 | 444 | \indextext{UTF-16}%
|
547 | 445 | \indextext{UTF-32}%
|
548 | 446 | For a UTF-8, UTF-16, or UTF-32 literal,
|
549 |
| -the UCS scalar value |
| 447 | +the Unicode scalar value |
550 | 448 | corresponding to each character of the translation character set
|
551 |
| -is encoded as specified in ISO/IEC 10646 for the respective UCS encoding form. |
| 449 | +is encoded as specified in the Unicode Standard |
| 450 | +for the respective Unicode encoding form. |
552 | 451 | \indextext{character set|)}
|
553 | 452 |
|
554 | 453 | \rSec1[lex.pptoken]{Preprocessing tokens}
|
|
889 | 788 | \begin{bnf}
|
890 | 789 | \nontermdef{identifier-start}\br
|
891 | 790 | nondigit\br
|
892 |
| - \textnormal{an element of the translation character set of class XID_Start} |
| 791 | + \textnormal{an element of the translation character set with the Unicode property XID_Start} |
893 | 792 | \end{bnf}
|
894 | 793 |
|
895 | 794 | \begin{bnf}
|
896 | 795 | \nontermdef{identifier-continue}\br
|
897 | 796 | digit\br
|
898 | 797 | nondigit\br
|
899 |
| - \textnormal{an element of the translation character set of class XID_Continue} |
| 798 | + \textnormal{an element of the translation character set with the Unicode property XID_Continue} |
900 | 799 | \end{bnf}
|
901 | 800 |
|
902 | 801 | \begin{bnf}
|
|
915 | 814 | \pnum
|
916 | 815 | \indextext{name!length of}%
|
917 | 816 | \indextext{name}%
|
918 |
| -The character classes XID_Start and XID_Continue |
919 |
| -are Derived Core Properties as described by \UAX{44}. |
| 817 | +\begin{note} |
| 818 | +The character properties XID_Start and XID_Continue are Derived Core Properties |
| 819 | +as described by \UAX{44} of the Unicode Standard. |
920 | 820 | \begin{footnote}
|
921 | 821 | On systems in which linkers cannot accept extended
|
922 | 822 | characters, an encoding of the \grammarterm{universal-character-name} can be used in
|
|
927 | 827 | place a translation limit on significant characters for external
|
928 | 828 | identifiers.
|
929 | 829 | \end{footnote}
|
| 830 | +\end{note} |
930 | 831 | The program is ill-formed
|
931 | 832 | if an \grammarterm{identifier} does not conform to
|
932 |
| -Normalization Form C as specified in ISO/IEC 10646. |
| 833 | +Normalization Form C as specified in the Unicode Standard. |
933 | 834 | \begin{note}
|
934 | 835 | Identifiers are case-sensitive.
|
935 | 836 | \end{note}
|
|
2102 | 2003 | \impldef{code unit sequence for non-representable \grammarterm{string-literal}}
|
2103 | 2004 | code unit sequence is encoded.
|
2104 | 2005 | \begin{note}
|
2105 |
| -No character lacks representation in any of the UCS encoding forms. |
| 2006 | +Every character has a representation in each Unicode encoding form. |
2106 | 2007 | \end{note}
|
2107 | 2008 | When encoding a stateful character encoding,
|
2108 | 2009 | implementations should encode the first such sequence
|
|
0 commit comments