-
Notifications
You must be signed in to change notification settings - Fork 619
Feature/utf8 support #13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
e9ca7d6
Add a UTF-8 encoder/decoder package for node
jeskew f35c22b
Add a UTF-8 encoder/decoder package for browsers &c.
jeskew 94e17a4
Add a cross-platform UTF-8 encode/decoder package
jeskew 172ec13
Add Google Closure library attribution
jeskew 1d7afa1
Add TextEncoder/TextDecoder declaration
jeskew File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,3 +20,4 @@ jspm_packages | |
.yarn-integrity | ||
|
||
lerna-debug.log | ||
packages/*/package-lock.json |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
/node_modules/ | ||
*.js | ||
*.js.map | ||
*.d.ts |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import {fromUtf8, toUtf8} from '../'; | ||
|
||
// Swap the pure-JS implementation for stubs that return empty values, so the
// tests below can count how often the entry point calls into it.
jest.mock('../lib/pureJs', () => ({
    fromUtf8: jest.fn(() => new Uint8Array(0)),
    toUtf8: jest.fn(() => ''),
}));
import { | ||
fromUtf8 as jsFromUtf8, | ||
toUtf8 as jsToUtf8, | ||
} from '../lib/pureJs'; | ||
|
||
// Swap the Encoding-API-backed implementation for stubs that return empty
// values, so the tests below can count how often the entry point calls into it.
jest.mock('../lib/whatwgEncodingApi', () => ({
    fromUtf8: jest.fn(() => new Uint8Array(0)),
    toUtf8: jest.fn(() => ''),
}));
import { | ||
fromUtf8 as textEncoderFromUtf8, | ||
toUtf8 as textEncoderToUtf8, | ||
} from '../lib/whatwgEncodingApi'; | ||
|
||
// Reset the recorded calls on every stub before each test so that call-count
// assertions only see calls made by the test that is currently running.
beforeEach(() => {
    const stubs = [
        jsFromUtf8,
        jsToUtf8,
        textEncoderFromUtf8,
        textEncoderToUtf8,
    ];

    for (const stub of stubs) {
        (stub as any).mockClear();
    }
});
|
||
describe('fromUtf8', () => {
    // Number of times a jest mock function has been invoked.
    const timesCalled = (stub: any): number => stub.mock.calls.length;

    it('should use the Encoding API if available', () => {
        // Any function-valued global is enough to select the Encoding API path.
        (global as any).TextEncoder = jest.fn() as any;

        fromUtf8('foo');

        expect(timesCalled(textEncoderFromUtf8)).toBe(1);
        expect(timesCalled(jsFromUtf8)).toBe(0);
    });

    it('should use a JS implementation otherwise', () => {
        // Remove the global so the fallback path is taken.
        delete (global as any).TextEncoder;

        fromUtf8('foo');

        expect(timesCalled(textEncoderFromUtf8)).toBe(0);
        expect(timesCalled(jsFromUtf8)).toBe(1);
    });
});
|
||
describe('toUtf8', () => {
    // Number of times a jest mock function has been invoked.
    const timesCalled = (stub: any): number => stub.mock.calls.length;

    it('should use the Encoding API if available', () => {
        // Any function-valued global is enough to select the Encoding API path.
        (global as any).TextDecoder = jest.fn() as any;

        toUtf8(new Uint8Array(0));

        expect(timesCalled(textEncoderToUtf8)).toBe(1);
        expect(timesCalled(jsToUtf8)).toBe(0);
    });

    it('should use a JS implementation otherwise', () => {
        // Remove the global so the fallback path is taken.
        delete (global as any).TextDecoder;

        toUtf8(new Uint8Array(0));

        expect(timesCalled(textEncoderToUtf8)).toBe(0);
        expect(timesCalled(jsToUtf8)).toBe(1);
    });
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import {fromUtf8, toUtf8} from '../lib/pureJs'; | ||
|
||
/**
 * Fixture table: each key is a sample string and each value is the byte
 * sequence of that string's UTF-8 encoding.
 *
 * The Arabic-script sample contains three zero-width non-joiners (U+200C,
 * bytes 226 128 140). They are invisible and easily stripped by editors or
 * copy/paste, so that key is written with explicit escapes; the escapes
 * correspond one-to-one with the byte array beside it.
 */
const utf8StringsToByteArrays: {[key: string]: Uint8Array} = {
    'ABC': new Uint8Array(['A'.charCodeAt(0), 'B'.charCodeAt(0), 'C'.charCodeAt(0)]),
    '🐎👱❤': new Uint8Array([240, 159, 144, 142, 240, 159, 145, 177, 226, 157, 164]),
    '☃💩': new Uint8Array([226, 152, 131, 240, 159, 146, 169]),
    'The rain in Spain falls mainly on the plain.': new Uint8Array([84, 104, 101, 32, 114, 97, 105, 110, 32, 105, 110, 32, 83, 112, 97, 105, 110, 32, 102, 97, 108, 108, 115, 32, 109, 97, 105, 110, 108, 121, 32, 111, 110, 32, 116, 104, 101, 32, 112, 108, 97, 105, 110, 46 ]),
    '\u062f\u0633\u062a\u200c\u0646\u0648\u0634\u062a\u0647\u200c\u0647\u0627 \u0646\u0645\u06cc\u200c\u0633\u0648\u0632\u0646\u062f': new Uint8Array([216, 175, 216, 179, 216, 170, 226, 128, 140, 217, 134, 217, 136, 216, 180, 216, 170, 217, 135, 226, 128, 140, 217, 135, 216, 167, 32, 217, 134, 217, 133, 219, 140, 226, 128, 140, 216, 179, 217, 136, 216, 178, 217, 134, 216, 175]),
    'Рукописи не горят': new Uint8Array([208, 160, 209, 131, 208, 186, 208, 190, 208, 191, 208, 184, 209, 129, 208, 184, 32, 208, 189, 208, 181, 32, 208, 179, 208, 190, 209, 128, 209, 143, 209, 130 ]),
};
|
||
// fromUtf8 turns a string into its UTF-8 bytes, i.e. it encodes; the test
// titles say so explicitly.
describe('fromUtf8', () => {
    for (const string of Object.keys(utf8StringsToByteArrays)) {
        it(`should UTF-8 encode "${string}" to the correct value`, () => {
            expect(fromUtf8(string)).toEqual(utf8StringsToByteArrays[string]);
        });
    }
});
|
||
// toUtf8 turns UTF-8 encoded bytes back into a string; the bytes handed to it
// are the encoded form, which the test titles now state.
describe('toUtf8', () => {
    for (const string of Object.keys(utf8StringsToByteArrays)) {
        it(`should derive "${string}" from the UTF-8 encoded bytes`, () => {
            expect(toUtf8(utf8StringsToByteArrays[string])).toBe(string);
        });
    }
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import { | ||
fromUtf8, | ||
toUtf8, | ||
} from '../lib/whatwgEncodingApi'; | ||
|
||
// Before each test, install fake TextDecoder/TextEncoder constructors on the
// global object. Each fake constructor always hands back the same stub
// instance, whose decode/encode method is a jest mock.
beforeEach(() => {
    const globals = global as any;
    const decoderStub = {decode: jest.fn(() => '')};
    const encoderStub = {encode: jest.fn(() => new Uint8Array(0))};

    globals.TextDecoder = jest.fn(() => decoderStub) as any;
    globals.TextEncoder = jest.fn(() => encoderStub) as any;
});
|
||
// Minimal constructor shapes for the globals this test file replaces with
// jest mocks; instances are typed loosely because they are stubs.
type TextDecoderCtor = new () => any;
type TextEncoderCtor = new () => any;

declare const TextDecoder: TextDecoderCtor;
declare const TextEncoder: TextEncoderCtor;
|
||
describe('WHATWG encoding spec compliant environment UTF-8 handling', () => {
    // Number of times a jest mock function has been invoked.
    const timesCalled = (stub: any): number => stub.mock.calls.length;

    it('should use the global TextDecoder to decode UTF-8', () => {
        // Constructing once yields the stub instance whose `decode` mock is
        // inspected below; the constructor's own call record is then cleared
        // so that only the call made by toUtf8 is counted.
        const sharedDecoder = new TextDecoder();
        (TextDecoder as any).mockClear();
        expect(timesCalled(TextDecoder)).toBe(0);

        toUtf8(new Uint8Array(0));

        expect(timesCalled(TextDecoder)).toBe(1);
        expect(timesCalled(sharedDecoder.decode)).toBe(1);
    });

    it('should use the global TextEncoder to encode UTF-8', () => {
        // Same approach as above, for the encoder side.
        const sharedEncoder = new TextEncoder();
        (TextEncoder as any).mockClear();
        expect(timesCalled(TextEncoder)).toBe(0);

        fromUtf8('string');

        expect(timesCalled(TextEncoder)).toBe(1);
        expect(timesCalled(sharedEncoder.encode)).toBe(1);
    });
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import { | ||
fromUtf8 as jsFromUtf8, | ||
toUtf8 as jsToUtf8, | ||
} from './lib/pureJs'; | ||
import { | ||
fromUtf8 as textEncoderFromUtf8, | ||
toUtf8 as textEncoderToUtf8, | ||
} from './lib/whatwgEncodingApi'; | ||
|
||
// Ambient declarations for the Encoding API globals. Either may be absent at
// runtime, so the functions in this module check with `typeof` before use.
declare const TextEncoder: Function | undefined;
declare const TextDecoder: Function | undefined;
|
||
/**
 * Converts a string into its UTF-8 bytes.
 *
 * Delegates to the `TextEncoder`-backed implementation when a global
 * `TextEncoder` function exists, and to the pure-JS implementation otherwise.
 * The check is made on every call.
 *
 * @param input The string to convert.
 * @returns The bytes produced by whichever implementation was selected.
 */
export function fromUtf8(input: string): Uint8Array {
    const encode = typeof TextEncoder === 'function'
        ? textEncoderFromUtf8
        : jsFromUtf8;

    return encode(input);
}
|
||
/**
 * Converts UTF-8 bytes into a string.
 *
 * Delegates to the `TextDecoder`-backed implementation when a global
 * `TextDecoder` function exists, and to the pure-JS implementation otherwise.
 * The check is made on every call.
 *
 * @param input The bytes to convert.
 * @returns The string produced by whichever implementation was selected.
 */
export function toUtf8(input: Uint8Array): string {
    const decode = typeof TextDecoder === 'function'
        ? textEncoderToUtf8
        : jsToUtf8;

    return decode(input);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
/**
 * Converts a JS string from its native UCS-2/UTF-16 representation into a
 * Uint8Array of the bytes used to represent the equivalent characters in UTF-8.
 *
 * A surrogate code unit that is not part of a valid high/low pair has no
 * UTF-8 representation. Each such unit is encoded as U+FFFD (REPLACEMENT
 * CHARACTER, bytes EF BF BD), which is also what the WHATWG `TextEncoder`
 * emits for the same input, instead of being written out as an ill-formed
 * three-byte sequence.
 *
 * Cribbed from the `goog.crypt.stringToUtf8ByteArray` function in the Google
 * Closure library, though updated to use typed arrays.
 *
 * @param input The string to encode.
 * @returns The UTF-8 encoding of `input`; empty for the empty string.
 */
export function fromUtf8(input: string): Uint8Array {
    const bytes: Array<number> = [];
    for (let i = 0, len = input.length; i < len; i++) {
        let value = input.charCodeAt(i);

        // 0xD800-0xDFFF: surrogate code units need pairing before encoding.
        if ((value & 0xf800) === 0xd800) {
            const isHighSurrogate = (value & 0xfc00) === 0xd800;
            const next = i + 1 < len ? input.charCodeAt(i + 1) : 0;
            if (isHighSurrogate && (next & 0xfc00) === 0xdc00) {
                // Combine the pair into one astral code point (4 bytes).
                const codePoint = 0x10000 +
                    ((value & 0b1111111111) << 10) +
                    (next & 0b1111111111);
                i++; // the low surrogate has been consumed
                bytes.push(
                    (codePoint >> 18) | 0b11110000,
                    ((codePoint >> 12) & 0b111111) | 0b10000000,
                    ((codePoint >> 6) & 0b111111) | 0b10000000,
                    (codePoint & 0b111111) | 0b10000000
                );
                continue;
            }

            // Unpaired surrogate: substitute the replacement character.
            value = 0xfffd;
        }

        if (value < 0x80) {
            // ASCII: one byte.
            bytes.push(value);
        } else if (value < 0x800) {
            // Two bytes: 110xxxxx 10xxxxxx.
            bytes.push(
                (value >> 6) | 0b11000000,
                (value & 0b111111) | 0b10000000
            );
        } else {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
            bytes.push(
                (value >> 12) | 0b11100000,
                ((value >> 6) & 0b111111) | 0b10000000,
                (value & 0b111111) | 0b10000000
            );
        }
    }

    return Uint8Array.from(bytes);
}
|
||
/**
 * Converts a typed array of bytes containing UTF-8 data into a native JS
 * string.
 *
 * The input is assumed to be well-formed UTF-8: lead bytes select how many
 * bytes are consumed, but continuation bytes are not validated. Four-byte
 * sequences are decoded via `decodeURIComponent`, so a malformed or truncated
 * four-byte sequence causes an exception to be thrown.
 *
 * Partly cribbed from the `goog.crypt.utf8ByteArrayToString` function in the
 * Google Closure library, though updated to use typed arrays and to better
 * handle astral plane code points.
 *
 * @param input The UTF-8 bytes to decode.
 * @returns The decoded string; empty for empty input.
 */
export function toUtf8(input: Uint8Array): string {
    let decoded = '';
    for (let i = 0, len = input.length; i < len; i++) {
        const byte = input[i];
        if (byte < 0x80) {
            // ASCII: one byte.
            decoded += String.fromCharCode(byte);
        } else if (0b11000000 <= byte && byte < 0b11100000) {
            // Two-byte sequence: 110xxxxx 10xxxxxx.
            const nextByte = input[++i];
            decoded += String.fromCharCode(
                (byte & 0b11111) << 6 | (nextByte & 0b111111)
            );
        } else if (byte >= 0b11110000) {
            // Four-byte sequence (astral plane). A byte cannot exceed 0xFF,
            // so no upper bound is needed on the lead byte here. Percent-
            // encode the four bytes and let decodeURIComponent build the
            // surrogate pair.
            const surrogatePair = [byte, input[++i], input[++i], input[++i]];
            const encoded = '%' + surrogatePair
                .map(byteValue => byteValue.toString(16))
                .join('%');
            decoded += decodeURIComponent(encoded);
        } else {
            // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
            decoded += String.fromCharCode(
                (byte & 0b1111) << 12 |
                (input[++i] & 0b111111) << 6 |
                (input[++i] & 0b111111)
            );
        }
    }

    return decoded;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/**
 * A declaration of the global TextEncoder and TextDecoder constructors.
 *
 * Only the two constructor interfaces are exported from the namespace; the
 * remaining interfaces exist to describe their signatures.
 *
 * @see https://encoding.spec.whatwg.org/
 */
namespace Encoding {
    /** Options accepted by the `TextDecoder` constructor. */
    interface TextDecoderOptions {
        fatal?: boolean;
        ignoreBOM?: boolean;
    }

    /** Options accepted by `TextDecoder#decode`. */
    interface TextDecodeOptions {
        stream?: boolean;
    }

    /** Instance shape of a decoder. */
    interface TextDecoder {
        readonly encoding: string;
        readonly fatal: boolean;
        readonly ignoreBOM: boolean;
        decode(
            input?: ArrayBuffer | ArrayBufferView,
            options?: TextDecodeOptions
        ): string;
    }

    /** Instance shape of an encoder. */
    interface TextEncoder {
        readonly encoding: 'utf-8';
        encode(input?: string): Uint8Array;
    }

    export interface TextDecoderConstructor {
        new (label?: string, options?: TextDecoderOptions): TextDecoder;
    }

    export interface TextEncoderConstructor {
        new (): TextEncoder;
    }
}

declare const TextDecoder: Encoding.TextDecoderConstructor;

declare const TextEncoder: Encoding.TextEncoderConstructor;
|
||
/**
 * Encodes a string to bytes using a freshly constructed global `TextEncoder`.
 *
 * @param input The string to encode.
 * @returns The bytes returned by `TextEncoder#encode`.
 */
export function fromUtf8(input: string): Uint8Array {
    const encoder = new TextEncoder();
    return encoder.encode(input);
}
|
||
/**
 * Decodes bytes to a string using a freshly constructed global `TextDecoder`
 * created with the 'utf-8' label.
 *
 * @param input The bytes to decode.
 * @returns The string returned by `TextDecoder#decode`.
 */
export function toUtf8(input: Uint8Array): string {
    const decoder = new TextDecoder('utf-8');
    return decoder.decode(input);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{ | ||
"name": "@aws/util-utf8-browser", | ||
"private": true, | ||
"version": "0.0.1", | ||
"description": "A browser UTF-8 string <-> Uint8Array converter", | ||
"main": "index.js", | ||
"scripts": { | ||
"prepublishOnly": "tsc", | ||
"pretest": "tsc", | ||
"test": "jest" | ||
}, | ||
"author": "[email protected]", | ||
"license": "UNLICENSED", | ||
"devDependencies": { | ||
"@types/jest": "^19.2.2", | ||
"@types/node": "^7.0.12", | ||
"jest": "^19.0.2", | ||
"typescript": "^2.3" | ||
} | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"compilerOptions": { | ||
"target": "es5", | ||
"module": "commonjs", | ||
"declaration": true, | ||
"sourceMap": true, | ||
"strict": true, | ||
"stripInternal": true | ||
} | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
/node_modules/ | ||
*.js | ||
*.js.map | ||
*.d.ts |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import {fromUtf8, toUtf8} from "../"; | ||
|
||
/**
 * Fixture table: each key is a sample string and each value is the byte
 * sequence of that string's UTF-8 encoding.
 *
 * The Arabic-script sample contains three zero-width non-joiners (U+200C,
 * bytes 226 128 140). They are invisible and easily stripped by editors or
 * copy/paste, so that key is written with explicit escapes; the escapes
 * correspond one-to-one with the byte array beside it.
 */
const utf8StringsToByteArrays: {[key: string]: Uint8Array} = {
    'ABC': new Uint8Array(['A'.charCodeAt(0), 'B'.charCodeAt(0), 'C'.charCodeAt(0)]),
    '🐎👱❤': new Uint8Array([240, 159, 144, 142, 240, 159, 145, 177, 226, 157, 164]),
    '☃💩': new Uint8Array([226, 152, 131, 240, 159, 146, 169]),
    'The rain in Spain falls mainly on the plain.': new Uint8Array([84, 104, 101, 32, 114, 97, 105, 110, 32, 105, 110, 32, 83, 112, 97, 105, 110, 32, 102, 97, 108, 108, 115, 32, 109, 97, 105, 110, 108, 121, 32, 111, 110, 32, 116, 104, 101, 32, 112, 108, 97, 105, 110, 46 ]),
    '\u062f\u0633\u062a\u200c\u0646\u0648\u0634\u062a\u0647\u200c\u0647\u0627 \u0646\u0645\u06cc\u200c\u0633\u0648\u0632\u0646\u062f': new Uint8Array([216, 175, 216, 179, 216, 170, 226, 128, 140, 217, 134, 217, 136, 216, 180, 216, 170, 217, 135, 226, 128, 140, 217, 135, 216, 167, 32, 217, 134, 217, 133, 219, 140, 226, 128, 140, 216, 179, 217, 136, 216, 178, 217, 134, 216, 175]),
    'Рукописи не горят': new Uint8Array([208, 160, 209, 131, 208, 186, 208, 190, 208, 191, 208, 184, 209, 129, 208, 184, 32, 208, 189, 208, 181, 32, 208, 179, 208, 190, 209, 128, 209, 143, 209, 130 ]),
};
|
||
// fromUtf8 turns a string into its UTF-8 bytes, i.e. it encodes; the test
// titles say so explicitly.
describe('fromUtf8', () => {
    for (const string of Object.keys(utf8StringsToByteArrays)) {
        it(`should UTF-8 encode "${string}" to the correct value`, () => {
            expect(fromUtf8(string)).toEqual(utf8StringsToByteArrays[string]);
        });
    }

    it('should throw when given a number', () => {
        expect(() => fromUtf8(255 as any)).toThrow();
    });
});
|
||
// toUtf8 turns UTF-8 encoded bytes back into a string; the bytes handed to it
// are the encoded form, which the test titles now state.
describe('toUtf8', () => {
    for (const string of Object.keys(utf8StringsToByteArrays)) {
        it(`should derive "${string}" from the UTF-8 encoded bytes`, () => {
            expect(toUtf8(utf8StringsToByteArrays[string])).toBe(string);
        });
    }

    it('should throw when given a number', () => {
        expect(() => toUtf8(255 as any)).toThrow();
    });
});
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this code taken from the Buffer module?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is mostly cribbed from https://github.com/google/closure-library/blob/master/closure/goog/crypt/crypt.js#L110, though I modernized it a bit. I should include a mention of that in a comment.