Skip to content

Feature/utf8 support #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 12, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ jspm_packages
.yarn-integrity

lerna-debug.log
packages/*/package-lock.json
4 changes: 4 additions & 0 deletions packages/util-utf8-browser/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/node_modules/
*.js
*.js.map
*.d.ts
70 changes: 70 additions & 0 deletions packages/util-utf8-browser/__tests__/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import {fromUtf8, toUtf8} from '../';

// Swap the pure-JS codec for jest spies that return empty values, so the tests
// below can count how often the entry point delegates to it.
jest.mock('../lib/pureJs', () => {
    const fromUtf8 = jest.fn(() => new Uint8Array(0));
    const toUtf8 = jest.fn(() => '');

    return {fromUtf8, toUtf8};
});
import {
fromUtf8 as jsFromUtf8,
toUtf8 as jsToUtf8,
} from '../lib/pureJs';

// Swap the Encoding API codec for jest spies that return empty values, so the
// tests below can count how often the entry point delegates to it.
jest.mock('../lib/whatwgEncodingApi', () => {
    const fromUtf8 = jest.fn(() => new Uint8Array(0));
    const toUtf8 = jest.fn(() => '');

    return {fromUtf8, toUtf8};
});
import {
fromUtf8 as textEncoderFromUtf8,
toUtf8 as textEncoderToUtf8,
} from '../lib/whatwgEncodingApi';

// Forget the calls recorded on every mocked codec function before each test.
beforeEach(() => {
    const mockedCodecs: Array<any> = [
        jsFromUtf8,
        jsToUtf8,
        textEncoderFromUtf8,
        textEncoderToUtf8,
    ];

    for (const mocked of mockedCodecs) {
        mocked.mockClear();
    }
});

describe('fromUtf8', () => {
    // Number of invocations recorded by a jest mock function.
    const timesCalled = (mocked: any): number => mocked.mock.calls.length;

    it('should use the Encoding API if available', () => {
        // Any function-valued global is enough to satisfy the feature check.
        const globalScope = global as any;
        globalScope.TextEncoder = jest.fn();

        fromUtf8('foo');

        expect(timesCalled(textEncoderFromUtf8)).toBe(1);
        expect(timesCalled(jsFromUtf8)).toBe(0);
    });

    it('should use a JS implementation otherwise', () => {
        const globalScope = global as any;
        delete globalScope.TextEncoder;

        fromUtf8('foo');

        expect(timesCalled(textEncoderFromUtf8)).toBe(0);
        expect(timesCalled(jsFromUtf8)).toBe(1);
    });
});

describe('toUtf8', () => {
    // Number of invocations recorded by a jest mock function.
    const timesCalled = (mocked: any): number => mocked.mock.calls.length;

    it('should use the Encoding API if available', () => {
        // Any function-valued global is enough to satisfy the feature check.
        const globalScope = global as any;
        globalScope.TextDecoder = jest.fn();

        toUtf8(new Uint8Array(0));

        expect(timesCalled(textEncoderToUtf8)).toBe(1);
        expect(timesCalled(jsToUtf8)).toBe(0);
    });

    it('should use a JS implementation otherwise', () => {
        const globalScope = global as any;
        delete globalScope.TextDecoder;

        toUtf8(new Uint8Array(0));

        expect(timesCalled(textEncoderToUtf8)).toBe(0);
        expect(timesCalled(jsToUtf8)).toBe(1);
    });
});
26 changes: 26 additions & 0 deletions packages/util-utf8-browser/__tests__/pureJs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import {fromUtf8, toUtf8} from '../lib/pureJs';

const utf8StringsToByteArrays: {[key: string]: Uint8Array} = {
'ABC': new Uint8Array(['A'.charCodeAt(0), 'B'.charCodeAt(0), 'C'.charCodeAt(0)]),
'🐎👱❤': new Uint8Array([240, 159, 144, 142, 240, 159, 145, 177, 226, 157, 164]),
'☃💩': new Uint8Array([226, 152, 131, 240, 159, 146, 169]),
'The rain in Spain falls mainly on the plain.': new Uint8Array([84, 104, 101, 32, 114, 97, 105, 110, 32, 105, 110, 32, 83, 112, 97, 105, 110, 32, 102, 97, 108, 108, 115, 32, 109, 97, 105, 110, 108, 121, 32, 111, 110, 32, 116, 104, 101, 32, 112, 108, 97, 105, 110, 46 ]),
'دست‌نوشته‌ها نمی‌سوزند': new Uint8Array([216, 175, 216, 179, 216, 170, 226, 128, 140, 217, 134, 217, 136, 216, 180, 216, 170, 217, 135, 226, 128, 140, 217, 135, 216, 167, 32, 217, 134, 217, 133, 219, 140, 226, 128, 140, 216, 179, 217, 136, 216, 178, 217, 134, 216, 175]),
'Рукописи не горят': new Uint8Array([208, 160, 209, 131, 208, 186, 208, 190, 208, 191, 208, 184, 209, 129, 208, 184, 32, 208, 189, 208, 181, 32, 208, 179, 208, 190, 209, 128, 209, 143, 209, 130 ]),
};

describe('fromUtf8', () => {
    // One generated test per fixture: the string key must convert to its byte array.
    Object.keys(utf8StringsToByteArrays).forEach(text => {
        it(`should UTF-8 decode "${text}" to the correct value`, () => {
            const expectedBytes = utf8StringsToByteArrays[text];

            expect(fromUtf8(text)).toEqual(expectedBytes);
        });
    });
});

describe('toUtf8', () => {
    // One generated test per fixture: the byte array must convert back to its string key.
    Object.keys(utf8StringsToByteArrays).forEach(text => {
        it(`should derive "${text}" from the UTF-8 decoded bytes`, () => {
            const sourceBytes = utf8StringsToByteArrays[text];

            expect(toUtf8(sourceBytes)).toBe(text);
        });
    });
});
51 changes: 51 additions & 0 deletions packages/util-utf8-browser/__tests__/whatwgEncodingApi.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import {
fromUtf8,
toUtf8,
} from '../lib/whatwgEncodingApi';

// Install fresh fake Encoding API constructors on the global object before
// each test. Each mocked constructor returns a single stub instance whose
// method is itself a jest mock.
beforeEach(() => {
    const globalScope = global as any;

    const fakeDecoder = {decode: jest.fn(() => '')};
    globalScope.TextDecoder = jest.fn(() => fakeDecoder);

    const fakeEncoder = {encode: jest.fn(() => new Uint8Array(0))};
    globalScope.TextEncoder = jest.fn(() => fakeEncoder);
});

// Minimal typings for the two globals this test file reads. Instances are
// typed `any` because the tests reach into jest mock state
// (`decoder.decode.mock`, `encoder.encode.mock`) on them.
interface TextDecoderCtor {
new (): any;
}
interface TextEncoderCtor {
new (): any;
}
// Ambient declarations so the bare identifiers `TextDecoder` / `TextEncoder`
// type-check inside this module.
declare const TextDecoder: TextDecoderCtor;
declare const TextEncoder: TextEncoderCtor;

describe('WHATWG encoding spec compliant environment UTF-8 handling', () => {
    // Number of invocations recorded by a jest mock function.
    const timesCalled = (mocked: any): number => mocked.mock.calls.length;

    it('should use the global TextDecoder to decode UTF-8', () => {
        // The mocked constructor returns the same stub on every call, so this
        // is the very object toUtf8 will receive; the construction is then
        // wiped from the call log.
        const sharedDecoder = new TextDecoder();
        (TextDecoder as any).mockClear();
        expect(timesCalled(TextDecoder)).toBe(0);

        toUtf8(new Uint8Array(0));

        expect(timesCalled(TextDecoder)).toBe(1);
        expect(timesCalled(sharedDecoder.decode)).toBe(1);
    });

    it('should use the global TextEncoder to encode UTF-8', () => {
        // Same approach as above, for the encoder stub.
        const sharedEncoder = new TextEncoder();
        (TextEncoder as any).mockClear();
        expect(timesCalled(TextEncoder)).toBe(0);

        fromUtf8('string');

        expect(timesCalled(TextEncoder)).toBe(1);
        expect(timesCalled(sharedEncoder.encode)).toBe(1);
    });
});
27 changes: 27 additions & 0 deletions packages/util-utf8-browser/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import {
fromUtf8 as jsFromUtf8,
toUtf8 as jsToUtf8,
} from './lib/pureJs';
import {
fromUtf8 as textEncoderFromUtf8,
toUtf8 as textEncoderToUtf8,
} from './lib/whatwgEncodingApi';

// Ambient declarations for the Encoding API globals. They are typed as
// possibly undefined because the exported functions in this module probe for
// them with `typeof ... === 'function'` before choosing an implementation.
declare const TextDecoder: Function|undefined;
declare const TextEncoder: Function|undefined;

/**
 * Converts a string into a Uint8Array, delegating to the Encoding API backed
 * implementation when a global `TextEncoder` function exists and to the
 * pure-JS implementation otherwise.
 *
 * @param input The string to convert.
 * @returns Whatever the selected implementation returns for `input`.
 */
export function fromUtf8(input: string): Uint8Array {
    const hasTextEncoder = typeof TextEncoder === 'function';

    return hasTextEncoder ? textEncoderFromUtf8(input) : jsFromUtf8(input);
}

/**
 * Converts a Uint8Array into a string, delegating to the Encoding API backed
 * implementation when a global `TextDecoder` function exists and to the
 * pure-JS implementation otherwise.
 *
 * @param input The bytes to convert.
 * @returns Whatever the selected implementation returns for `input`.
 */
export function toUtf8(input: Uint8Array): string {
    const hasTextDecoder = typeof TextDecoder === 'function';

    return hasTextDecoder ? textEncoderToUtf8(input) : jsToUtf8(input);
}
80 changes: 80 additions & 0 deletions packages/util-utf8-browser/lib/pureJs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/**
 * Converts a JS string from its native UCS-2/UTF-16 representation into a
 * Uint8Array of the bytes used to represent the equivalent characters in UTF-8.
 *
 * Cribbed from the `goog.crypt.stringToUtf8ByteArray` function in the Google
 * Closure library, though updated to use typed arrays.
 *
 * @param input The string to convert; it is read one UTF-16 code unit at a
 *              time via `charCodeAt`.
 * @returns A new Uint8Array built from the accumulated byte values.
 */
export function fromUtf8(input: string): Uint8Array {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this code taken from the Buffer module?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is mostly cribbed from https://github.com/google/closure-library/blob/master/closure/goog/crypt/crypt.js#L110, though I modernized it a bit. I should include a mention of that in a comment.

// Byte values are collected in a plain array because the output length is
// not known until the whole string has been scanned.
const bytes: Array<number> = [];
for (let i = 0, len = input.length; i < len; i++) {
const value = input.charCodeAt(i);
if (value < 0x80) {
// Below 0x80: the code unit is emitted unchanged as a single byte.
bytes.push(value);
} else if (value < 0x800) {
// Below 0x800: two bytes, a 0b110xxxxx lead byte followed by one
// 0b10xxxxxx continuation byte carrying the low six bits.
bytes.push(
(value >> 6) | 0b11000000,
(value & 0b111111) | 0b10000000
);
} else if (
i + 1 < input.length &&
((value & 0xfc00) === 0xd800) &&
((input.charCodeAt(i + 1) & 0xfc00) === 0xdc00)
) {
// A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
// (0xDC00-0xDFFF): merge the two 10-bit payloads into one code point at or
// above 0x10000. The `++i` consumes the low surrogate so the loop does not
// visit it again.
const surrogatePair = 0x10000 +
((value & 0b1111111111) << 10) +
(input.charCodeAt(++i) & 0b1111111111);
// Four bytes: a 0b11110xxx lead byte and three continuation bytes.
bytes.push(
(surrogatePair >> 18) | 0b11110000,
((surrogatePair >> 12) & 0b111111) | 0b10000000,
((surrogatePair >> 6) & 0b111111) | 0b10000000,
(surrogatePair & 0b111111) | 0b10000000
);
} else {
// Every other code unit at or above 0x800 becomes three bytes. A surrogate
// without a matching partner also lands here, since it fails the pair test
// above, and is encoded as-is.
bytes.push(
(value >> 12) | 0b11100000,
((value >> 6) & 0b111111) | 0b10000000,
(value & 0b111111) | 0b10000000,
);
}
}

return Uint8Array.from(bytes);
}

/**
 * Converts a typed array of bytes containing UTF-8 data into a native JS
 * string.
 *
 * Implements the UTF-8 decoder state machine of the WHATWG Encoding Standard.
 * Well-formed sequences decode to their code points. Every malformed
 * sequence (stray continuation byte, overlong form, encoded surrogate, lead
 * byte above 0xF4, sequence cut short by another byte or by the end of the
 * input) is replaced by U+FFFD rather than producing arbitrary code units or
 * throwing. A byte order mark at the start of the input is kept in the
 * output.
 *
 * @param input The UTF-8 bytes to decode.
 * @returns The decoded string; code points above U+FFFF are emitted as
 *          UTF-16 surrogate pairs.
 */
export function toUtf8(input: Uint8Array): string {
    const REPLACEMENT_CHARACTER = '\uFFFD';

    let decoded = '';
    // Code point accumulated so far for the sequence being decoded.
    let codePoint = 0;
    // Continuation bytes still expected for the current sequence.
    let bytesNeeded = 0;
    // Allowed range for the next continuation byte. It is narrowed after
    // certain lead bytes to reject overlong forms, surrogates and values
    // above U+10FFFF, then reset to the generic 0x80-0xBF range.
    let lowerBoundary = 0x80;
    let upperBoundary = 0xbf;

    for (let i = 0, len = input.length; i < len; i++) {
        const byte = input[i];

        if (bytesNeeded === 0) {
            if (byte < 0x80) {
                decoded += String.fromCharCode(byte);
            } else if (0xc2 <= byte && byte <= 0xdf) {
                // 0xC0 and 0xC1 are excluded: they could only start overlong forms.
                bytesNeeded = 1;
                codePoint = byte & 0b11111;
            } else if (0xe0 <= byte && byte <= 0xef) {
                if (byte === 0xe0) {
                    lowerBoundary = 0xa0; // below this would be overlong
                } else if (byte === 0xed) {
                    upperBoundary = 0x9f; // above this would be a surrogate
                }
                bytesNeeded = 2;
                codePoint = byte & 0b1111;
            } else if (0xf0 <= byte && byte <= 0xf4) {
                if (byte === 0xf0) {
                    lowerBoundary = 0x90; // below this would be overlong
                } else if (byte === 0xf4) {
                    upperBoundary = 0x8f; // above this would exceed U+10FFFF
                }
                bytesNeeded = 3;
                codePoint = byte & 0b111;
            } else {
                // Stray continuation byte or a byte that can never be a lead.
                decoded += REPLACEMENT_CHARACTER;
            }
            continue;
        }

        if (byte < lowerBoundary || byte > upperBoundary) {
            // The sequence in progress is broken. Emit one replacement for
            // it, reset the state, and step back so this byte is examined
            // again as a potential lead byte.
            decoded += REPLACEMENT_CHARACTER;
            codePoint = 0;
            bytesNeeded = 0;
            lowerBoundary = 0x80;
            upperBoundary = 0xbf;
            i--;
            continue;
        }

        lowerBoundary = 0x80;
        upperBoundary = 0xbf;
        codePoint = (codePoint << 6) | (byte & 0b111111);
        bytesNeeded--;

        if (bytesNeeded === 0) {
            if (codePoint > 0xffff) {
                // Split an astral code point into a UTF-16 surrogate pair.
                const offset = codePoint - 0x10000;
                decoded += String.fromCharCode(
                    0xd800 + (offset >> 10),
                    0xdc00 + (offset & 0b1111111111)
                );
            } else {
                decoded += String.fromCharCode(codePoint);
            }
            codePoint = 0;
        }
    }

    // Input ended in the middle of a multi-byte sequence.
    if (bytesNeeded !== 0) {
        decoded += REPLACEMENT_CHARACTER;
    }

    return decoded;
}
50 changes: 50 additions & 0 deletions packages/util-utf8-browser/lib/whatwgEncodingApi.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/**
 * A declaration of the global TextEncoder and TextDecoder constructors.
 *
 * Only the two constructor interfaces are exported from the namespace; the
 * option and instance interfaces are internal details of those signatures.
 *
 * @see https://encoding.spec.whatwg.org/
 */
namespace Encoding {
// Options accepted by the TextDecoder constructor.
interface TextDecoderOptions {
fatal?: boolean;
ignoreBOM?: boolean;
}

// Per-call options accepted by TextDecoder#decode.
interface TextDecodeOptions {
stream?: boolean;
}

// Instance shape produced by the TextDecoder constructor.
interface TextDecoder {
readonly encoding: string;
readonly fatal: boolean;
readonly ignoreBOM: boolean;
decode(
input?: ArrayBuffer|ArrayBufferView,
options?: TextDecodeOptions
): string;
}

export interface TextDecoderConstructor {
new (label?: string, options?: TextDecoderOptions): TextDecoder;
}

// Instance shape produced by the TextEncoder constructor; its `encoding`
// property is typed as the literal 'utf-8'.
interface TextEncoder {
readonly encoding: 'utf-8';
encode(input?: string): Uint8Array;
}

export interface TextEncoderConstructor {
new (): TextEncoder;
}
}

// Ambient declarations: these names are not defined in this module, only
// given the constructor types declared above.
declare const TextDecoder: Encoding.TextDecoderConstructor;

declare const TextEncoder: Encoding.TextEncoderConstructor;

/**
 * Encodes a string into bytes using a freshly constructed global
 * `TextEncoder`.
 *
 * @param input The string to encode.
 * @returns The Uint8Array returned by `TextEncoder#encode`.
 */
export function fromUtf8(input: string): Uint8Array {
    const encoder = new TextEncoder();

    return encoder.encode(input);
}

/**
 * Decodes bytes into a string using a freshly constructed global
 * `TextDecoder` created with the 'utf-8' label.
 *
 * @param input The bytes to decode.
 * @returns The string returned by `TextDecoder#decode`.
 */
export function toUtf8(input: Uint8Array): string {
    const decoder = new TextDecoder('utf-8');

    return decoder.decode(input);
}
20 changes: 20 additions & 0 deletions packages/util-utf8-browser/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"name": "@aws/util-utf8-browser",
"private": true,
"version": "0.0.1",
"description": "A browser UTF-8 string <-> UInt8Array converter",
"main": "index.js",
"scripts": {
"prepublishOnly": "tsc",
"pretest": "tsc",
"test": "jest"
},
"author": "[email protected]",
"license": "UNLICENSED",
"devDependencies": {
"@types/jest": "^19.2.2",
"@types/node": "^7.0.12",
"jest": "^19.0.2",
"typescript": "^2.3"
}
}
10 changes: 10 additions & 0 deletions packages/util-utf8-browser/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"compilerOptions": {
"target": "es5",
"module": "commonjs",
"declaration": true,
"sourceMap": true,
"strict": true,
"stripInternal": true
}
}
4 changes: 4 additions & 0 deletions packages/util-utf8-node/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/node_modules/
*.js
*.js.map
*.d.ts
34 changes: 34 additions & 0 deletions packages/util-utf8-node/__tests__/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import {fromUtf8, toUtf8} from "../";

const utf8StringsToByteArrays: {[key: string]: Uint8Array} = {
'ABC': new Uint8Array(['A'.charCodeAt(0), 'B'.charCodeAt(0), 'C'.charCodeAt(0)]),
'🐎👱❤': new Uint8Array([240, 159, 144, 142, 240, 159, 145, 177, 226, 157, 164]),
'☃💩': new Uint8Array([226, 152, 131, 240, 159, 146, 169]),
'The rain in Spain falls mainly on the plain.': new Uint8Array([84, 104, 101, 32, 114, 97, 105, 110, 32, 105, 110, 32, 83, 112, 97, 105, 110, 32, 102, 97, 108, 108, 115, 32, 109, 97, 105, 110, 108, 121, 32, 111, 110, 32, 116, 104, 101, 32, 112, 108, 97, 105, 110, 46 ]),
'دست‌نوشته‌ها نمی‌سوزند': new Uint8Array([216, 175, 216, 179, 216, 170, 226, 128, 140, 217, 134, 217, 136, 216, 180, 216, 170, 217, 135, 226, 128, 140, 217, 135, 216, 167, 32, 217, 134, 217, 133, 219, 140, 226, 128, 140, 216, 179, 217, 136, 216, 178, 217, 134, 216, 175]),
'Рукописи не горят': new Uint8Array([208, 160, 209, 131, 208, 186, 208, 190, 208, 191, 208, 184, 209, 129, 208, 184, 32, 208, 189, 208, 181, 32, 208, 179, 208, 190, 209, 128, 209, 143, 209, 130 ]),
};

describe('fromUtf8', () => {
    // One generated test per fixture: the string key must convert to its byte array.
    Object.keys(utf8StringsToByteArrays).forEach(text => {
        it(`should UTF-8 decode "${text}" to the correct value`, () => {
            const expectedBytes = utf8StringsToByteArrays[text];

            expect(fromUtf8(text)).toEqual(expectedBytes);
        });
    });

    it('should throw when given a number', () => {
        // The cast bypasses the compile-time parameter type on purpose.
        const callWithNumber = () => fromUtf8(255 as any);

        expect(callWithNumber).toThrow();
    });
});

describe('toUtf8', () => {
    // One generated test per fixture: the byte array must convert back to its string key.
    Object.keys(utf8StringsToByteArrays).forEach(text => {
        it(`should derive "${text}" from the UTF-8 decoded bytes`, () => {
            const sourceBytes = utf8StringsToByteArrays[text];

            expect(toUtf8(sourceBytes)).toBe(text);
        });
    });

    it('should throw when given a number', () => {
        // The cast bypasses the compile-time parameter type on purpose.
        const callWithNumber = () => toUtf8(255 as any);

        expect(callWithNumber).toThrow();
    });
});
Loading