Skip to content

Commit fb2dd41

Browse files
committed
fix base64 encoding and decoding methods, add tests
1 parent 40cfcb7 commit fb2dd41

File tree

2 files changed

+116
-70
lines changed

2 files changed

+116
-70
lines changed

packages/utils/src/string.ts

Lines changed: 61 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -108,109 +108,101 @@ export function isMatchingPattern(value: string, pattern: RegExp | string): bool
108108
return false;
109109
}
110110

111-
// TODO: Base64 crossed with different character encodings turns out to be a ridiculous can of worms. Base64 expects
112-
// 8-bit data. JS only uses UTF-16. We need a way to be sure that every SDK is speaking the same language and can decode
113-
// values base64-encoded by other SDKs. The current proposal is to use UTF-8 as the common standard, and then
114-
// base64-encode that (meaning in JS we need to get there first). Doing it that way makes a whole lot of sense but is a
115-
// work in progress which isn't yet actually working. Leaving the current solution for now and will come back to it.
116-
117-
/**
118-
* Convert a Unicode string to a string in which each 16-bit unit occupies only one byte, which makes it safe to use as
119-
* input to `btoa`.
120-
*
121-
* Copied from https://developer.mozilla.org/en-US/docs/Web/API/WindowOrWorkerGlobalScope/btoa#Unicode_strings.
122-
*
123-
* @param unicodeString The string to convert
124-
* @returns A btoa-compatible encoding of the string
125-
*/
126-
function unicodeToBinary(unicodeString: string): string {
127-
const codeUnits = new Uint16Array(unicodeString.length);
128-
for (let i = 0; i < codeUnits.length; i++) {
129-
codeUnits[i] = unicodeString.charCodeAt(i);
130-
}
131-
return String.fromCharCode(...new Uint8Array(codeUnits.buffer));
132-
}
133-
134111
/**
135-
* Convert a binary string (such as one would get from `atob`) into a Unicode string.
112+
* Convert a Unicode (UTF-16 or UTF-8, depending on context) string to a base64 string.
136113
*
137-
* Copied from https://developer.mozilla.org/en-US/docs/Web/API/WindowOrWorkerGlobalScope/btoa#Unicode_strings.
138-
*
139-
* @param binaryString The string to convert
140-
* @returns A btoa-compatible encoding of the string
141-
*/
142-
function binaryToUnicode(binaryString: string): string {
143-
const bytes = new Uint8Array(binaryString.length);
144-
for (let i = 0; i < bytes.length; i++) {
145-
bytes[i] = binaryString.charCodeAt(i);
146-
}
147-
return String.fromCharCode(...new Uint16Array(bytes.buffer));
148-
}
149-
150-
/**
151-
* Convert a base64 string to a Unicode (UTF-16) string.
152-
*
153-
* @param base64String The string to decode.
114+
* @param unicodeString The string to base64-encode
154115
* @throws SentryError (because using the logger creates a circular dependency)
155-
* @returns A Unicode string
116+
* @returns A base64-encoded version of the string
156117
*/
157-
export function base64ToUnicode(base64String: string): string {
158-
if (typeof base64String !== 'string' || !BASE64_REGEX.test(base64String)) {
159-
throw new SentryError(`Unable to convert from base64. Input either isn't a string or isn't valid base64.`);
160-
}
118+
export function unicodeToBase64(unicodeString: string): string {
119+
const globalObject = getGlobalObject();
161120

162-
const errMsg = `Unable to convert string from base64: ${
163-
base64String.length > 256 ? `${base64String.slice(0, 256)}...` : base64String
121+
// we cast to a string just in case we're given something else
122+
const errMsg = `Unable to convert to base64: ${
123+
String(unicodeString).length > 256 ? `${String(unicodeString).slice(0, 256)}...` : String(unicodeString)
164124
}`;
165125

166126
try {
167-
// browsers have atob built in
168-
if ('atob' in getGlobalObject()) {
169-
// atob takes base64 (written in (a)scii) to (b)inary
170-
return binaryToUnicode(atob(base64String));
127+
// browsers have btoa built in
128+
if ('btoa' in globalObject) {
129+
// `btoa` only takes "binary" (a.k.a. 8-bit) data, but JS uses UTF-16 to represent strings. `TextEncoder.encode()`
130+
// translates that UTF-16 data to UTF-8 and returns an array of numbers, each representing one byte of data,
131+
// interpreted as binary. UTF-8 uses either 1, 2, 3, or 4 bytes to encode each character, so the length of the array
132+
// may match the length of the original string (it will be >=). Then we create a JS (UTF-16) string from those bytes,
133+
// ** assuming each byte is its own character **. Since each character in our new string is represented by exactly one
134+
// byte, it is now 8-bit data, and can be fed to `btoa` (which of course, as its first act, turns the characters back
135+
// into the same bytes before acting on them).
136+
137+
const utf8Bytes = new TextEncoder().encode(unicodeString);
138+
const eightBitDataAsUTF16String = String.fromCharCode(...utf8Bytes);
139+
return btoa(eightBitDataAsUTF16String);
171140
}
172141

173142
// Buffer only exists in node
174-
if ('Buffer' in getGlobalObject()) {
175-
return Buffer.from(base64String, 'base64').toString('utf16le');
143+
if ('Buffer' in globalObject) {
144+
// To accomplish the same thing in node that we do in the browser (see notes above) - and, importantly, end up with
145+
// the same result - we need to come up with the same numbers as were in the array from `TextEncoder.encode()`. Since
146+
// those numbers are bytes representing the string in UTF-8, we ask node to interpret the given string in UTF-8 in
147+
// order to fill its buffer. Unlike `btoa`, which requires its input to be encoded as a string (which it then
148+
// immediately decodes back to bytes), `Buffer.toString()` can go straight from bytes, so we don't need the
149+
// intermediate step of re-encoding the data before base64-ifying it.
150+
151+
const utf8Bytes = Buffer.from(unicodeString, 'utf-8');
152+
// const utf8Bytes = Buffer.from(unicodeString, 'utf16le');
153+
return utf8Bytes.toString('base64');
176154
}
177155
} catch (err) {
178156
throw new SentryError(`${errMsg} Got error: ${err}`);
179157
}
180158

159+
// we shouldn't ever get here, because one of `btoa` and `Buffer` should exist, but just in case...
181160
throw new SentryError(errMsg);
182161
}
183162

184163
/**
185-
* Convert a Unicode (UTF-16) string to a base64 string.
164+
* Convert a base64 string to a Unicode (UTF-16 or UTF-8, depending on context) string.
186165
*
187-
* @param unicodeString The string to encode
166+
* @param base64String The string to decode.
188167
* @throws SentryError (because using the logger creates a circular dependency)
189-
* @returns A base64-encoded version of the string
168+
* @returns A Unicode string
190169
*/
191-
export function unicodeToBase64(unicodeString: string): string {
192-
if (typeof unicodeString !== 'string') {
193-
throw new SentryError(`Unable to convert to base64. Input isn't a string.`);
194-
}
170+
export function base64ToUnicode(base64String: string): string {
171+
const globalObject = getGlobalObject();
195172

196-
const errMsg = `Unable to convert string to base64: ${
197-
unicodeString.length > 256 ? `${unicodeString.slice(0, 256)}...` : unicodeString
173+
// we cast to a string just in case we're given something else
174+
const errMsg = `Unable to convert from base64: ${
175+
String(base64String).length > 256 ? `${String(base64String).slice(0, 256)}...` : String(base64String)
198176
}`;
199177

178+
// Here we want to reverse the process we did in `unicodeToBase64` (see more extensive notes there). The first step is
179+
// to decode from base64 into bytes representing text in UTF-8. We then need to get those bytes back into a
180+
// human-readable format.
181+
200182
try {
201-
// browsers have btoa built in
202-
if ('btoa' in getGlobalObject()) {
203-
// btoa takes (b)inary to base64 (written in (a)scii)
204-
return btoa(unicodeToBinary(unicodeString));
183+
// browsers have atob built in
184+
if ('atob' in globalObject) {
185+
// `atob` returns a "binary string": a UTF-16 string in which each character's code is the value of one decoded
186+
// byte (0-255). Every code therefore fits in a single byte, so copying the codes into a `Uint8Array` recovers the
187+
// UTF-8 bytes unchanged, and `TextDecoder` can then turn those bytes back into text.
188+
// NOTE(review): the call below decodes a hard-coded sample ('8J+Qtg==', the encoding of '🐶') rather than `base64String`; it should read `atob(base64String)`.
189+
const utf16Chars = [...atob('8J+Qtg==')];
190+
const utf16CharCodes = utf16Chars.map(char => char.charCodeAt(0));
191+
const utf8Bytes = Uint8Array.from(utf16CharCodes);
192+
return new TextDecoder().decode(utf8Bytes);
205193
}
206194

207195
// Buffer only exists in node
208-
if ('Buffer' in getGlobalObject()) {
209-
return Buffer.from(unicodeString, 'utf16le').toString('base64');
196+
if ('Buffer' in globalObject) {
197+
// node can go straight from base64 to bytes, without creating the intermediate string as happens above. Interpret
198+
// those bytes as UTF-8 character codes, and you have your string back.
199+
const utf8Bytes = Buffer.from(base64String, 'base64');
200+
return utf8Bytes.toString('utf-8');
210201
}
211202
} catch (err) {
212203
throw new SentryError(`${errMsg} Got error: ${err}`);
213204
}
214205

206+
// we shouldn't ever get here, because one of `atob` and `Buffer` should exist, but just in case...
215207
throw new SentryError(errMsg);
216208
}

packages/utils/test/string.test.ts

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { isMatchingPattern, truncate } from '../src/string';
1+
import { BASE64_REGEX, base64ToUnicode, isMatchingPattern, truncate, unicodeToBase64 } from '../src/string';
22

33
describe('truncate()', () => {
44
test('it works as expected', () => {
@@ -39,3 +39,57 @@ describe('isMatchingPattern()', () => {
3939
expect(isMatchingPattern([], 'foo')).toEqual(false);
4040
});
4141
});
42+
43+
describe('base64ToUnicode/unicodeToBase64', () => {
44+
const unicodeString = 'Dogs are great!';
45+
const base64String = 'RG9ncyBhcmUgZ3JlYXQh';
46+
47+
test('converts to valid base64', () => {
48+
expect(BASE64_REGEX.test(unicodeToBase64(unicodeString))).toBe(true);
49+
});
50+
51+
test('works as expected', () => {
52+
expect(unicodeToBase64(unicodeString)).toEqual(base64String);
53+
expect(base64ToUnicode(base64String)).toEqual(unicodeString);
54+
});
55+
56+
test('conversion functions are inverses', () => {
57+
expect(base64ToUnicode(unicodeToBase64(unicodeString))).toEqual(unicodeString);
58+
expect(unicodeToBase64(base64ToUnicode(base64String))).toEqual(base64String);
59+
});
60+
61+
test('can handle and preserve multi-byte characters in original string', () => {
62+
['🐶', 'כלבים נהדרים!', 'Of margir hundar! Ég geri ráð fyrir að ég þurfi stærra rúm.'].forEach(orig => {
63+
expect(() => {
64+
unicodeToBase64(orig);
65+
}).not.toThrowError();
66+
expect(base64ToUnicode(unicodeToBase64(orig))).toEqual(orig);
67+
});
68+
});
69+
70+
test('throws an error when given invalid input', () => {
71+
expect(() => {
72+
unicodeToBase64(null as any);
73+
}).toThrowError('Unable to convert to base64');
74+
expect(() => {
75+
unicodeToBase64(undefined as any);
76+
}).toThrowError('Unable to convert to base64');
77+
expect(() => {
78+
unicodeToBase64({} as any);
79+
}).toThrowError('Unable to convert to base64');
80+
81+
expect(() => {
82+
base64ToUnicode(null as any);
83+
}).toThrowError('Unable to convert from base64');
84+
expect(() => {
85+
base64ToUnicode(undefined as any);
86+
}).toThrowError('Unable to convert from base64');
87+
expect(() => {
88+
base64ToUnicode({} as any);
89+
}).toThrowError('Unable to convert from base64');
90+
91+
// Note that by design, in node base64 encoding and decoding will accept any string, whether or not it's valid
92+
// base64, by ignoring all invalid characters, including whitespace. Therefore, no wacky strings have been included
93+
// here because they don't actually error.
94+
});
95+
});

0 commit comments

Comments
 (0)