fix base64 encoding and decoding methods, add tests

lobsterkatie · lobsterkatie · commit fb2dd41eb2cd · 2020-12-10T17:25:32.000-08:00
diff --git a/packages/utils/src/string.ts b/packages/utils/src/string.ts
@@ -108,109 +108,101 @@ export function isMatchingPattern(value: string, pattern: RegExp | string): bool
   return false;
 }
 
-// TODO: Base64 crossed with different character encodings turns out to be a ridiculous can of worms. Base64 expects
-// 8-bit data. JS only uses UTF-16. We need a way to be sure that every SDK is speaking the same language and can decode
-// values base64-encoded by other SDKs. The current proposal is to use UTF-8 as the common standard, and then
-// base64-encode that (meaning in JS we need to get there first). Doing it that way makes a whole lot of sense but is a
-// work in progress which isn't yet actually working. Leaving the current solution for now and will come back to it.
-
-/**
- * Convert a Unicode string to a string in which each 16-bit unit occupies only one byte, which makes it safe to use as
- * input to `btoa`.
- *
- * Copied from https://developer.mozilla.org/en-US/docs/Web/API/WindowOrWorkerGlobalScope/btoa#Unicode_strings.
- *
- * @param unicodeString The string to convert
- * @returns A btoa-compatible encoding of the string
- */
-function unicodeToBinary(unicodeString: string): string {
-  const codeUnits = new Uint16Array(unicodeString.length);
-  for (let i = 0; i < codeUnits.length; i++) {
-    codeUnits[i] = unicodeString.charCodeAt(i);
-  }
-  return String.fromCharCode(...new Uint8Array(codeUnits.buffer));
-}
-
 /**
- * Convert a binary string (such as one would get from `atob`) into a Unicode string.
+ * Convert a Unicode (UTF-16 or UTF-8, depending on context) string to a base64 string.
  *
- * Copied from https://developer.mozilla.org/en-US/docs/Web/API/WindowOrWorkerGlobalScope/btoa#Unicode_strings.
- *
- * @param binaryString The string to convert
- * @returns A btoa-compatible encoding of the string
- */
-function binaryToUnicode(binaryString: string): string {
-  const bytes = new Uint8Array(binaryString.length);
-  for (let i = 0; i < bytes.length; i++) {
-    bytes[i] = binaryString.charCodeAt(i);
-  }
-  return String.fromCharCode(...new Uint16Array(bytes.buffer));
-}
-
-/**
- * Convert a base64 string to a Unicode (UTF-16) string.
- *
- * @param base64String The string to decode.
+ * @param unicodeString The string to base64-encode
  * @throws SentryError (because using the logger creates a circular dependency)
- * @returns A Unicode string
+ * @returns A base64-encoded version of the string
  */
-export function base64ToUnicode(base64String: string): string {
-  if (typeof base64String !== 'string' || !BASE64_REGEX.test(base64String)) {
-    throw new SentryError(`Unable to convert from base64. Input either isn't a string or isn't valid base64.`);
-  }
+export function unicodeToBase64(unicodeString: string): string {
+  const globalObject = getGlobalObject();
 
-  const errMsg = `Unable to convert string from base64: ${
-    base64String.length > 256 ? `${base64String.slice(0, 256)}...` : base64String
+  // we cast to a string just in case we're given something else
+  const errMsg = `Unable to convert to base64: ${
+    String(unicodeString).length > 256 ? `${String(unicodeString).slice(0, 256)}...` : String(unicodeString)
   }`;
 
   try {
-    // browsers have atob built in
-    if ('atob' in getGlobalObject()) {
-      // atob takes base64 (written in (a)scii) to (b)inary
-      return binaryToUnicode(atob(base64String));
+    // browsers have btoa built in
+    if ('btoa' in globalObject) {
+      // `btoa` only takes "binary" (a.k.a. 8-bit) data, but JS uses UTF-16 to represent strings. `TextEncoder.encode()`
+      // translates that UTF-16 data to UTF-8 and returns an array of numbers, each representing one byte of data,
+      // interpreted as binary. UTF-8 uses either 1, 2, 3, or 4 bytes to encode each character, so the length of the array
+      // may not match the length of the original string (it will be >=). Then we create a JS (UTF-16) string from those bytes,
+      // ** assuming each byte is its own character **. Since each character in our new string is represented by exactly one
+      // byte, it is now 8-bit data, and can be fed to `btoa` (which of course, as its first act, turns the characters back
+      // into the same bytes before acting on them).
+
+      const utf8Bytes = new TextEncoder().encode(unicodeString);
+      const eightBitDataAsUTF16String = String.fromCharCode(...utf8Bytes);
+      return btoa(eightBitDataAsUTF16String);
     }
 
     // Buffer only exists in node
-    if ('Buffer' in getGlobalObject()) {
-      return Buffer.from(base64String, 'base64').toString('utf16le');
+    if ('Buffer' in globalObject) {
+      // To accomplish the same thing in node that we do in the browser (see notes above) - and, importantly, end up with
+      // the same result - we need to come up with the same numbers as were in the array from `TextEncoder.encode()`. Since
+      // those numbers are bytes representing the string in UTF-8, we ask node to interpret the given string in UTF-8 in
+      // order to fill its buffer. Unlike `btoa`, which requires its input to be encoded as a string (which it then
+      // immediately decodes back to bytes), `Buffer.toString()` can go straight from bytes, so we don't need the
+      // intermediate step of re-encoding the data before base64-ifying it.
+
+      const utf8Bytes = Buffer.from(unicodeString, 'utf-8');
+      // const utf8Bytes = Buffer.from(unicodeString, 'utf16le');
+      return utf8Bytes.toString('base64');
     }
   } catch (err) {
     throw new SentryError(`${errMsg} Got error: ${err}`);
   }
 
+  // we shouldn't ever get here, because one of `btoa` and `Buffer` should exist, but just in case...
   throw new SentryError(errMsg);
 }
 
 /**
- * Convert a Unicode (UTF-16) string to a base64 string.
+ * Convert a base64 string to a Unicode (UTF-16 or UTF-8, depending on context) string.
  *
- * @param unicodeString The string to encode
+ * @param base64String The string to decode.
  * @throws SentryError (because using the logger creates a circular dependency)
- * @returns A base64-encoded version of the string
+ * @returns A Unicode string
  */
-export function unicodeToBase64(unicodeString: string): string {
-  if (typeof unicodeString !== 'string') {
-    throw new SentryError(`Unable to convert to base64. Input isn't a string.`);
-  }
+export function base64ToUnicode(base64String: string): string {
+  const globalObject = getGlobalObject();
 
-  const errMsg = `Unable to convert string to base64: ${
-    unicodeString.length > 256 ? `${unicodeString.slice(0, 256)}...` : unicodeString
+  // we cast to a string just in case we're given something else
+  const errMsg = `Unable to convert from base64: ${
+    String(base64String).length > 256 ? `${String(base64String).slice(0, 256)}...` : String(base64String)
   }`;
 
+  // Here we want to reverse the process we did in `unicodeToBase64` (see more extensive notes there). The first step is
+  // to decode from base64 into bytes representing text in UTF-8. We then need to get those bytes back into a
+  // human-readable format.
+
   try {
-    // browsers have btoa built in
-    if ('btoa' in getGlobalObject()) {
-      // btoa takes (b)inary to base64 (written in (a)scii)
-      return btoa(unicodeToBinary(unicodeString));
+    // browsers have atob built in
+    if ('atob' in globalObject) {
+      // `atob` always returns a UTF-16 string, so in order to convert that to UTF-8 bytes, we first have to get the
+      // UTF-16 bytes, then convert them to UTF-8 bytes, which we then can turn back into text. Fortunately, for all
+      // ASCII chars, the UTF-16 code is really just the UTF-8 code padded with leading zeros, so the values don't
+      // change when casting from one to the other.
+      const utf16Chars = [...atob(base64String)];
+      const utf16CharCodes = utf16Chars.map(char => char.charCodeAt(0));
+      const utf8Bytes = Uint8Array.from(utf16CharCodes);
+      return new TextDecoder().decode(utf8Bytes);
     }
 
     // Buffer only exists in node
-    if ('Buffer' in getGlobalObject()) {
-      return Buffer.from(unicodeString, 'utf16le').toString('base64');
+    if ('Buffer' in globalObject) {
+      // node can go straight from base64 to bytes, without creating the intermediate string as happens above. Interpret
+      // those bytes as UTF-8 character codes, and you have your string back.
+      const utf8Bytes = Buffer.from(base64String, 'base64');
+      return utf8Bytes.toString('utf-8');
     }
   } catch (err) {
     throw new SentryError(`${errMsg} Got error: ${err}`);
   }
 
+  // we shouldn't ever get here, because one of `atob` and `Buffer` should exist, but just in case...
   throw new SentryError(errMsg);
 }
diff --git a/packages/utils/test/string.test.ts b/packages/utils/test/string.test.ts
@@ -1,4 +1,4 @@
-import { isMatchingPattern, truncate } from '../src/string';
+import { BASE64_REGEX, base64ToUnicode, isMatchingPattern, truncate, unicodeToBase64 } from '../src/string';
 
 describe('truncate()', () => {
   test('it works as expected', () => {
@@ -39,3 +39,57 @@ describe('isMatchingPattern()', () => {
     expect(isMatchingPattern([], 'foo')).toEqual(false);
   });
 });
+
+describe('base64ToUnicode/unicodeToBase64', () => {
+  const unicodeString = 'Dogs are great!';
+  const base64String = 'RG9ncyBhcmUgZ3JlYXQh';
+
+  test('converts to valid base64', () => {
+    expect(BASE64_REGEX.test(unicodeToBase64(unicodeString))).toBe(true);
+  });
+
+  test('works as expected', () => {
+    expect(unicodeToBase64(unicodeString)).toEqual(base64String);
+    expect(base64ToUnicode(base64String)).toEqual(unicodeString);
+  });
+
+  test('conversion functions are inverses', () => {
+    expect(base64ToUnicode(unicodeToBase64(unicodeString))).toEqual(unicodeString);
+    expect(unicodeToBase64(base64ToUnicode(base64String))).toEqual(base64String);
+  });
+
+  test('can handle and preserve multi-byte characters in original string', () => {
+    ['🐶', 'כלבים נהדרים!', 'Of margir hundar! Ég geri ráð fyrir að ég þurfi stærra rúm.'].forEach(orig => {
+      expect(() => {
+        unicodeToBase64(orig);
+      }).not.toThrowError();
+      expect(base64ToUnicode(unicodeToBase64(orig))).toEqual(orig);
+    });
+  });
+
+  test('throws an error when given invalid input', () => {
+    expect(() => {
+      unicodeToBase64(null as any);
+    }).toThrowError('Unable to convert to base64');
+    expect(() => {
+      unicodeToBase64(undefined as any);
+    }).toThrowError('Unable to convert to base64');
+    expect(() => {
+      unicodeToBase64({} as any);
+    }).toThrowError('Unable to convert to base64');
+
+    expect(() => {
+      base64ToUnicode(null as any);
+    }).toThrowError('Unable to convert from base64');
+    expect(() => {
+      base64ToUnicode(undefined as any);
+    }).toThrowError('Unable to convert from base64');
+    expect(() => {
+      base64ToUnicode({} as any);
+    }).toThrowError('Unable to convert from base64');
+
+    // Note that by design, in node base64 encoding and decoding will accept any string, whether or not it's valid
+    // base64, by ignoring all invalid characters, including whitespace. Therefore, no wacky strings have been included
+    // here because they don't actually error.
+  });
+});