@@ -108,109 +108,101 @@ export function isMatchingPattern(value: string, pattern: RegExp | string): bool
   return false;
 }

-// TODO: Base64 crossed with different character encodings turns out to be a ridiculous can of worms. Base64 expects
-// 8-bit data. JS only uses UTF-16. We need a way to be sure that every SDK is speaking the same language and can decode
-// values base64-encoded by other SDKs. The current proposal is to use UTF-8 as the common standard, and then
-// base64-encode that (meaning in JS we need to get there first). Doing it that way makes a whole lot of sense but is a
-// work in progress which isn't yet actually working. Leaving the current solution for now and will come back to it.
-
-/**
- * Convert a Unicode string to a string in which each 16-bit unit occupies only one byte, which makes it safe to use as
- * input to `btoa`.
- *
- * Copied from https://developer.mozilla.org/en-US/docs/Web/API/WindowOrWorkerGlobalScope/btoa#Unicode_strings.
- *
- * @param unicodeString The string to convert
- * @returns A btoa-compatible encoding of the string
- */
-function unicodeToBinary(unicodeString: string): string {
-  const codeUnits = new Uint16Array(unicodeString.length);
-  for (let i = 0; i < codeUnits.length; i++) {
-    codeUnits[i] = unicodeString.charCodeAt(i);
-  }
-  return String.fromCharCode(...new Uint8Array(codeUnits.buffer));
-}
-
 /**
- * Convert a binary string (such as one would get from `atob`) into a Unicode string.
+ * Convert a Unicode (UTF-16 or UTF-8, depending on context) string to a base64 string.
  *
- * Copied from https://developer.mozilla.org/en-US/docs/Web/API/WindowOrWorkerGlobalScope/btoa#Unicode_strings.
- *
- * @param binaryString The string to convert
- * @returns A btoa-compatible encoding of the string
- */
-function binaryToUnicode(binaryString: string): string {
-  const bytes = new Uint8Array(binaryString.length);
-  for (let i = 0; i < bytes.length; i++) {
-    bytes[i] = binaryString.charCodeAt(i);
-  }
-  return String.fromCharCode(...new Uint16Array(bytes.buffer));
-}
-
-/**
- * Convert a base64 string to a Unicode (UTF-16) string.
- *
- * @param base64String The string to decode.
+ * @param unicodeString The string to base64-encode
  * @throws SentryError (because using the logger creates a circular dependency)
- * @returns A Unicode string
+ * @returns A base64-encoded version of the string
  */
-export function base64ToUnicode(base64String: string): string {
-  if (typeof base64String !== 'string' || !BASE64_REGEX.test(base64String)) {
-    throw new SentryError(`Unable to convert from base64. Input either isn't a string or isn't valid base64.`);
-  }
+export function unicodeToBase64(unicodeString: string): string {
+  const globalObject = getGlobalObject();

-  const errMsg = `Unable to convert string from base64: ${
-    base64String.length > 256 ? `${base64String.slice(0, 256)}...` : base64String
+  // we cast to a string just in case we're given something else
+  const errMsg = `Unable to convert to base64: ${
+    String(unicodeString).length > 256 ? `${String(unicodeString).slice(0, 256)}...` : String(unicodeString)
   }`;

   try {
-    // browsers have atob built in
-    if ('atob' in getGlobalObject()) {
-      // atob takes base64 (written in (a)scii) to (b)inary
-      return binaryToUnicode(atob(base64String));
+    // browsers have btoa built in
+    if ('btoa' in globalObject) {
+      // `btoa` only takes "binary" (a.k.a. 8-bit) data, but JS uses UTF-16 to represent strings. `TextEncoder.encode()`
+      // translates that UTF-16 data to UTF-8 and returns an array of numbers, each representing one byte of data,
+      // interpreted as binary. UTF-8 uses either 1, 2, 3, or 4 bytes to encode each character, so the length of the array
+      // may not match the length of the original string (it will be >=). Then we create a JS (UTF-16) string from those bytes,
+      // ** assuming each byte is its own character **. Since each character in our new string is represented by exactly one
+      // byte, it is now 8-bit data, and can be fed to `btoa` (which of course, as its first act, turns the characters back
+      // into the same bytes before acting on them).
+
+      const utf8Bytes = new TextEncoder().encode(unicodeString);
+      const eightBitDataAsUTF16String = String.fromCharCode(...utf8Bytes);
+      return btoa(eightBitDataAsUTF16String);
     }

     // Buffer only exists in node
-    if ('Buffer' in getGlobalObject()) {
-      return Buffer.from(base64String, 'base64').toString('utf16le');
+    if ('Buffer' in globalObject) {
+      // To accomplish the same thing in node that we do in the browser (see notes above) - and, importantly, end up with
+      // the same result - we need to come up with the same numbers as were in the array from `TextEncoder.encode()`. Since
+      // those numbers are bytes representing the string in UTF-8, we ask node to interpret the given string in UTF-8 in
+      // order to fill its buffer. Unlike `btoa`, which requires its input to be encoded as a string (which it then
+      // immediately decodes back to bytes), `Buffer.toString()` can go straight from bytes, so we don't need the
+      // intermediate step of re-encoding the data before base64-ifying it.
+
+      const utf8Bytes = Buffer.from(unicodeString, 'utf-8');
+      // const utf8Bytes = Buffer.from(unicodeString, 'utf16le');
+      return utf8Bytes.toString('base64');
     }
   } catch (err) {
     throw new SentryError(`${errMsg} Got error: ${err}`);
   }

+  // we shouldn't ever get here, because one of `btoa` and `Buffer` should exist, but just in case...
   throw new SentryError(errMsg);
 }

 /**
- * Convert a Unicode (UTF-16) string to a base64 string.
+ * Convert a base64 string to a Unicode (UTF-16 or UTF-8, depending on context) string.
  *
- * @param unicodeString The string to encode
+ * @param base64String The string to decode.
  * @throws SentryError (because using the logger creates a circular dependency)
- * @returns A base64-encoded version of the string
+ * @returns A Unicode string
  */
-export function unicodeToBase64(unicodeString: string): string {
-  if (typeof unicodeString !== 'string') {
-    throw new SentryError(`Unable to convert to base64. Input isn't a string.`);
-  }
+export function base64ToUnicode(base64String: string): string {
+  const globalObject = getGlobalObject();

-  const errMsg = `Unable to convert string to base64: ${
-    unicodeString.length > 256 ? `${unicodeString.slice(0, 256)}...` : unicodeString
+  // we cast to a string just in case we're given something else
+  const errMsg = `Unable to convert from base64: ${
+    String(base64String).length > 256 ? `${String(base64String).slice(0, 256)}...` : String(base64String)
   }`;

+  // Here we want to reverse the process we did in `unicodeToBase64` (see more extensive notes there). The first step is
+  // to decode from base64 into bytes representing text in UTF-8. We then need to get those bytes back into a
+  // human-readable format.
+
   try {
-    // browsers have btoa built in
-    if ('btoa' in getGlobalObject()) {
-      // btoa takes (b)inary to base64 (written in (a)scii)
-      return btoa(unicodeToBinary(unicodeString));
+    // browsers have atob built in
+    if ('atob' in globalObject) {
+      // `atob` always returns a UTF-16 string, so in order to convert that to UTF-8 bytes, we first have to get the
+      // UTF-16 bytes, then convert them to UTF-8 bytes, which we then can turn back into text. Fortunately, for all
+      // ASCII chars, the UTF-16 code is really just the UTF-8 code padded with leading zeros, so the values don't
+      // change when casting from one to the other.
+      const utf16Chars = [...atob(base64String)];
+      const utf16CharCodes = utf16Chars.map(char => char.charCodeAt(0));
+      const utf8Bytes = Uint8Array.from(utf16CharCodes);
+      return new TextDecoder().decode(utf8Bytes);
     }

     // Buffer only exists in node
-    if ('Buffer' in getGlobalObject()) {
-      return Buffer.from(unicodeString, 'utf16le').toString('base64');
+    if ('Buffer' in globalObject) {
+      // node can go straight from base64 to bytes, without creating the intermediate string as happens above. Interpret
+      // those bytes as UTF-8 character codes, and you have your string back.
+      const utf8Bytes = Buffer.from(base64String, 'base64');
+      return utf8Bytes.toString('utf-8');
     }
   } catch (err) {
     throw new SentryError(`${errMsg} Got error: ${err}`);
   }

+  // we shouldn't ever get here, because one of `atob` and `Buffer` should exist, but just in case...
   throw new SentryError(errMsg);
 }
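
For reference, here is a minimal standalone sketch of the round trip the new comments describe, using only the browser path (TextEncoder/btoa and atob/TextDecoder). The `encode`/`decode` names and the dog-emoji example are illustrative, not part of the SDK:

function encode(unicodeString: string): string {
  // UTF-16 string -> UTF-8 bytes -> "one character per byte" string -> base64
  const utf8Bytes = new TextEncoder().encode(unicodeString);
  return btoa(String.fromCharCode(...utf8Bytes));
}

function decode(base64String: string): string {
  // base64 -> "one character per byte" string -> UTF-8 bytes -> UTF-16 string
  const utf8Bytes = Uint8Array.from([...atob(base64String)].map(char => char.charCodeAt(0)));
  return new TextDecoder().decode(utf8Bytes);
}

// '🐶' is the four UTF-8 bytes [0xF0, 0x9F, 0x90, 0xB6], which base64-encode to '8J+Qtg=='
console.log(encode('🐶')); // '8J+Qtg=='
console.log(decode('8J+Qtg==')); // '🐶'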