Skip to content

Commit 39c0274

Browse files
committed
Fix the uniqueItems implementation to accommodate non-primitive values
JavaScript Sets do not work in the same fashion as HashSets do in Java, in that object equivalence is determined by reference, not by value. In order to work around this, this commit adds a duplicate finder that works on the same principals as a value-based hashtable: it creates a hash of the object's value to bucket objects and then uses deep equality to determine if a hash collision is actually a duplicate value.
1 parent 2f013f6 commit 39c0274

File tree

5 files changed

+226
-10
lines changed

5 files changed

+226
-10
lines changed

smithy-typescript-ssdk-libs/server-common/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"postbuild": "rimraf dist/types/ts3.4 && downlevel-dts dist/types dist/types/ts3.4",
1212
"test": "jest",
1313
"clean": "rimraf dist",
14+
"lint": "yarn run eslint -c ../.eslintrc.js **/src/**/*.ts",
1415
"format": "prettier --config ../prettier.config.js --write **/*.{ts,js,md,json}"
1516
},
1617
"repository": {
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License").
5+
* You may not use this file except in compliance with the License.
6+
* A copy of the License is located at
7+
*
8+
* http://aws.amazon.com/apache2.0
9+
*
10+
* or in the "license" file accompanying this file. This file is distributed
11+
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12+
* express or implied. See the License for the specific language governing
13+
* permissions and limitations under the License.
14+
*/
15+
16+
import * as util from "util";
17+
18+
import { findDuplicates, Input } from "./unique";
19+
20+
describe("findDuplicates", () => {
21+
describe("finds duplicates in", () => {
22+
it("strings", () => {
23+
expect(findDuplicates(["a", "b", "c", "a", "b", "a"])).toEqual(["a", "b"]);
24+
expect(findDuplicates(["x", "y", "z", "a", "b", "c", "a", "b"])).toEqual(["a", "b"]);
25+
});
26+
it("numbers", () => {
27+
expect(findDuplicates([1, 2, 3, 4, 1, 2])).toEqual([1, 2]);
28+
});
29+
it("booleans", () => {
30+
expect(findDuplicates([true, false, true])).toEqual([true]);
31+
});
32+
it("arrays", () => {
33+
expect(
34+
findDuplicates([
35+
[5, 6],
36+
[2, 3],
37+
[1, 2],
38+
[3, 4],
39+
[1, 2],
40+
])
41+
).toEqual([[1, 2]]);
42+
});
43+
it("objects", () => {
44+
expect(findDuplicates([{ a: "b" }, { b: [1, 2, 3] }, { a: "b" }, { a: "b" }])).toEqual([{ a: "b" }]);
45+
expect(findDuplicates([{ a: "b" }, { b: 1, c: 2 }, { c: 2, b: 1 }])).toEqual([{ b: 1, c: 2 }]);
46+
});
47+
it("deeply nested objects", () => {
48+
expect(
49+
findDuplicates([
50+
{ a: { b: { c: [1, { d: 2 }, [3]] } } },
51+
2,
52+
[3, 4],
53+
{ b: "c" },
54+
{ a: { b: { c: [1, { d: 2 }, [3]] } } },
55+
])
56+
).toEqual([{ a: { b: { c: [1, { d: 2 }, [3]] } } }]);
57+
});
58+
});
59+
describe("correctly does not find duplicates in", () => {
60+
it("strings", () => {
61+
expect(findDuplicates(["a", "b", "c"])).toEqual([]);
62+
});
63+
it("numbers", () => {
64+
expect(findDuplicates([1, 2, 3, 4])).toEqual([]);
65+
expect(findDuplicates([1, 2, "1", "2"])).toEqual([]);
66+
});
67+
it("booleans", () => {
68+
expect(findDuplicates([true, false])).toEqual([]);
69+
expect(findDuplicates([true, false, "true", "false"])).toEqual([]);
70+
});
71+
it("arrays", () => {
72+
expect(
73+
findDuplicates([
74+
[1, 2],
75+
[2, 3],
76+
[3, 4],
77+
])
78+
).toEqual([]);
79+
expect(
80+
findDuplicates([
81+
[1, 2],
82+
["1", "2"],
83+
])
84+
).toEqual([]);
85+
});
86+
it("objects", () => {
87+
expect(findDuplicates([{ a: "b" }, { b: [1, 2, 3] }])).toEqual([]);
88+
expect(findDuplicates([{ a: 1 }, { a: "1" }])).toEqual([]);
89+
});
90+
});
91+
92+
// This is relatively slow and may be flaky if the input size is tuned to let it run reasonably fast
93+
it.skip("is faster than the naive implementation", () => {
94+
const input: Input[] = [true, false, 1, 2, 3, 4, 5, 6];
95+
for (let i = 0; i < 10_000; i++) {
96+
input.push({ a: [1, 2, 3, i], b: { nested: [true] } });
97+
}
98+
99+
const uniqueStart = Date.now();
100+
expect(findDuplicates(input)).toEqual([]);
101+
const uniqueEnd = Date.now();
102+
const uniqueDuration = (uniqueEnd - uniqueStart) / 1000;
103+
104+
console.log(`findDuplicates finished in ${uniqueDuration} seconds`);
105+
106+
const naiveStart = Date.now();
107+
expect(naivefindDuplicates(input)).toEqual([]);
108+
const naiveEnd = Date.now();
109+
const naiveDuration = (naiveEnd - naiveStart) / 1000;
110+
111+
console.log(`naivefindDuplicates finished in ${naiveDuration} seconds`);
112+
113+
expect(naiveDuration).toBeGreaterThan(uniqueDuration);
114+
});
115+
116+
// This isn't even correct! It should be even slower, it just returns the first duplicate!
117+
function naivefindDuplicates(input: Array<Input>): Input | undefined {
118+
for (let i = 0; i < input.length - 1; i++) {
119+
for (let j = i + 1; j < input.length; j++) {
120+
if (util.isDeepStrictEqual(input[i], input[j])) {
121+
return [input[i]];
122+
}
123+
}
124+
}
125+
return undefined;
126+
}
127+
});
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License").
5+
* You may not use this file except in compliance with the License.
6+
* A copy of the License is located at
7+
*
8+
* http://aws.amazon.com/apache2.0
9+
*
10+
* or in the "license" file accompanying this file. This file is distributed
11+
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
12+
* express or implied. See the License for the specific language governing
13+
* permissions and limitations under the License.
14+
*/
15+
16+
import { createHash } from "crypto";
17+
import * as util from "util";
18+
19+
/**
20+
* A shortcut for JSON-like objects.
21+
*/
22+
export type Input = { [key: string]: Input } | Array<Input> | string | number | boolean | null;
23+
24+
/**
25+
* Returns an array of duplicated values in the input. Unlike equals, duplicate
26+
* is determined by semantic equivalence (two objects representing the same
27+
* data, not just two objects at the same location in memory).
28+
*
29+
* @param input an array of JSON-like objects.
30+
*/
31+
export const findDuplicates = (input: Array<Input>): Array<Input> => {
32+
const potentialCollisions: { [hash: string]: { value: Input; alreadyFound: boolean }[] } = {};
33+
const collisions: Array<Input> = [];
34+
35+
for (const value of input) {
36+
const valueHash = hash(value);
37+
if (!potentialCollisions.hasOwnProperty(valueHash)) {
38+
potentialCollisions[valueHash] = [{ value: value, alreadyFound: false }];
39+
} else {
40+
let duplicateFound = false;
41+
for (const potentialCollision of potentialCollisions[valueHash]) {
42+
if (util.isDeepStrictEqual(value, potentialCollision.value)) {
43+
duplicateFound = true;
44+
if (!potentialCollision.alreadyFound) {
45+
collisions.push(value);
46+
}
47+
potentialCollision.alreadyFound = true;
48+
}
49+
}
50+
if (!duplicateFound) {
51+
potentialCollisions[valueHash].push({ value: value, alreadyFound: false });
52+
}
53+
}
54+
}
55+
return collisions;
56+
};
57+
58+
const hash = (input: Input): string => {
59+
return createHash("sha256").update(canonicalize(input)).digest("base64");
60+
};
61+
62+
/**
63+
* Since node's hash functions operate on strings or buffers, we need a canonical format for
64+
* our objects in order to hash them correctly. This function turns them into string representations
65+
* where the types are encoded in order to avoid ambiguity, for instance, between the string "1" and
66+
* the number 1. This method sorts object keys lexicographically in order to maintain consistency.
67+
*
68+
* @param input a JSON-like object
69+
* @return a canonical string representation
70+
*/
71+
const canonicalize = (input: Input): string => {
72+
if (input === null) {
73+
return "null()";
74+
}
75+
if (typeof input === "string" || typeof input === "number" || typeof input === "boolean") {
76+
return `${typeof input}(${input.toString()})`;
77+
}
78+
if (Array.isArray(input)) {
79+
return "array(" + input.map((i) => canonicalize(i)).join(",") + ")";
80+
}
81+
82+
const contents: Array<string> = [];
83+
for (const key of Object.keys(input).slice().sort()) {
84+
contents.push("key(" + key + ")->value(" + canonicalize(input[key]) + ")");
85+
}
86+
return "map(" + contents.join(",") + ")";
87+
};

smithy-typescript-ssdk-libs/server-common/src/validation/validators.spec.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,4 +227,12 @@ describe("uniqueItems", () => {
227227
path: "aField",
228228
});
229229
});
230+
describe("supports documents", () => {
231+
expect(validator.validate([{ a: 1 }, { b: 2 }], "aField")).toBeNull();
232+
expect(validator.validate([{ a: 1 }, { b: 2 }, { a: 1 }], "aField")).toEqual({
233+
constraintType: "uniqueItems",
234+
failureValue: [{ a: 1 }],
235+
path: "aField",
236+
});
237+
});
230238
});

smithy-typescript-ssdk-libs/server-common/src/validation/validators.ts

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import { RE2 } from "re2-wasm";
1717

18+
import { findDuplicates } from "../unique";
1819
import {
1920
EnumValidationFailure,
2021
LengthValidationFailure,
@@ -298,17 +299,9 @@ export class UniqueItemsValidator implements SingleConstraintValidator<Array<any
298299
return null;
299300
}
300301

301-
const repeats = new Set<any>();
302-
const uniqueValues = new Set<any>();
303-
for (const i of input) {
304-
if (uniqueValues.has(i)) {
305-
repeats.add(i);
306-
} else {
307-
uniqueValues.add(i);
308-
}
309-
}
302+
const repeats = findDuplicates(input);
310303

311-
if (repeats.size > 0) {
304+
if (repeats.length > 0) {
312305
return {
313306
constraintType: "uniqueItems",
314307
path: path,

0 commit comments

Comments
 (0)