Skip to content

Commit 1d07106

Browse files
mwdaubdzlier-gcp
authored and committed
Add DLP code samples for custom info types (#1121)
* Add custom info types to code samples Include samples of using custom dictionaries and custom regexes. * Add missing imports * Fix build errors * Add tests for custom info types * Fix bad regexes. * Fix bad regexes, part 2. * Update README.md Add custom info type flags and fix existing examples so they work with the V2 API. * Fix import order * Fix line length violations * Fix line length formatting violations * Fix broken character mask test DLP now requires the client to specify info types to search for when using DeID with wildcard info types. * Add SSN info type to tests * Add info types to DeID with FPE test
1 parent 3213bdb commit 1d07106

File tree

5 files changed

+169
-21
lines changed

5 files changed

+169
-21
lines changed

dlp/README.md

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,49 +66,50 @@ Options:
6666
-f, --maxFindings [number] [default: 0]
6767
maximum number of results to retrieve
6868
-q, --includeQuote [boolean] [default: true] include matching string in results
69-
-t, --infoTypes restrict to limited set of infoTypes [ default: []]
70-
[ eg. PHONE_NUMBER US_PASSPORT]
69+
-t, --infoTypes set of infoTypes to search for [eg. PHONE_NUMBER US_PASSPORT]
70+
-customDictionaries set of comma-separated dictionary words to search for as customInfoTypes
71+
-customRegexes set of regex patterns to search for as customInfoTypes
7172
```
7273
### Examples
7374
- Inspect a string:
7475
```
75-
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -s "My phone number is (123) 456-7890 and my email address is user@example.com"
76+
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -s "My phone number is (123) 456-7890 and my email address is user@example.com" --infoTypes PHONE_NUMBER EMAIL_ADDRESS
77+
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -s "My phone number is (123) 456-7890 and my email address is user@example.com" -customDictionaries user@example.com -customRegexes "\(\d{3}\) \d{3}-\d{4}"
7678
```
7779
- Inspect a local file (text / image):
7880
```
79-
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f resources/test.txt
80-
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f resources/test.png
81+
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f src/test/resources/test.txt --infoTypes PHONE_NUMBER EMAIL_ADDRESS
82+
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f src/test/resources/test.png --infoTypes PHONE_NUMBER EMAIL_ADDRESS
8183
```
8284
- Inspect a file on Google Cloud Storage:
8385
```
84-
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -gcs -bucketName my-bucket -fileName my-file.txt
86+
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -gcs -bucketName my-bucket -fileName my-file.txt --infoTypes PHONE_NUMBER EMAIL_ADDRESS
8587
```
8688
- Inspect a Google Cloud Datastore kind:
8789
```
88-
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -ds -kind my-kind
90+
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -ds -kind my-kind --infoTypes PHONE_NUMBER EMAIL_ADDRESS
8991
```
9092

91-
## Automatic redaction of sensitive data
92-
[Automatic redaction](https://cloud.google.com/dlp/docs/classification-redaction) produces an output with sensitive data matches removed.
93+
## Automatic redaction of sensitive data from images
94+
[Automatic redaction](https://cloud.google.com/dlp/docs/redacting-sensitive-data-images) produces an output image with sensitive data matches removed.
9395

9496
```
9597
Commands:
96-
-s <string> Source input string
97-
-r <replacement string> String to replace detected info types
98+
-f <string> Source image file
99+
-o <string> Destination image file
98100
Options:
99101
--help Show help
100102
-minLikelihood [choices: "LIKELIHOOD_UNSPECIFIED", "VERY_UNLIKELY", "UNLIKELY", "POSSIBLE", "LIKELY", "VERY_LIKELY"]
101103
[default: "LIKELIHOOD_UNSPECIFIED"]
102104
specifies the minimum reporting likelihood threshold.
103105
104-
-infoTypes restrict operation to limited set of info types [ default: []]
105-
[ eg. PHONE_NUMBER US_PASSPORT]
106+
-infoTypes set of infoTypes to search for [eg. PHONE_NUMBER US_PASSPORT]
106107
```
107108

108109
### Example
109-
- Replace sensitive data in text with `_REDACTED_`:
110+
- Redact phone numbers and email addresses from `test.png`:
110111
```
111-
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Redact -s "My phone number is (123) 456-7890 and my email address is user@example.com" -r "_REDACTED_"
112+
java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Redact -f src/test/resources/test.png -o test-redacted.png -infoTypes PHONE_NUMBER EMAIL_ADDRESS
112113
```
113114

114115
## Integration tests

dlp/src/main/java/com/example/dlp/DeIdentification.java

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
import java.time.format.DateTimeParseException;
5858
import java.util.ArrayList;
5959
import java.util.Arrays;
60+
import java.util.Collections;
6061
import java.util.List;
6162
import java.util.stream.Collectors;
6263
import org.apache.commons.cli.CommandLine;
@@ -81,7 +82,11 @@ public class DeIdentification {
8182
* @param projectId ID of Google Cloud project to run the API under.
8283
*/
8384
private static void deIdentifyWithMask(
84-
String string, Character maskingCharacter, int numberToMask, String projectId) {
85+
String string,
86+
List<InfoType> infoTypes,
87+
Character maskingCharacter,
88+
int numberToMask,
89+
String projectId) {
8590

8691
// instantiate a client
8792
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
@@ -108,6 +113,11 @@ private static void deIdentifyWithMask(
108113
.addTransformations(infoTypeTransformationObject)
109114
.build();
110115

116+
InspectConfig inspectConfig =
117+
InspectConfig.newBuilder()
118+
.addAllInfoTypes(infoTypes)
119+
.build();
120+
111121
DeidentifyConfig deidentifyConfig =
112122
DeidentifyConfig.newBuilder()
113123
.setInfoTypeTransformations(infoTypeTransformationArray)
@@ -117,6 +127,7 @@ private static void deIdentifyWithMask(
117127
DeidentifyContentRequest request =
118128
DeidentifyContentRequest.newBuilder()
119129
.setParent(ProjectName.of(projectId).toString())
130+
.setInspectConfig(inspectConfig)
120131
.setDeidentifyConfig(deidentifyConfig)
121132
.setItem(contentItem)
122133
.build();
@@ -147,6 +158,7 @@ private static void deIdentifyWithMask(
147158
*/
148159
private static void deIdentifyWithFpe(
149160
String string,
161+
List<InfoType> infoTypes,
150162
FfxCommonNativeAlphabet alphabet,
151163
String keyName,
152164
String wrappedKey,
@@ -188,6 +200,11 @@ private static void deIdentifyWithFpe(
188200
.addTransformations(infoTypeTransformationObject)
189201
.build();
190202

203+
InspectConfig inspectConfig =
204+
InspectConfig.newBuilder()
205+
.addAllInfoTypes(infoTypes)
206+
.build();
207+
191208
// Create the deidentification request object
192209
DeidentifyConfig deidentifyConfig =
193210
DeidentifyConfig.newBuilder()
@@ -197,6 +214,7 @@ private static void deIdentifyWithFpe(
197214
DeidentifyContentRequest request =
198215
DeidentifyContentRequest.newBuilder()
199216
.setParent(ProjectName.of(projectId).toString())
217+
.setInspectConfig(inspectConfig)
200218
.setDeidentifyConfig(deidentifyConfig)
201219
.setItem(contentItem)
202220
.build();
@@ -513,6 +531,10 @@ public static void main(String[] args) throws Exception {
513531
Options commandLineOptions = new Options();
514532
commandLineOptions.addOptionGroup(optionsGroup);
515533

534+
Option infoTypesOption = Option.builder("infoTypes").hasArg(true).required(false).build();
535+
infoTypesOption.setArgs(Option.UNLIMITED_VALUES);
536+
commandLineOptions.addOption(infoTypesOption);
537+
516538
Option maskingCharacterOption =
517539
Option.builder("maskingCharacter").hasArg(true).required(false).build();
518540
commandLineOptions.addOption(maskingCharacterOption);
@@ -575,12 +597,21 @@ public static void main(String[] args) throws Exception {
575597
String projectId =
576598
cmd.getOptionValue(projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
577599

600+
List<InfoType> infoTypesList = Collections.emptyList();
601+
if (cmd.hasOption(infoTypesOption.getOpt())) {
602+
infoTypesList = new ArrayList<>();
603+
String[] infoTypes = cmd.getOptionValues(infoTypesOption.getOpt());
604+
for (String infoType : infoTypes) {
605+
infoTypesList.add(InfoType.newBuilder().setName(infoType).build());
606+
}
607+
}
608+
578609
if (cmd.hasOption("m")) {
579610
// deidentification with character masking
580611
int numberToMask = Integer.parseInt(cmd.getOptionValue(numberToMaskOption.getOpt(), "0"));
581612
char maskingCharacter = cmd.getOptionValue(maskingCharacterOption.getOpt(), "*").charAt(0);
582613
String val = cmd.getOptionValue(deidentifyMaskingOption.getOpt());
583-
deIdentifyWithMask(val, maskingCharacter, numberToMask, projectId);
614+
deIdentifyWithMask(val, infoTypesList, maskingCharacter, numberToMask, projectId);
584615
} else if (cmd.hasOption("f")) {
585616
// deidentification with FPE
586617
String wrappedKey = cmd.getOptionValue(wrappedKeyOption.getOpt());
@@ -591,7 +622,8 @@ public static void main(String[] args) throws Exception {
591622
FfxCommonNativeAlphabet.valueOf(
592623
cmd.getOptionValue(
593624
alphabetOption.getOpt(), FfxCommonNativeAlphabet.ALPHA_NUMERIC.name()));
594-
deIdentifyWithFpe(val, alphabet, keyName, wrappedKey, projectId, surrogateType);
625+
deIdentifyWithFpe(
626+
val, infoTypesList, alphabet, keyName, wrappedKey, projectId, surrogateType);
595627
} else if (cmd.hasOption("d")) {
596628
//deidentify with date shift
597629
String inputCsv = cmd.getOptionValue(inputCsvPathOption.getOpt());

dlp/src/main/java/com/example/dlp/Inspect.java

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
import com.google.privacy.dlp.v2.CloudStorageOptions;
2828
import com.google.privacy.dlp.v2.ContentItem;
2929
import com.google.privacy.dlp.v2.CreateDlpJobRequest;
30+
import com.google.privacy.dlp.v2.CustomInfoType;
31+
import com.google.privacy.dlp.v2.CustomInfoType.Dictionary;
32+
import com.google.privacy.dlp.v2.CustomInfoType.Dictionary.WordList;
33+
import com.google.privacy.dlp.v2.CustomInfoType.Regex;
3034
import com.google.privacy.dlp.v2.DatastoreOptions;
3135
import com.google.privacy.dlp.v2.DlpJob;
3236
import com.google.privacy.dlp.v2.Finding;
@@ -52,6 +56,7 @@
5256
import java.nio.file.Files;
5357
import java.nio.file.Paths;
5458
import java.util.ArrayList;
59+
import java.util.Arrays;
5560
import java.util.Collections;
5661
import java.util.List;
5762
import java.util.concurrent.TimeUnit;
@@ -82,6 +87,7 @@ private static void inspectString(
8287
Likelihood minLikelihood,
8388
int maxFindings,
8489
List<InfoType> infoTypes,
90+
List<CustomInfoType> customInfoTypes,
8591
boolean includeQuote,
8692
String projectId) {
8793
// instantiate a client
@@ -91,6 +97,7 @@ private static void inspectString(
9197
InspectConfig inspectConfig =
9298
InspectConfig.newBuilder()
9399
.addAllInfoTypes(infoTypes)
100+
.addAllCustomInfoTypes(customInfoTypes)
94101
.setMinLikelihood(minLikelihood)
95102
.setLimits(findingLimits)
96103
.setIncludeQuote(includeQuote)
@@ -146,6 +153,7 @@ private static void inspectFile(
146153
Likelihood minLikelihood,
147154
int maxFindings,
148155
List<InfoType> infoTypes,
156+
List<CustomInfoType> customInfoTypes,
149157
boolean includeQuote,
150158
String projectId) {
151159
// Instantiates a client
@@ -189,6 +197,7 @@ private static void inspectFile(
189197
InspectConfig inspectConfig =
190198
InspectConfig.newBuilder()
191199
.addAllInfoTypes(infoTypes)
200+
.addAllCustomInfoTypes(customInfoTypes)
192201
.setMinLikelihood(minLikelihood)
193202
.setLimits(findingLimits)
194203
.setIncludeQuote(includeQuote)
@@ -242,6 +251,7 @@ private static void inspectGcsFile(
242251
String fileName,
243252
Likelihood minLikelihood,
244253
List<InfoType> infoTypes,
254+
List<CustomInfoType> customInfoTypes,
245255
int maxFindings,
246256
String topicId,
247257
String subscriptionId,
@@ -266,6 +276,7 @@ private static void inspectGcsFile(
266276
InspectConfig inspectConfig =
267277
InspectConfig.newBuilder()
268278
.addAllInfoTypes(infoTypes)
279+
.addAllCustomInfoTypes(customInfoTypes)
269280
.setMinLikelihood(minLikelihood)
270281
.setLimits(findingLimits)
271282
.build();
@@ -363,6 +374,7 @@ private static void inspectDatastore(
363374
String kind,
364375
Likelihood minLikelihood,
365376
List<InfoType> infoTypes,
377+
List<CustomInfoType> customInfoTypes,
366378
int maxFindings,
367379
String topicId,
368380
String subscriptionId) {
@@ -388,6 +400,7 @@ private static void inspectDatastore(
388400
InspectConfig inspectConfig =
389401
InspectConfig.newBuilder()
390402
.addAllInfoTypes(infoTypes)
403+
.addAllCustomInfoTypes(customInfoTypes)
391404
.setMinLikelihood(minLikelihood)
392405
.setLimits(findingLimits)
393406
.build();
@@ -486,6 +499,7 @@ private static void inspectBigquery(
486499
String tableId,
487500
Likelihood minLikelihood,
488501
List<InfoType> infoTypes,
502+
List<CustomInfoType> customInfoTypes,
489503
int maxFindings,
490504
String topicId,
491505
String subscriptionId) {
@@ -511,6 +525,7 @@ private static void inspectBigquery(
511525
InspectConfig inspectConfig =
512526
InspectConfig.newBuilder()
513527
.addAllInfoTypes(infoTypes)
528+
.addAllCustomInfoTypes(customInfoTypes)
514529
.setMinLikelihood(minLikelihood)
515530
.setLimits(findingLimits)
516531
.build();
@@ -629,6 +644,16 @@ public static void main(String[] args) throws Exception {
629644
infoTypesOption.setArgs(Option.UNLIMITED_VALUES);
630645
commandLineOptions.addOption(infoTypesOption);
631646

647+
Option customDictionariesOption =
648+
Option.builder("customDictionaries").hasArg(true).required(false).build();
649+
customDictionariesOption.setArgs(Option.UNLIMITED_VALUES);
650+
commandLineOptions.addOption(customDictionariesOption);
651+
652+
Option customRegexesOption =
653+
Option.builder("customRegexes").hasArg(true).required(false).build();
654+
customRegexesOption.setArgs(Option.UNLIMITED_VALUES);
655+
commandLineOptions.addOption(customRegexesOption);
656+
632657
Option includeQuoteOption = Option.builder("includeQuote").hasArg(true).required(false).build();
633658
commandLineOptions.addOption(includeQuoteOption);
634659

@@ -695,13 +720,62 @@ public static void main(String[] args) throws Exception {
695720
infoTypesList.add(InfoType.newBuilder().setName(infoType).build());
696721
}
697722
}
723+
724+
List<CustomInfoType> customInfoTypesList = new ArrayList<>();
725+
if (cmd.hasOption(customDictionariesOption.getOpt())) {
726+
String[] dictionaryStrings = cmd.getOptionValues(customDictionariesOption.getOpt());
727+
for (int i = 0; i < dictionaryStrings.length; i++) {
728+
String[] dictionaryWords = dictionaryStrings[i].split(",");
729+
CustomInfoType customInfoType =
730+
CustomInfoType
731+
.newBuilder()
732+
.setInfoType(
733+
InfoType.newBuilder().setName(String.format("CUSTOM_DICTIONARY_%s", i)))
734+
.setDictionary(
735+
Dictionary
736+
.newBuilder()
737+
.setWordList(
738+
WordList
739+
.newBuilder()
740+
.addAllWords(Arrays.<String>asList(dictionaryWords))))
741+
.build();
742+
customInfoTypesList.add(customInfoType);
743+
}
744+
}
745+
if (cmd.hasOption(customRegexesOption.getOpt())) {
746+
String[] patterns = cmd.getOptionValues(customRegexesOption.getOpt());
747+
for (int i = 0; i < patterns.length; i++) {
748+
CustomInfoType customInfoType =
749+
CustomInfoType
750+
.newBuilder()
751+
.setInfoType(InfoType.newBuilder().setName(String.format("CUSTOM_REGEX_%s", i)))
752+
.setRegex(Regex.newBuilder().setPattern(patterns[i]))
753+
.build();
754+
customInfoTypesList.add(customInfoType);
755+
}
756+
}
757+
698758
// string inspection
699759
if (cmd.hasOption("s")) {
700760
String val = cmd.getOptionValue(stringOption.getOpt());
701-
inspectString(val, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId);
761+
inspectString(
762+
val,
763+
minLikelihood,
764+
maxFindings,
765+
infoTypesList,
766+
customInfoTypesList,
767+
includeQuote,
768+
projectId);
702769
} else if (cmd.hasOption("f")) {
703770
String filePath = cmd.getOptionValue(fileOption.getOpt());
704-
inspectFile(filePath, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId);
771+
inspectFile(
772+
filePath,
773+
minLikelihood,
774+
maxFindings,
775+
infoTypesList,
776+
customInfoTypesList,
777+
includeQuote,
778+
projectId);
705779
// gcs file inspection
706780
} else if (cmd.hasOption("gcs")) {
707781
String bucketName = cmd.getOptionValue(bucketNameOption.getOpt());
@@ -711,6 +785,7 @@ public static void main(String[] args) throws Exception {
711785
fileName,
712786
minLikelihood,
713787
infoTypesList,
788+
customInfoTypesList,
714789
maxFindings,
715790
topicId,
716791
subscriptionId,
@@ -726,6 +801,7 @@ public static void main(String[] args) throws Exception {
726801
kind,
727802
minLikelihood,
728803
infoTypesList,
804+
customInfoTypesList,
729805
maxFindings,
730806
topicId,
731807
subscriptionId);
@@ -739,6 +815,7 @@ public static void main(String[] args) throws Exception {
739815
tableId,
740816
minLikelihood,
741817
infoTypesList,
818+
customInfoTypesList,
742819
maxFindings,
743820
topicId,
744821
subscriptionId);

0 commit comments

Comments
 (0)