Skip to content

Commit 8ba4acb

Browse files
author
Ace Nassri
committed
Add k-map sample
1 parent cd75921 commit 8ba4acb

File tree

2 files changed

+216
-5
lines changed

2 files changed

+216
-5
lines changed

dlp/src/main/java/com/example/dlp/RiskAnalysis.java

Lines changed: 195 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KAnonymityResult;
2828
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KAnonymityResult.KAnonymityEquivalenceClass;
2929
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KAnonymityResult.KAnonymityHistogramBucket;
30+
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KMapEstimationResult;
31+
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KMapEstimationResult.KMapEstimationHistogramBucket;
32+
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.KMapEstimationResult.KMapEstimationQuasiIdValues;
3033
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.LDiversityResult;
3134
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.LDiversityResult.LDiversityEquivalenceClass;
3235
import com.google.privacy.dlp.v2.AnalyzeDataSourceRiskDetails.LDiversityResult.LDiversityHistogramBucket;
@@ -35,9 +38,12 @@
3538
import com.google.privacy.dlp.v2.DlpJob;
3639
import com.google.privacy.dlp.v2.FieldId;
3740
import com.google.privacy.dlp.v2.GetDlpJobRequest;
41+
import com.google.privacy.dlp.v2.InfoType;
3842
import com.google.privacy.dlp.v2.PrivacyMetric;
3943
import com.google.privacy.dlp.v2.PrivacyMetric.CategoricalStatsConfig;
4044
import com.google.privacy.dlp.v2.PrivacyMetric.KAnonymityConfig;
45+
import com.google.privacy.dlp.v2.PrivacyMetric.KMapEstimationConfig;
46+
import com.google.privacy.dlp.v2.PrivacyMetric.KMapEstimationConfig.TaggedField;
4147
import com.google.privacy.dlp.v2.PrivacyMetric.LDiversityConfig;
4248
import com.google.privacy.dlp.v2.PrivacyMetric.NumericalStatsConfig;
4349
import com.google.privacy.dlp.v2.ProjectName;
@@ -46,7 +52,10 @@
4652
import com.google.privacy.dlp.v2.ValueFrequency;
4753
import com.google.pubsub.v1.ProjectSubscriptionName;
4854
import com.google.pubsub.v1.ProjectTopicName;
55+
56+
import java.util.ArrayList;
4957
import java.util.Arrays;
58+
import java.util.Collections;
5059
import java.util.List;
5160
import java.util.concurrent.TimeUnit;
5261
import java.util.concurrent.TimeoutException;
@@ -59,6 +68,7 @@
5968
import org.apache.commons.cli.OptionGroup;
6069
import org.apache.commons.cli.Options;
6170
import org.apache.commons.cli.ParseException;
71+
import java.util.Iterator;
6272

6373
public class RiskAnalysis {
6474

@@ -175,6 +185,8 @@ private static void numericalStatsAnalysis(
175185
}
176186
lastValue = currentValue;
177187
}
188+
} catch (Exception e) {
189+
System.out.println("Error in categoricalStatsAnalysis: " + e.getMessage());
178190
}
179191
}
180192
// [END dlp_numerical_stats]
@@ -419,7 +431,7 @@ private static void calculateKAnonymity(
419431
}
420432
}
421433
} catch (Exception e) {
422-
System.out.println("Error in kAnonymityAnalysis: " + e.getMessage());
434+
System.out.println("Error in calculateKAnonymity: " + e.getMessage());
423435
}
424436
}
425437
// [END dlp_k_anonymity]
@@ -555,11 +567,162 @@ private static void calculateLDiversity(
555567
}
556568
}
557569
} catch (Exception e) {
558-
System.out.println("Error in lDiversityAnalysis: " + e.getMessage());
570+
System.out.println("Error in calculateLDiversity: " + e.getMessage());
559571
}
560572
}
561573
// [END dlp_l_diversity]
562574

575+
// [START dlp_k_map]
576+
/**
577+
* Calculate k-map risk estimation for an attribute relative to quasi-identifiers in a BigQuery table.
578+
*
579+
* @param projectId The Google Cloud Platform project ID to run the API call under.
580+
* @param datasetId The BigQuery dataset to analyze.
581+
* @param tableId The BigQuery table to analyze.
582+
* @param quasiIds A set of column names that form a composite key ('quasi-identifiers').
583+
* @param infoTypes The infoTypes corresponding to each quasi-id column
584+
* @param regionCode An ISO-3166-1 region code specifying the k-map distribution region
585+
* @param topicId The name of the Pub/Sub topic to notify once the job completes
586+
* @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
587+
* completion status.
588+
*/
589+
private static void calculateKMap(
590+
String projectId,
591+
String datasetId,
592+
String tableId,
593+
List<String> quasiIds,
594+
List<InfoType> infoTypes,
595+
String regionCode,
596+
String topicId,
597+
String subscriptionId)
598+
throws Exception {
599+
600+
// Instantiates a client
601+
try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
602+
603+
Iterator<String> quasiIdsIterator = quasiIds.iterator();
604+
Iterator<InfoType> infoTypesIterator = infoTypes.iterator();
605+
606+
if (quasiIds.size() != infoTypes.size()) {
607+
throw new IllegalArgumentException("The numbers of quasi-IDs and infoTypes must be equal!");
608+
}
609+
610+
ArrayList<TaggedField> taggedFields = new ArrayList();
611+
612+
while (quasiIdsIterator.hasNext() || infoTypesIterator.hasNext()) {
613+
taggedFields.add(TaggedField.newBuilder()
614+
.setField(FieldId.newBuilder().setName(quasiIdsIterator.next()).build())
615+
.setInfoType(infoTypesIterator.next())
616+
.build());
617+
}
618+
619+
KMapEstimationConfig kmapConfig =
620+
KMapEstimationConfig.newBuilder()
621+
.addAllQuasiIds(taggedFields)
622+
.setRegionCode(regionCode)
623+
.build();
624+
625+
BigQueryTable bigQueryTable =
626+
BigQueryTable.newBuilder()
627+
.setProjectId(projectId)
628+
.setDatasetId(datasetId)
629+
.setTableId(tableId)
630+
.build();
631+
632+
PrivacyMetric privacyMetric =
633+
PrivacyMetric.newBuilder().setKMapEstimationConfig(kmapConfig).build();
634+
635+
String topicName = String.format("projects/%s/topics/%s", projectId, topicId);
636+
637+
PublishToPubSub publishToPubSub = PublishToPubSub.newBuilder().setTopic(topicName).build();
638+
639+
// Create action to publish job status notifications over Google Cloud Pub/Sub
640+
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
641+
642+
RiskAnalysisJobConfig riskAnalysisJobConfig =
643+
RiskAnalysisJobConfig.newBuilder()
644+
.setSourceTable(bigQueryTable)
645+
.setPrivacyMetric(privacyMetric)
646+
.addActions(action)
647+
.build();
648+
649+
CreateDlpJobRequest createDlpJobRequest =
650+
CreateDlpJobRequest.newBuilder()
651+
.setParent(ProjectName.of(projectId).toString())
652+
.setRiskJob(riskAnalysisJobConfig)
653+
.build();
654+
655+
DlpJob dlpJob = dlpServiceClient.createDlpJob(createDlpJobRequest);
656+
String dlpJobName = dlpJob.getName();
657+
658+
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
659+
660+
// Set up a Pub/Sub subscriber to listen on the job completion status
661+
Subscriber subscriber =
662+
Subscriber.newBuilder(
663+
ProjectSubscriptionName.newBuilder()
664+
.setProject(projectId)
665+
.setSubscription(subscriptionId)
666+
.build(),
667+
(pubsubMessage, ackReplyConsumer) -> {
668+
if (pubsubMessage.getAttributesCount() > 0
669+
&& pubsubMessage.getAttributesMap().get("DlpJobName").equals(dlpJobName)) {
670+
// notify job completion
671+
done.set(true);
672+
ackReplyConsumer.ack();
673+
}
674+
})
675+
.build();
676+
subscriber.startAsync();
677+
678+
// Wait for job completion semi-synchronously
679+
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
680+
try{
681+
done.get(1, TimeUnit.MINUTES);
682+
Thread.sleep(500); // Wait for the job to become available
683+
} catch (TimeoutException e) {
684+
System.out.println("Unable to verify job completion.");
685+
}
686+
687+
// retrieve completed job status
688+
DlpJob completedJob =
689+
dlpServiceClient.getDlpJob(GetDlpJobRequest.newBuilder().setName(dlpJobName).build());
690+
691+
System.out.println("Job status: " + completedJob.getState());
692+
AnalyzeDataSourceRiskDetails riskDetails = completedJob.getRiskDetails();
693+
694+
KMapEstimationResult kmapResult = riskDetails.getKMapEstimationResult();
695+
for (KMapEstimationHistogramBucket result :
696+
kmapResult.getKMapEstimationHistogramList()) {
697+
698+
System.out.printf("\tAnonymity range: [%d, %d]\n",
699+
result.getMinAnonymity(),
700+
result.getMaxAnonymity());
701+
System.out.printf("\tSize: %d\n", result.getBucketSize());
702+
703+
for (KMapEstimationQuasiIdValues valueBucket : result.getBucketValuesList()) {
704+
String quasiIdValues =
705+
valueBucket
706+
.getQuasiIdsValuesList()
707+
.stream()
708+
.map(v -> {
709+
String s = v.toString();
710+
return s.substring(s.indexOf(':') + 1).trim();
711+
})
712+
.collect(Collectors.joining(", "));
713+
714+
715+
System.out.printf("\tValues: {%s}\n", quasiIdValues);
716+
System.out.printf("\tEstimated k-map anonymity: %d\n",
717+
valueBucket.getEstimatedAnonymity());
718+
}
719+
}
720+
} catch (Exception e) {
721+
System.out.println("Error in calculateKMap: " + e.getMessage());
722+
}
723+
}
724+
// [END dlp_k_map]
725+
563726
/**
564727
* Command line application to perform risk analysis using the Data Loss Prevention API. Supported
565728
* data format: BigQuery tables
@@ -575,9 +738,12 @@ public static void main(String[] args) throws Exception {
575738
Option categoricalAnalysisOption = new Option("c", "categorical");
576739
optionsGroup.addOption(categoricalAnalysisOption);
577740

578-
Option kanonymityOption = new Option("k", "kAnonymity");
741+
Option kanonymityOption = new Option("a", "kAnonymity");
579742
optionsGroup.addOption(kanonymityOption);
580743

744+
Option kmapOption = new Option("m", "kAnonymity");
745+
optionsGroup.addOption(kmapOption);
746+
581747
Option ldiversityOption = new Option("l", "lDiversity");
582748
optionsGroup.addOption(ldiversityOption);
583749

@@ -607,10 +773,19 @@ public static void main(String[] args) throws Exception {
607773
Option.builder("sensitiveAttribute").hasArg(true).required(false).build();
608774
commandLineOptions.addOption(sensitiveAttributeOption);
609775

776+
Option regionCodeOption =
777+
Option.builder("regionCode").hasArg(true).required(false).build();
778+
commandLineOptions.addOption(regionCodeOption);
779+
610780
Option quasiIdColumnNamesOption =
611781
Option.builder("quasiIdColumnNames").hasArg(true).required(false).build();
782+
quasiIdColumnNamesOption.setArgs(Option.UNLIMITED_VALUES);
612783
commandLineOptions.addOption(quasiIdColumnNamesOption);
613784

785+
Option infoTypesOption = Option.builder("infoTypes").hasArg(true).required(false).build();
786+
infoTypesOption.setArgs(Option.UNLIMITED_VALUES);
787+
commandLineOptions.addOption(infoTypesOption);
788+
614789
CommandLineParser parser = new DefaultParser();
615790
HelpFormatter formatter = new HelpFormatter();
616791
CommandLine cmd;
@@ -630,9 +805,20 @@ public static void main(String[] args) throws Exception {
630805
String projectId =
631806
cmd.getOptionValue(projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
632807

808+
String regionCode = cmd.getOptionValue(regionCodeOption.getOpt(), "US");
809+
633810
String topicId = cmd.getOptionValue(topicIdOption.getOpt());
634811
String subscriptionId = cmd.getOptionValue(subscriptionIdOption.getOpt());
635812

813+
List<InfoType> infoTypesList = Collections.emptyList();
814+
if (cmd.hasOption(infoTypesOption.getOpt())) {
815+
infoTypesList = new ArrayList<>();
816+
String[] infoTypes = cmd.getOptionValues(infoTypesOption.getOpt());
817+
for (String infoType : infoTypes) {
818+
infoTypesList.add(InfoType.newBuilder().setName(infoType).build());
819+
}
820+
}
821+
636822
if (cmd.hasOption("n")) {
637823
// numerical stats analysis
638824
String columnName = cmd.getOptionValue(columnNameOption.getOpt());
@@ -641,12 +827,17 @@ public static void main(String[] args) throws Exception {
641827
// categorical stats analysis
642828
String columnName = cmd.getOptionValue(columnNameOption.getOpt());
643829
categoricalStatsAnalysis(projectId, datasetId, tableId, columnName, topicId, subscriptionId);
644-
} else if (cmd.hasOption("k")) {
830+
} else if (cmd.hasOption("a")) {
645831
// k-anonymity analysis
646832
List<String> quasiIdColumnNames =
647833
Arrays.asList(cmd.getOptionValues(quasiIdColumnNamesOption.getOpt()));
648834
calculateKAnonymity(
649835
projectId, datasetId, tableId, quasiIdColumnNames, topicId, subscriptionId);
836+
} else if (cmd.hasOption("m")) {
837+
// k-map analysis
838+
List<String> quasiIdColumnNames =
839+
Arrays.asList(cmd.getOptionValues(quasiIdColumnNamesOption.getOpt()));
840+
calculateKMap(projectId, datasetId, tableId, quasiIdColumnNames, infoTypesList, regionCode, topicId, subscriptionId);
650841
} else if (cmd.hasOption("l")) {
651842
// l-diversity analysis
652843
String sensitiveAttribute = cmd.getOptionValue(sensitiveAttributeOption.getOpt());

dlp/src/test/java/com/example/dlp/RiskAnalysisIT.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ public void testCategoricalStats() throws Exception {
8686
@Test
8787
public void testKAnonymity() throws Exception {
8888
RiskAnalysis.main(new String[]{
89-
"-k",
89+
"-a",
9090
"-datasetId", "integration_tests_dlp",
9191
"-tableId", "harmful",
9292
"-quasiIdColumnNames", "Age", "Mystery",
@@ -117,6 +117,26 @@ public void testLDiversity() throws Exception {
117117
assertTrue(output.contains("Sensitive value string_value: \"James\""));
118118
}
119119

120+
@Test
121+
public void testKMap() throws Exception {
122+
RiskAnalysis.main(
123+
new String[] {
124+
"-m",
125+
"-datasetId", "integration_tests_dlp",
126+
"-tableId", "harmful",
127+
"-topicId", topicId,
128+
"-subscriptionId", subscriptionId,
129+
"-regionCode", "US",
130+
"-quasiIdColumnNames", "Age", "Gender",
131+
"-infoTypes", "AGE", "GENDER"
132+
});
133+
String output = bout.toString();
134+
135+
assertTrue(Pattern.compile("Anonymity range: \\[\\d, \\d]").matcher(output).find());
136+
assertTrue(Pattern.compile("Size: \\d").matcher(output).find());
137+
assertTrue(Pattern.compile("Values: \\{\\d{2}, \"Female\"\\}").matcher(output).find());
138+
}
139+
120140
@After
121141
public void tearDown() {
122142
System.setOut(null);

0 commit comments

Comments
 (0)