27
27
import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .KAnonymityResult ;
28
28
import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .KAnonymityResult .KAnonymityEquivalenceClass ;
29
29
import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .KAnonymityResult .KAnonymityHistogramBucket ;
30
+ import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .KMapEstimationResult ;
31
+ import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .KMapEstimationResult .KMapEstimationHistogramBucket ;
32
+ import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .KMapEstimationResult .KMapEstimationQuasiIdValues ;
30
33
import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .LDiversityResult ;
31
34
import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .LDiversityResult .LDiversityEquivalenceClass ;
32
35
import com .google .privacy .dlp .v2 .AnalyzeDataSourceRiskDetails .LDiversityResult .LDiversityHistogramBucket ;
35
38
import com .google .privacy .dlp .v2 .DlpJob ;
36
39
import com .google .privacy .dlp .v2 .FieldId ;
37
40
import com .google .privacy .dlp .v2 .GetDlpJobRequest ;
41
+ import com .google .privacy .dlp .v2 .InfoType ;
38
42
import com .google .privacy .dlp .v2 .PrivacyMetric ;
39
43
import com .google .privacy .dlp .v2 .PrivacyMetric .CategoricalStatsConfig ;
40
44
import com .google .privacy .dlp .v2 .PrivacyMetric .KAnonymityConfig ;
45
+ import com .google .privacy .dlp .v2 .PrivacyMetric .KMapEstimationConfig ;
46
+ import com .google .privacy .dlp .v2 .PrivacyMetric .KMapEstimationConfig .TaggedField ;
41
47
import com .google .privacy .dlp .v2 .PrivacyMetric .LDiversityConfig ;
42
48
import com .google .privacy .dlp .v2 .PrivacyMetric .NumericalStatsConfig ;
43
49
import com .google .privacy .dlp .v2 .ProjectName ;
46
52
import com .google .privacy .dlp .v2 .ValueFrequency ;
47
53
import com .google .pubsub .v1 .ProjectSubscriptionName ;
48
54
import com .google .pubsub .v1 .ProjectTopicName ;
55
+
56
+ import java .util .ArrayList ;
49
57
import java .util .Arrays ;
58
+ import java .util .Collections ;
50
59
import java .util .List ;
51
60
import java .util .concurrent .TimeUnit ;
52
61
import java .util .concurrent .TimeoutException ;
59
68
import org .apache .commons .cli .OptionGroup ;
60
69
import org .apache .commons .cli .Options ;
61
70
import org .apache .commons .cli .ParseException ;
71
+ import java .util .Iterator ;
62
72
63
73
public class RiskAnalysis {
64
74
@@ -175,6 +185,8 @@ private static void numericalStatsAnalysis(
175
185
}
176
186
lastValue = currentValue ;
177
187
}
188
+ } catch (Exception e ) {
189
+ System .out .println ("Error in categoricalStatsAnalysis: " + e .getMessage ());
178
190
}
179
191
}
180
192
// [END dlp_numerical_stats]
@@ -419,7 +431,7 @@ private static void calculateKAnonymity(
419
431
}
420
432
}
421
433
} catch (Exception e ) {
422
- System .out .println ("Error in kAnonymityAnalysis : " + e .getMessage ());
434
+ System .out .println ("Error in calculateKAnonymity : " + e .getMessage ());
423
435
}
424
436
}
425
437
// [END dlp_k_anonymity]
@@ -555,11 +567,162 @@ private static void calculateLDiversity(
555
567
}
556
568
}
557
569
} catch (Exception e ) {
558
- System .out .println ("Error in lDiversityAnalysis : " + e .getMessage ());
570
+ System .out .println ("Error in calculateLDiversity : " + e .getMessage ());
559
571
}
560
572
}
561
573
// [END dlp_l_diversity]
562
574
575
+ // [START dlp_k_map]
576
+ /**
577
+ * Calculate k-map risk estimation for an attribute relative to quasi-identifiers in a BigQuery table.
578
+ *
579
+ * @param projectId The Google Cloud Platform project ID to run the API call under.
580
+ * @param datasetId The BigQuery dataset to analyze.
581
+ * @param tableId The BigQuery table to analyze.
582
+ * @param quasiIds A set of column names that form a composite key ('quasi-identifiers').
583
+ * @param infoTypes The infoTypes corresponding to each quasi-id column
584
+ * @param regionCode An ISO-3166-1 region code specifying the k-map distribution region
585
+ * @param topicId The name of the Pub/Sub topic to notify once the job completes
586
+ * @param subscriptionId The name of the Pub/Sub subscription to use when listening for job
587
+ * completion status.
588
+ */
589
+ private static void calculateKMap (
590
+ String projectId ,
591
+ String datasetId ,
592
+ String tableId ,
593
+ List <String > quasiIds ,
594
+ List <InfoType > infoTypes ,
595
+ String regionCode ,
596
+ String topicId ,
597
+ String subscriptionId )
598
+ throws Exception {
599
+
600
+ // Instantiates a client
601
+ try (DlpServiceClient dlpServiceClient = DlpServiceClient .create ()) {
602
+
603
+ Iterator <String > quasiIdsIterator = quasiIds .iterator ();
604
+ Iterator <InfoType > infoTypesIterator = infoTypes .iterator ();
605
+
606
+ if (quasiIds .size () != infoTypes .size ()) {
607
+ throw new IllegalArgumentException ("The numbers of quasi-IDs and infoTypes must be equal!" );
608
+ }
609
+
610
+ ArrayList <TaggedField > taggedFields = new ArrayList ();
611
+
612
+ while (quasiIdsIterator .hasNext () || infoTypesIterator .hasNext ()) {
613
+ taggedFields .add (TaggedField .newBuilder ()
614
+ .setField (FieldId .newBuilder ().setName (quasiIdsIterator .next ()).build ())
615
+ .setInfoType (infoTypesIterator .next ())
616
+ .build ());
617
+ }
618
+
619
+ KMapEstimationConfig kmapConfig =
620
+ KMapEstimationConfig .newBuilder ()
621
+ .addAllQuasiIds (taggedFields )
622
+ .setRegionCode (regionCode )
623
+ .build ();
624
+
625
+ BigQueryTable bigQueryTable =
626
+ BigQueryTable .newBuilder ()
627
+ .setProjectId (projectId )
628
+ .setDatasetId (datasetId )
629
+ .setTableId (tableId )
630
+ .build ();
631
+
632
+ PrivacyMetric privacyMetric =
633
+ PrivacyMetric .newBuilder ().setKMapEstimationConfig (kmapConfig ).build ();
634
+
635
+ String topicName = String .format ("projects/%s/topics/%s" , projectId , topicId );
636
+
637
+ PublishToPubSub publishToPubSub = PublishToPubSub .newBuilder ().setTopic (topicName ).build ();
638
+
639
+ // Create action to publish job status notifications over Google Cloud Pub/Sub
640
+ Action action = Action .newBuilder ().setPubSub (publishToPubSub ).build ();
641
+
642
+ RiskAnalysisJobConfig riskAnalysisJobConfig =
643
+ RiskAnalysisJobConfig .newBuilder ()
644
+ .setSourceTable (bigQueryTable )
645
+ .setPrivacyMetric (privacyMetric )
646
+ .addActions (action )
647
+ .build ();
648
+
649
+ CreateDlpJobRequest createDlpJobRequest =
650
+ CreateDlpJobRequest .newBuilder ()
651
+ .setParent (ProjectName .of (projectId ).toString ())
652
+ .setRiskJob (riskAnalysisJobConfig )
653
+ .build ();
654
+
655
+ DlpJob dlpJob = dlpServiceClient .createDlpJob (createDlpJobRequest );
656
+ String dlpJobName = dlpJob .getName ();
657
+
658
+ final SettableApiFuture <Boolean > done = SettableApiFuture .create ();
659
+
660
+ // Set up a Pub/Sub subscriber to listen on the job completion status
661
+ Subscriber subscriber =
662
+ Subscriber .newBuilder (
663
+ ProjectSubscriptionName .newBuilder ()
664
+ .setProject (projectId )
665
+ .setSubscription (subscriptionId )
666
+ .build (),
667
+ (pubsubMessage , ackReplyConsumer ) -> {
668
+ if (pubsubMessage .getAttributesCount () > 0
669
+ && pubsubMessage .getAttributesMap ().get ("DlpJobName" ).equals (dlpJobName )) {
670
+ // notify job completion
671
+ done .set (true );
672
+ ackReplyConsumer .ack ();
673
+ }
674
+ })
675
+ .build ();
676
+ subscriber .startAsync ();
677
+
678
+ // Wait for job completion semi-synchronously
679
+ // For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
680
+ try {
681
+ done .get (1 , TimeUnit .MINUTES );
682
+ Thread .sleep (500 ); // Wait for the job to become available
683
+ } catch (TimeoutException e ) {
684
+ System .out .println ("Unable to verify job completion." );
685
+ }
686
+
687
+ // retrieve completed job status
688
+ DlpJob completedJob =
689
+ dlpServiceClient .getDlpJob (GetDlpJobRequest .newBuilder ().setName (dlpJobName ).build ());
690
+
691
+ System .out .println ("Job status: " + completedJob .getState ());
692
+ AnalyzeDataSourceRiskDetails riskDetails = completedJob .getRiskDetails ();
693
+
694
+ KMapEstimationResult kmapResult = riskDetails .getKMapEstimationResult ();
695
+ for (KMapEstimationHistogramBucket result :
696
+ kmapResult .getKMapEstimationHistogramList ()) {
697
+
698
+ System .out .printf ("\t Anonymity range: [%d, %d]\n " ,
699
+ result .getMinAnonymity (),
700
+ result .getMaxAnonymity ());
701
+ System .out .printf ("\t Size: %d\n " , result .getBucketSize ());
702
+
703
+ for (KMapEstimationQuasiIdValues valueBucket : result .getBucketValuesList ()) {
704
+ String quasiIdValues =
705
+ valueBucket
706
+ .getQuasiIdsValuesList ()
707
+ .stream ()
708
+ .map (v -> {
709
+ String s = v .toString ();
710
+ return s .substring (s .indexOf (':' ) + 1 ).trim ();
711
+ })
712
+ .collect (Collectors .joining (", " ));
713
+
714
+
715
+ System .out .printf ("\t Values: {%s}\n " , quasiIdValues );
716
+ System .out .printf ("\t Estimated k-map anonymity: %d\n " ,
717
+ valueBucket .getEstimatedAnonymity ());
718
+ }
719
+ }
720
+ } catch (Exception e ) {
721
+ System .out .println ("Error in calculateKMap: " + e .getMessage ());
722
+ }
723
+ }
724
+ // [END dlp_k_map]
725
+
563
726
/**
564
727
* Command line application to perform risk analysis using the Data Loss Prevention API. Supported
565
728
* data format: BigQuery tables
@@ -575,9 +738,12 @@ public static void main(String[] args) throws Exception {
575
738
Option categoricalAnalysisOption = new Option ("c" , "categorical" );
576
739
optionsGroup .addOption (categoricalAnalysisOption );
577
740
578
- Option kanonymityOption = new Option ("k " , "kAnonymity" );
741
+ Option kanonymityOption = new Option ("a " , "kAnonymity" );
579
742
optionsGroup .addOption (kanonymityOption );
580
743
744
+ Option kmapOption = new Option ("m" , "kAnonymity" );
745
+ optionsGroup .addOption (kmapOption );
746
+
581
747
Option ldiversityOption = new Option ("l" , "lDiversity" );
582
748
optionsGroup .addOption (ldiversityOption );
583
749
@@ -607,10 +773,19 @@ public static void main(String[] args) throws Exception {
607
773
Option .builder ("sensitiveAttribute" ).hasArg (true ).required (false ).build ();
608
774
commandLineOptions .addOption (sensitiveAttributeOption );
609
775
776
+ Option regionCodeOption =
777
+ Option .builder ("regionCode" ).hasArg (true ).required (false ).build ();
778
+ commandLineOptions .addOption (regionCodeOption );
779
+
610
780
Option quasiIdColumnNamesOption =
611
781
Option .builder ("quasiIdColumnNames" ).hasArg (true ).required (false ).build ();
782
+ quasiIdColumnNamesOption .setArgs (Option .UNLIMITED_VALUES );
612
783
commandLineOptions .addOption (quasiIdColumnNamesOption );
613
784
785
+ Option infoTypesOption = Option .builder ("infoTypes" ).hasArg (true ).required (false ).build ();
786
+ infoTypesOption .setArgs (Option .UNLIMITED_VALUES );
787
+ commandLineOptions .addOption (infoTypesOption );
788
+
614
789
CommandLineParser parser = new DefaultParser ();
615
790
HelpFormatter formatter = new HelpFormatter ();
616
791
CommandLine cmd ;
@@ -630,9 +805,20 @@ public static void main(String[] args) throws Exception {
630
805
String projectId =
631
806
cmd .getOptionValue (projectIdOption .getOpt (), ServiceOptions .getDefaultProjectId ());
632
807
808
+ String regionCode = cmd .getOptionValue (regionCodeOption .getOpt (), "US" );
809
+
633
810
String topicId = cmd .getOptionValue (topicIdOption .getOpt ());
634
811
String subscriptionId = cmd .getOptionValue (subscriptionIdOption .getOpt ());
635
812
813
+ List <InfoType > infoTypesList = Collections .emptyList ();
814
+ if (cmd .hasOption (infoTypesOption .getOpt ())) {
815
+ infoTypesList = new ArrayList <>();
816
+ String [] infoTypes = cmd .getOptionValues (infoTypesOption .getOpt ());
817
+ for (String infoType : infoTypes ) {
818
+ infoTypesList .add (InfoType .newBuilder ().setName (infoType ).build ());
819
+ }
820
+ }
821
+
636
822
if (cmd .hasOption ("n" )) {
637
823
// numerical stats analysis
638
824
String columnName = cmd .getOptionValue (columnNameOption .getOpt ());
@@ -641,12 +827,17 @@ public static void main(String[] args) throws Exception {
641
827
// categorical stats analysis
642
828
String columnName = cmd .getOptionValue (columnNameOption .getOpt ());
643
829
categoricalStatsAnalysis (projectId , datasetId , tableId , columnName , topicId , subscriptionId );
644
- } else if (cmd .hasOption ("k " )) {
830
+ } else if (cmd .hasOption ("a " )) {
645
831
// k-anonymity analysis
646
832
List <String > quasiIdColumnNames =
647
833
Arrays .asList (cmd .getOptionValues (quasiIdColumnNamesOption .getOpt ()));
648
834
calculateKAnonymity (
649
835
projectId , datasetId , tableId , quasiIdColumnNames , topicId , subscriptionId );
836
+ } else if (cmd .hasOption ("m" )) {
837
+ // k-map analysis
838
+ List <String > quasiIdColumnNames =
839
+ Arrays .asList (cmd .getOptionValues (quasiIdColumnNamesOption .getOpt ()));
840
+ calculateKMap (projectId , datasetId , tableId , quasiIdColumnNames , infoTypesList , regionCode , topicId , subscriptionId );
650
841
} else if (cmd .hasOption ("l" )) {
651
842
// l-diversity analysis
652
843
String sensitiveAttribute = cmd .getOptionValue (sensitiveAttributeOption .getOpt ());
0 commit comments