xref: /aosp_15_r20/external/googleapis/google/privacy/dlp/v2/storage.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
1// Copyright 2023 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15syntax = "proto3";
16
17package google.privacy.dlp.v2;
18
19import "google/api/resource.proto";
20import "google/protobuf/timestamp.proto";
21
22option csharp_namespace = "Google.Cloud.Dlp.V2";
23option go_package = "cloud.google.com/go/dlp/apiv2/dlppb;dlppb";
24option java_multiple_files = true;
25option java_outer_classname = "DlpStorage";
26option java_package = "com.google.privacy.dlp.v2";
27option php_namespace = "Google\\Cloud\\Dlp\\V2";
28option ruby_package = "Google::Cloud::Dlp::V2";
29
30// Type of information detected by the API.
31message InfoType {
32  // Name of the information type. Either a name of your choosing when
33  // creating a `CustomInfoType`, or one of the names listed
34  // at
35  // https://cloud.google.com/sensitive-data-protection/docs/infotypes-reference
36  // when specifying a built-in type. When sending Cloud DLP results to Data
37  // Catalog, infoType names should conform to the pattern
38  // `[A-Za-z0-9$_-]{1,64}`.
39  string name = 1;
40
41  // Optional version name for this InfoType.
42  string version = 2;
43
44  // Optional custom sensitivity for this InfoType.
45  // This only applies to data profiling.
46  SensitivityScore sensitivity_score = 3;
47}
48
49// Score is calculated from all elements in the data profile.
50// A higher level means the data is more sensitive.
51message SensitivityScore {
52  // Various sensitivity score levels for resources.
53  enum SensitivityScoreLevel {
54    // Unused.
55    SENSITIVITY_SCORE_UNSPECIFIED = 0;
56
57    // No sensitive information detected. The resource isn't publicly
58    // accessible.
59    SENSITIVITY_LOW = 10;
60
61    // Medium risk. Contains personally identifiable information (PII),
62    // potentially sensitive data, or fields with free-text data that are at a
63    // higher risk of having intermittent sensitive data. Consider limiting
64    // access.
65    SENSITIVITY_MODERATE = 20;
66
67    // High risk. Sensitive personally identifiable information (SPII) can be
68    // present. Exfiltration of data can lead to user data loss.
69    // Re-identification of users might be possible. Consider limiting usage
70    // and/or removing SPII.
71    SENSITIVITY_HIGH = 30;
72  }
73
74  // The sensitivity score applied to the resource.
75  SensitivityScoreLevel score = 1;
76}
77
78// Coarse-grained confidence level of how well a particular finding
79// satisfies the criteria to match a particular infoType.
80//
81// Likelihood is calculated based on the number of signals a
82// finding has that imply that the finding matches the infoType. For
83// example, a string that has an '@' and a '.com' is more likely to be a
84// match for an email address than a string that only has an '@'.
85//
86// In general, the highest likelihood level has the strongest signals that
87// indicate a match. That is, a finding with a high likelihood has a low chance
88// of being a false positive.
89//
90// For more information about each likelihood level
91// and how likelihood works, see [Match
92// likelihood](https://cloud.google.com/sensitive-data-protection/docs/likelihood).
93enum Likelihood {
94  // Default value; same as `POSSIBLE`.
95  LIKELIHOOD_UNSPECIFIED = 0;
96
97  // Highest chance of a false positive.
98  VERY_UNLIKELY = 1;
99
100  // High chance of a false positive.
101  UNLIKELY = 2;
102
103  // Some matching signals. The default value.
104  POSSIBLE = 3;
105
106  // Low chance of a false positive.
107  LIKELY = 4;
108
109  // Confidence level is high. Lowest chance of a false positive.
110  VERY_LIKELY = 5;
111}
112
113// A reference to a `StoredInfoType` to use with scanning.
114message StoredType {
115  // Resource name of the requested `StoredInfoType`, for example
116  // `organizations/433245324/storedInfoTypes/432452342` or
117  // `projects/project-id/storedInfoTypes/432452342`.
118  string name = 1;
119
120  // Timestamp indicating when the version of the `StoredInfoType` used for
121  // inspection was created. Output-only field, populated by the system.
122  google.protobuf.Timestamp create_time = 2;
123}
124
125// Custom information type provided by the user. Used to find domain-specific
126// sensitive information configurable to the data in question.
127message CustomInfoType {
128  // Custom information type based on a dictionary of words or phrases. This can
129  // be used to match sensitive information specific to the data, such as a list
130  // of employee IDs or job titles.
131  //
132  // Dictionary words are case-insensitive and all characters other than letters
133  // and digits in the Unicode [Basic Multilingual
134  // Plane](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane)
135  // will be replaced with whitespace when scanning for matches, so the
136  // dictionary phrase "Sam Johnson" will match all three phrases "sam johnson",
137  // "Sam, Johnson", and "Sam (Johnson)". Additionally, the characters
138  // surrounding any match must be of a different type than the adjacent
139  // characters within the word, so letters must be next to non-letters and
140  // digits next to non-digits. For example, the dictionary word "jen" will
141  // match the first three letters of the text "jen123" but will return no
142  // matches for "jennifer".
143  //
144  // Dictionary words containing a large number of characters that are not
145  // letters or digits may result in unexpected findings because such characters
146  // are treated as whitespace. The
147  // [limits](https://cloud.google.com/sensitive-data-protection/limits) page
148  // contains details about the size limits of dictionaries. For dictionaries
149  // that do not fit within these constraints, consider using
150  // `LargeCustomDictionaryConfig` in the `StoredInfoType` API.
151  message Dictionary {
152    // Message defining a list of words or phrases to search for in the data.
153    message WordList {
154      // Words or phrases defining the dictionary. The dictionary must contain
155      // at least one phrase and every phrase must contain at least 2 characters
156      // that are letters or digits. [required]
157      repeated string words = 1;
158    }
159
160    // The potential places the data can be read from.
161    oneof source {
162      // List of words or phrases to search for.
163      WordList word_list = 1;
164
165      // Newline-delimited file of words in Cloud Storage. Only a single file
166      // is accepted.
167      CloudStoragePath cloud_storage_path = 3;
168    }
169  }
170
171  // Message defining a custom regular expression.
172  message Regex {
173    // Pattern defining the regular expression. Its syntax
174    // (https://github.com/google/re2/wiki/Syntax) can be found under the
175    // google/re2 repository on GitHub.
176    string pattern = 1;
177
178    // The index of the submatch to extract as findings. When not
179    // specified, the entire match is returned. No more than 3 may be included.
180    repeated int32 group_indexes = 2;
181  }
182
183  // Message for detecting output from deidentification transformations
184  // such as
185  // [`CryptoReplaceFfxFpeConfig`](https://cloud.google.com/sensitive-data-protection/docs/reference/rest/v2/organizations.deidentifyTemplates#cryptoreplaceffxfpeconfig).
186  // These types of transformations are
187  // those that perform pseudonymization, thereby producing a "surrogate" as
188  // output. This should be used in conjunction with a field on the
189  // transformation such as `surrogate_info_type`. This CustomInfoType does
190  // not support the use of `detection_rules`.
191  message SurrogateType {}
192
193  // Deprecated; use `InspectionRuleSet` instead. Rule for modifying a
194  // `CustomInfoType` to alter behavior under certain circumstances, depending
195  // on the specific details of the rule. Not supported for the `surrogate_type`
196  // custom infoType.
197  message DetectionRule {
198    // Message for specifying a window around a finding to apply a detection
199    // rule.
200    message Proximity {
201      // Number of characters before the finding to consider. For tabular data,
202      // if you want to modify the likelihood of an entire column of findings,
203      // set this to 1. For more information, see
204      // [Hotword example: Set the match likelihood of a table column]
205      // (https://cloud.google.com/sensitive-data-protection/docs/creating-custom-infotypes-likelihood#match-column-values).
206      int32 window_before = 1;
207
208      // Number of characters after the finding to consider.
209      int32 window_after = 2;
210    }
211
212    // Message for specifying an adjustment to the likelihood of a finding as
213    // part of a detection rule.
214    message LikelihoodAdjustment {
215      // How the likelihood will be modified.
216      oneof adjustment {
217        // Set the likelihood of a finding to a fixed value.
218        Likelihood fixed_likelihood = 1;
219
220        // Increase or decrease the likelihood by the specified number of
221        // levels. For example, if a finding would be `POSSIBLE` without the
222        // detection rule and `relative_likelihood` is 1, then it is upgraded to
223        // `LIKELY`, while a value of -1 would downgrade it to `UNLIKELY`.
224        // Likelihood may never drop below `VERY_UNLIKELY` or exceed
225        // `VERY_LIKELY`, so applying an adjustment of 1 followed by an
226        // adjustment of -1 when base likelihood is `VERY_LIKELY` will result in
227        // a final likelihood of `LIKELY`.
228        int32 relative_likelihood = 2;
229      }
230    }
231
232    // The rule that adjusts the likelihood of findings within a certain
233    // proximity of hotwords.
234    message HotwordRule {
235      // Regular expression pattern defining what qualifies as a hotword.
236      Regex hotword_regex = 1;
237
238      // Range of characters within which the entire hotword must reside.
239      // The total length of the window cannot exceed 1000 characters.
240      // The finding itself will be included in the window, so that hotwords can
241      // be used to match substrings of the finding itself. Suppose you
242      // want Cloud DLP to promote the likelihood of the phone number
243      // regex "\(\d{3}\) \d{3}-\d{4}" if the area code is known to be the
244      // area code of a company's office. In this case, use the hotword regex
245      // "\(xxx\)", where "xxx" is the area code in question.
246      //
247      // For tabular data, if you want to modify the likelihood of an entire
248      // column of findings, see
249      // [Hotword example: Set the match likelihood of a table column]
250      // (https://cloud.google.com/sensitive-data-protection/docs/creating-custom-infotypes-likelihood#match-column-values).
251      Proximity proximity = 2;
252
253      // Likelihood adjustment to apply to all matching findings.
254      LikelihoodAdjustment likelihood_adjustment = 3;
255    }
256
257    // Type of hotword rule.
258    oneof type {
259      // Hotword-based detection rule.
260      HotwordRule hotword_rule = 1;
261    }
262  }
263
264  // Type of exclusion rule.
265  enum ExclusionType {
266    // A finding of this custom info type will not be excluded from results.
267    EXCLUSION_TYPE_UNSPECIFIED = 0;
268
269    // A finding of this custom info type will be excluded from final results,
270    // but can still affect rule execution.
271    EXCLUSION_TYPE_EXCLUDE = 1;
272  }
273
274  // CustomInfoType can either be a new infoType, or an extension of built-in
275  // infoType, when the name matches one of existing infoTypes and that infoType
276  // is specified in `InspectContent.info_types` field. Specifying the latter
277  // adds findings to the one detected by the system. If built-in info type is
278  // not specified in `InspectContent.info_types` list then the name is treated
279  // as a custom info type.
280  InfoType info_type = 1;
281
282  // Likelihood to return for this CustomInfoType. This base value can be
283  // altered by a detection rule if the finding meets the criteria specified by
284  // the rule. Defaults to `VERY_LIKELY` if not specified.
285  Likelihood likelihood = 6;
286
287  // Type of custom detector.
288  oneof type {
289    // A list of phrases to detect as a CustomInfoType.
290    Dictionary dictionary = 2;
291
292    // Regular expression based CustomInfoType.
293    Regex regex = 3;
294
295    // Message for detecting output from deidentification transformations that
296    // support reversing.
297    SurrogateType surrogate_type = 4;
298
299    // Load an existing `StoredInfoType` resource for use in
300    // `InspectDataSource`. Not currently supported in `InspectContent`.
301    StoredType stored_type = 5;
302  }
303
304  // Set of detection rules to apply to all findings of this CustomInfoType.
305  // Rules are applied in order that they are specified. Not supported for the
306  // `surrogate_type` CustomInfoType.
307  repeated DetectionRule detection_rules = 7;
308
309  // If set to EXCLUSION_TYPE_EXCLUDE this infoType will not cause a finding
310  // to be returned. It still can be used for rules matching.
311  ExclusionType exclusion_type = 8;
312
313  // Sensitivity for this CustomInfoType. If this CustomInfoType extends an
314  // existing InfoType, the sensitivity here will take precedence over that of
315  // the original InfoType. If unset for a CustomInfoType, it will default to
316  // HIGH.
317  // This only applies to data profiling.
318  SensitivityScore sensitivity_score = 9;
319}
320
321// General identifier of a data field in a storage service.
322message FieldId {
323  // Name of the field, for example a table column name.
324  string name = 1;
325}
326
327// Datastore partition ID.
328// A partition ID identifies a grouping of entities. The grouping is always
329// by project and namespace, however the namespace ID may be empty.
330//
331// A partition ID has two dimensions:
332// project ID and namespace ID.
333message PartitionId {
334  // The ID of the project to which the entities belong.
335  string project_id = 2;
336
337  // If not empty, the ID of the namespace to which the entities belong.
338  string namespace_id = 4;
339}
340
341// A representation of a Datastore kind.
342message KindExpression {
343  // The name of the Datastore kind.
344  string name = 1;
345}
346
347// Options defining a data set within Google Cloud Datastore.
348message DatastoreOptions {
349  // A partition ID identifies a grouping of entities. The grouping is always
350  // by project and namespace, however the namespace ID may be empty.
351  PartitionId partition_id = 1;
352
353  // The Datastore kind to process.
354  KindExpression kind = 2;
355}
356
357// Definitions of file type groups to scan. New types will be added to this
358// list.
359enum FileType {
360  // Includes all files.
361  FILE_TYPE_UNSPECIFIED = 0;
362
363  // Includes all file extensions not covered by another entry. Binary
364  // scanning attempts to convert the content of the file to UTF-8 to scan
365  // the file.
366  // If you wish to avoid this fall back, specify one or more of the other
367  // file types in your storage scan.
368  BINARY_FILE = 1;
369
370  // Included file extensions:
371  //   asc, asp, aspx, brf, c, cc, cfm, cgi, cpp, csv, cxx, c++, cs, css, dart,
372  //   dat, dot, eml, epbub, ged, go, h, hh, hpp, hxx, h++, hs, html, htm,
373  //   mkd, markdown, m, ml, mli, perl, pl, plist, pm, php, phtml, pht,
374  //   properties, py, pyw, rb, rbw, rs, rss, rc, scala, sh, sql, swift, tex,
375  //   shtml, shtm, xhtml, lhs, ics, ini, java, js, json, jsonl, kix, kml,
376  //   ocaml, md, txt, text, tsv, vb, vcard, vcs, wml, xcodeproj, xml, xsl, xsd,
377  //   yml, yaml.
378  TEXT_FILE = 2;
379
380  // Included file extensions:
381  //   bmp, gif, jpg, jpeg, jpe, png. Setting
382  // [bytes_limit_per_file][google.privacy.dlp.v2.CloudStorageOptions.bytes_limit_per_file]
383  // or
384  // [bytes_limit_per_file_percent][google.privacy.dlp.v2.CloudStorageOptions.bytes_limit_per_file_percent]
385  // has no effect on image files. Image inspection is restricted to the
386  // `global`, `us`, `asia`, and `europe` regions.
387  IMAGE = 3;
388
389  // Microsoft Word files larger than 30 MB will be scanned as binary files.
390  // Included file extensions:
391  //   docx, dotx, docm, dotm. Setting `bytes_limit_per_file` or
392  //   `bytes_limit_per_file_percent` has no effect on Word files.
393  WORD = 5;
394
395  // PDF files larger than 30 MB will be scanned as binary files.
396  // Included file extensions:
397  //   pdf. Setting `bytes_limit_per_file` or `bytes_limit_per_file_percent`
398  // has no effect on PDF files.
399  PDF = 6;
400
401  // Included file extensions:
402  //   avro
403  AVRO = 7;
404
405  // Included file extensions:
406  //   csv
407  CSV = 8;
408
409  // Included file extensions:
410  //   tsv
411  TSV = 9;
412
413  // Microsoft PowerPoint files larger than 30 MB will be scanned as binary
414  // files. Included file extensions:
415  //   pptx, pptm, potx, potm, pot. Setting `bytes_limit_per_file` or
416  //   `bytes_limit_per_file_percent` has no effect on PowerPoint files.
417  POWERPOINT = 11;
418
419  // Microsoft Excel files larger than 30 MB will be scanned as binary files.
420  // Included file extensions:
421  //   xlsx, xlsm, xltx, xltm. Setting `bytes_limit_per_file` or
422  //   `bytes_limit_per_file_percent` has no effect on Excel files.
423  EXCEL = 12;
424}
425
426// Message representing a set of files in a Cloud Storage bucket. Regular
427// expressions are used to allow fine-grained control over which files in the
428// bucket to include.
429//
430// Included files are those that match at least one item in `include_regex` and
431// do not match any items in `exclude_regex`. Note that a file that matches
432// items from both lists will _not_ be included. For a match to occur, the
433// entire file path (i.e., everything in the URL after the bucket name) must
434// match the regular expression.
435//
436// For example, given the input `{bucket_name: "mybucket", include_regex:
437// ["directory1/.*"], exclude_regex:
438// ["directory1/excluded.*"]}`:
439//
440// * `gs://mybucket/directory1/myfile` will be included
441// * `gs://mybucket/directory1/directory2/myfile` will be included (`.*` matches
442// across `/`)
443// * `gs://mybucket/directory0/directory1/myfile` will _not_ be included (the
444// full path doesn't match any items in `include_regex`)
445// * `gs://mybucket/directory1/excludedfile` will _not_ be included (the path
446// matches an item in `exclude_regex`)
447//
448// If `include_regex` is left empty, it will match all files by default
449// (this is equivalent to setting `include_regex: [".*"]`).
450//
451// Some other common use cases:
452//
453// * `{bucket_name: "mybucket", exclude_regex: [".*\.pdf"]}` will include all
454// files in `mybucket` except for .pdf files
455// * `{bucket_name: "mybucket", include_regex: ["directory/[^/]+"]}` will
456// include all files directly under `gs://mybucket/directory/`, without matching
457// across `/`
458message CloudStorageRegexFileSet {
459  // The name of a Cloud Storage bucket. Required.
460  string bucket_name = 1;
461
462  // A list of regular expressions matching file paths to include. All files in
463  // the bucket that match at least one of these regular expressions will be
464  // included in the set of files, except for those that also match an item in
465  // `exclude_regex`. Leaving this field empty will match all files by default
466  // (this is equivalent to including `.*` in the list).
467  //
468  // Regular expressions use RE2
469  // [syntax](https://github.com/google/re2/wiki/Syntax); a guide can be found
470  // under the google/re2 repository on GitHub.
471  repeated string include_regex = 2;
472
473  // A list of regular expressions matching file paths to exclude. All files in
474  // the bucket that match at least one of these regular expressions will be
475  // excluded from the scan.
476  //
477  // Regular expressions use RE2
478  // [syntax](https://github.com/google/re2/wiki/Syntax); a guide can be found
479  // under the google/re2 repository on GitHub.
480  repeated string exclude_regex = 3;
481}
482
483// Options defining a file or a set of files within a Cloud Storage
484// bucket.
485message CloudStorageOptions {
486  // Set of files to scan.
487  message FileSet {
488    // The Cloud Storage URL of the file(s) to scan, in the format
489    // `gs://<bucket>/<path>`. Trailing wildcard in the path is allowed.
490    //
491    // If the URL ends in a trailing slash, the bucket or directory represented
492    // by the URL will be scanned non-recursively (content in sub-directories
493    // will not be scanned). This means that `gs://mybucket/` is equivalent to
494    // `gs://mybucket/*`, and `gs://mybucket/directory/` is equivalent to
495    // `gs://mybucket/directory/*`.
496    //
497    // Exactly one of `url` or `regex_file_set` must be set.
498    string url = 1;
499
500    // The regex-filtered set of files to scan. Exactly one of `url` or
501    // `regex_file_set` must be set.
502    CloudStorageRegexFileSet regex_file_set = 2;
503  }
504
505  // How to sample bytes if not all bytes are scanned. Meaningful only when used
506  // in conjunction with `bytes_limit_per_file`. If not specified, scanning
507  // starts from the top.
508  enum SampleMethod {
509    // No sampling.
510    SAMPLE_METHOD_UNSPECIFIED = 0;
511
512    // Scan from the top (default).
513    TOP = 1;
514
515    // For each file larger than `bytes_limit_per_file`, randomly pick the
516    // offset to start scanning. The scanned bytes are contiguous.
517    RANDOM_START = 2;
518  }
519
520  // The set of one or more files to scan.
521  FileSet file_set = 1;
522
523  // Max number of bytes to scan from a file. If a scanned file's size is bigger
524  // than this value then the rest of the bytes are omitted. Only one of
525  // `bytes_limit_per_file` and `bytes_limit_per_file_percent` can be specified.
526  // This field can't be set if de-identification is requested. For certain file
527  // types, setting this field has no effect. For more information, see [Limits
528  // on bytes scanned per
529  // file](https://cloud.google.com/sensitive-data-protection/docs/supported-file-types#max-byte-size-per-file).
530  int64 bytes_limit_per_file = 4;
531
532  // Max percentage of bytes to scan from a file. The rest are omitted. The
533  // number of bytes scanned is rounded down. Must be between 0 and 100,
534  // inclusive. Both 0 and 100 mean no limit. Defaults to 0. Only one of
535  // `bytes_limit_per_file` and `bytes_limit_per_file_percent` can be specified.
536  // This field can't be set if de-identification is requested. For certain file
537  // types, setting this field has no effect. For more information, see [Limits
538  // on bytes scanned per
539  // file](https://cloud.google.com/sensitive-data-protection/docs/supported-file-types#max-byte-size-per-file).
540  int32 bytes_limit_per_file_percent = 8;
541
542  // List of file type groups to include in the scan.
543  // If empty, all files are scanned and available data format processors
544  // are applied. In addition, the binary content of the selected files
545  // is always scanned as well.
546  // Images are scanned only as binary if the specified region
547  // does not support image inspection and no file_types were specified.
548  // Image inspection is restricted to 'global', 'us', 'asia', and 'europe'.
549  repeated FileType file_types = 5;
550
551  // How to sample the data.
552  SampleMethod sample_method = 6;
553
554  // Limits the number of files to scan to this percentage of the input FileSet.
555  // Number of files scanned is rounded down. Must be between 0 and 100,
556  // inclusive. Both 0 and 100 mean no limit. Defaults to 0.
557  int32 files_limit_percent = 7;
558}
559
560// Message representing a set of files in Cloud Storage.
561message CloudStorageFileSet {
562  // The URL, in the format `gs://<bucket>/<path>`. Trailing wildcard in the
563  // path is allowed.
564  string url = 1;
565}
566
567// Message representing a single file or path in Cloud Storage.
568message CloudStoragePath {
569  // A URL representing a file or path in Cloud Storage. No wildcards allowed.
570  // Example: `gs://[BUCKET_NAME]/dictionary.txt`
571  string path = 1;
572}
573
574// Options defining BigQuery table and row identifiers.
575message BigQueryOptions {
576  // How to sample rows if not all rows are scanned. Meaningful only when used
577  // in conjunction with either `rows_limit` or `rows_limit_percent`. If not
578  // specified, rows are scanned in the order BigQuery reads them.
579  enum SampleMethod {
580    // No sampling.
581    SAMPLE_METHOD_UNSPECIFIED = 0;
582
583    // Scan groups of rows in the order BigQuery provides (default). Multiple
584    // groups of rows may be scanned in parallel, so results may not appear in
585    // the same order the rows are read.
586    TOP = 1;
587
588    // Randomly pick groups of rows to scan.
589    RANDOM_START = 2;
590  }
591
592  // Complete BigQuery table reference.
593  BigQueryTable table_reference = 1;
594
595  // Table fields that may uniquely identify a row within the table. When
596  // `actions.saveFindings.outputConfig.table` is specified, the values of
597  // columns specified here are available in the output table under
598  // `location.content_locations.record_location.record_key.id_values`. Nested
599  // fields such as `person.birthdate.year` are allowed.
600  repeated FieldId identifying_fields = 2;
601
602  // Max number of rows to scan. If the table has more rows than this value, the
603  // rest of the rows are omitted. If not set, or if set to 0, all rows will be
604  // scanned. Only one of `rows_limit` and `rows_limit_percent` can be
605  // specified. Cannot be used in conjunction with `TimespanConfig`.
606  int64 rows_limit = 3;
607
608  // Max percentage of rows to scan. The rest are omitted. The number of rows
609  // scanned is rounded down. Must be between 0 and 100, inclusive. Both 0 and
610  // 100 mean no limit. Defaults to 0. Only one of `rows_limit` and
611  // `rows_limit_percent` can be specified. Cannot be used in conjunction with
612  // `TimespanConfig`.
613  //
614  // Caution: A [known
615  // issue](https://cloud.google.com/sensitive-data-protection/docs/known-issues#bq-sampling)
616  // is causing the `rowsLimitPercent` field to behave unexpectedly. We
617  // recommend using `rowsLimit` instead.
618  int32 rows_limit_percent = 6;
619
620  // How to sample the data.
621  SampleMethod sample_method = 4;
622
623  // References to fields excluded from scanning. This allows you to skip
624  // inspection of entire columns which you know have no findings.
625  // When inspecting a table, we recommend that you inspect all columns.
626  // Otherwise, findings might be affected because hints from excluded columns
627  // will not be used.
628  repeated FieldId excluded_fields = 5;
629
630  // Limit scanning only to these fields.
631  // When inspecting a table, we recommend that you inspect all columns.
632  // Otherwise, findings might be affected because hints from excluded columns
633  // will not be used.
634  repeated FieldId included_fields = 7;
635}
636
637// Shared message indicating the storage system to inspect.
638message StorageConfig {
639  // Configuration of the timespan of the items to include in scanning.
640  // Currently only supported when inspecting Cloud Storage and BigQuery.
641  message TimespanConfig {
642    // Exclude files, tables, or rows older than this value.
643    // If not set, no lower time limit is applied.
644    google.protobuf.Timestamp start_time = 1;
645
646    // Exclude files, tables, or rows newer than this value.
647    // If not set, no upper time limit is applied.
648    google.protobuf.Timestamp end_time = 2;
649
650    // Specification of the field containing the timestamp of scanned items.
651    // Used for data sources like Datastore and BigQuery.
652    //
653    // <b>For BigQuery</b>
654    //
655    // If this value is not specified and the table was modified between the
656    // given start and end times, the entire table will be scanned. If this
657    // value is specified, then rows are filtered based on the given start and
658    // end times. Rows with a `NULL` value in the provided BigQuery column are
659    // skipped.
660    // Valid data types of the provided BigQuery column are: `INTEGER`, `DATE`,
661    // `TIMESTAMP`, and `DATETIME`.
662    //
663    // If your BigQuery table is [partitioned at ingestion
664    // time](https://cloud.google.com/bigquery/docs/partitioned-tables#ingestion_time),
665    // you can use any of the following pseudo-columns as your timestamp field.
666    // When used with Cloud DLP, these pseudo-column names are case sensitive.
667    //
668    // <ul>
669    // <li><code>_PARTITIONTIME</code></li>
670    // <li><code>_PARTITIONDATE</code></li>
671    // <li><code>_PARTITION_LOAD_TIME</code></li>
672    // </ul>
673    //
674    // <b>For Datastore</b>
675    //
676    // If this value is specified, then entities are filtered based on the given
677    // start and end times. If an entity does not contain the provided timestamp
678    // property or contains empty or invalid values, then it is included.
679    // Valid data types of the provided timestamp property are: `TIMESTAMP`.
680    //
681    // See the
682    // [known
683    // issue](https://cloud.google.com/sensitive-data-protection/docs/known-issues#bq-timespan)
684    // related to this operation.
685    FieldId timestamp_field = 3;
686
687    // When the job is started by a JobTrigger we will automatically figure out
688    // a valid start_time to avoid scanning files that have not been modified
689    // since the last time the JobTrigger executed. This will be based on the
690    // time of the execution of the last run of the JobTrigger or the timespan
691    // end_time used in the last run of the JobTrigger.
692    bool enable_auto_population_of_timespan_config = 4;
693  }
694
695  // Type of storage system to inspect.
696  oneof type {
697    // Google Cloud Datastore options.
698    DatastoreOptions datastore_options = 2;
699
700    // Cloud Storage options.
701    CloudStorageOptions cloud_storage_options = 3;
702
703    // BigQuery options.
704    BigQueryOptions big_query_options = 4;
705
706    // Hybrid inspection options.
707    HybridOptions hybrid_options = 9;
708  }
709
710  // Configuration of the timespan of the items to include in scanning.
711  TimespanConfig timespan_config = 6;
712}
713
714// Configuration to control jobs where the content being inspected is outside
715// of Google Cloud Platform.
716message HybridOptions {
717  // A short description of where the data is coming from. Will be stored once
718  // in the job. 256 max length.
719  string description = 1;
720
721  // These are labels that each inspection request must include within its
722  // 'finding_labels' map. A request may contain other labels, but any request
723  // missing one of these will be rejected.
724  //
725  // Label keys must be between 1 and 63 characters long and must conform
726  // to the following regular expression: `[a-z]([-a-z0-9]*[a-z0-9])?`.
727  //
728  // No more than 10 keys can be required.
729  repeated string required_finding_label_keys = 2;
730
731  // To organize findings, these labels will be added to each finding.
732  //
733  // Label keys must be between 1 and 63 characters long and must conform
734  // to the following regular expression: `[a-z]([-a-z0-9]*[a-z0-9])?`.
735  //
736  // Label values must be between 0 and 63 characters long and must conform
737  // to the regular expression `([a-z]([-a-z0-9]*[a-z0-9])?)?`.
738  //
739  // No more than 10 labels can be associated with a given finding.
740  //
741  // Examples:
742  // * `"environment" : "production"`
743  // * `"pipeline" : "etl"`
744  map<string, string> labels = 3;
745
746  // If the container is a table, additional information to make findings
747  // meaningful such as the columns that are primary keys.
748  TableOptions table_options = 4;
749}
750
751// Row key for identifying a record in a BigQuery table.
752message BigQueryKey {
753  // Complete BigQuery table reference.
754  BigQueryTable table_reference = 1;
755
756  // Row number inferred at the time the table was scanned. This value is
757  // nondeterministic, cannot be queried, and may be null for inspection
758  // jobs. To locate findings within a table, specify
759  // `inspect_job.storage_config.big_query_options.identifying_fields` in
760  // `CreateDlpJobRequest`.
761  int64 row_number = 2;
762}
763
764// Record key for a finding in Cloud Datastore.
765message DatastoreKey {
766  // Key that identifies the Datastore entity.
767  Key entity_key = 1;
768}
769
770// A unique identifier for a Datastore entity.
771// If a key's partition ID or any of its path kinds or names are
772// reserved/read-only, the key is reserved/read-only.
773// A reserved/read-only key is forbidden in certain documented contexts.
774message Key {
775  // A (kind, ID/name) pair used to construct a key path.
776  //
777  // If either name or ID is set, the element is complete.
778  // If neither is set, the element is incomplete.
779  message PathElement {
780    // The kind of the entity.
781    // A kind matching regex `__.*__` is reserved/read-only.
782    // A kind must not contain more than 1500 bytes when UTF-8 encoded.
783    // Cannot be `""`.
784    string kind = 1;
785
786    // The type of ID: either a numeric `id` or a string `name`.
787    oneof id_type {
788      // The auto-allocated ID of the entity.
789      // Never equal to zero. Values less than zero are discouraged and may not
790      // be supported in the future.
791      int64 id = 2;
792
793      // The name of the entity.
794      // A name matching regex `__.*__` is reserved/read-only.
795      // A name must not be more than 1500 bytes when UTF-8 encoded.
796      // Cannot be `""`.
797      string name = 3;
798    }
799  }
800
801  // Entities are partitioned into subsets, currently identified by a project
802  // ID and namespace ID.
803  // Queries are scoped to a single partition.
804  PartitionId partition_id = 1;
805
806  // The entity path.
807  // An entity path consists of one or more elements composed of a kind and a
808  // string or numerical identifier, which identify entities. The first
809  // element identifies a _root entity_, the second element identifies
810  // a _child_ of the root entity, the third element identifies a child of the
811  // second entity, and so forth. The entities identified by all prefixes of
812  // the path are called the element's _ancestors_.
813  //
814  // A path can never be empty, and a path can have at most 100 elements.
815  repeated PathElement path = 2;
816}
817
818// Message for a unique key indicating a record that contains a finding.
819message RecordKey {
820  // Type of key.
821  oneof type {
822    // Datastore key.
823    DatastoreKey datastore_key = 2;
824
825    // BigQuery key.
826    BigQueryKey big_query_key = 3;
827  }
828
829  // Values of identifying columns in the given row. Order of values matches
830  // the order of `identifying_fields` specified in the scanning request.
831  repeated string id_values = 5;
832}
833
834// Message defining the location of a BigQuery table. A table is uniquely
835// identified by its project_id, dataset_id, and table_id. Within a query,
836// a table is often referenced with a string in the format of:
837// `<project_id>:<dataset_id>.<table_id>` or
838// `<project_id>.<dataset_id>.<table_id>`.
839message BigQueryTable {
840  // The Google Cloud Platform project ID of the project containing the table.
841  // If omitted, project ID is inferred from the API call.
842  string project_id = 1;
843
844  // Dataset ID of the table.
845  string dataset_id = 2;
846
847  // Name of the table.
848  string table_id = 3;
849}
850
851// Message defining a field of a BigQuery table.
852message BigQueryField {
853  // Source table of the field.
854  BigQueryTable table = 1;
855
856  // Designated field in the source table.
857  FieldId field = 2;
858}
859
860// An entity in a dataset is a field or set of fields that correspond to a
861// single person. For example, in medical records, the `EntityId` might be a
862// patient identifier, and in financial records, it might be an account
863// identifier. This message is used when generalizations or analysis must take
864// into account that multiple rows correspond to the same entity.
865message EntityId {
866  // Composite key indicating which field contains the entity identifier.
867  FieldId field = 1;
868}
869
870// Instructions regarding the table content being inspected.
871message TableOptions {
872  // The columns that are the primary keys for table objects included in
873  // ContentItem. A copy of this cell's value will be stored alongside each
874  // finding so that the finding can be traced to the specific row it came
875  // from. No more than 3 may be provided.
876  repeated FieldId identifying_fields = 1;
877}
878