// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.privacy.dlp.v2;

import "google/api/resource.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.Dlp.V2";
option go_package = "cloud.google.com/go/dlp/apiv2/dlppb;dlppb";
option java_multiple_files = true;
option java_outer_classname = "DlpStorage";
option java_package = "com.google.privacy.dlp.v2";
option php_namespace = "Google\\Cloud\\Dlp\\V2";
option ruby_package = "Google::Cloud::Dlp::V2";

// Type of information detected by the API.
message InfoType {
  // Name of the information type. Either a name of your choosing when
  // creating a CustomInfoType, or one of the names listed
  // at
  // https://cloud.google.com/sensitive-data-protection/docs/infotypes-reference
  // when specifying a built-in type. When sending Cloud DLP results to Data
  // Catalog, infoType names should conform to the pattern
  // `[A-Za-z0-9$_-]{1,64}`.
  string name = 1;

  // Optional version name for this InfoType.
  string version = 2;

  // Optional custom sensitivity for this InfoType.
  // This only applies to data profiling.
  SensitivityScore sensitivity_score = 3;
}

// Score is calculated from all elements in the data profile.
// A higher level means the data is more sensitive.
message SensitivityScore {
  // Various sensitivity score levels for resources.
  enum SensitivityScoreLevel {
    // Unused.
    SENSITIVITY_SCORE_UNSPECIFIED = 0;

    // No sensitive information detected. The resource isn't publicly
    // accessible.
    SENSITIVITY_LOW = 10;

    // Medium risk. Contains personally identifiable information (PII),
    // potentially sensitive data, or fields with free-text data that are at a
    // higher risk of having intermittent sensitive data. Consider limiting
    // access.
    SENSITIVITY_MODERATE = 20;

    // High risk. Sensitive personally identifiable information (SPII) can be
    // present. Exfiltration of data can lead to user data loss.
    // Re-identification of users might be possible. Consider limiting usage
    // and/or removing SPII.
    SENSITIVITY_HIGH = 30;
  }

  // The sensitivity score applied to the resource.
  SensitivityScoreLevel score = 1;
}

// Coarse-grained confidence level of how well a particular finding
// satisfies the criteria to match a particular infoType.
//
// Likelihood is calculated based on the number of signals a
// finding has that implies that the finding matches the infoType. For
// example, a string that has an '@' and a '.com' is more likely to be a
// match for an email address than a string that only has an '@'.
//
// In general, the highest likelihood level has the strongest signals that
// indicate a match. That is, a finding with a high likelihood has a low chance
// of being a false positive.
//
// For more information about each likelihood level
// and how likelihood works, see [Match
// likelihood](https://cloud.google.com/sensitive-data-protection/docs/likelihood).
enum Likelihood {
  // Default value; same as POSSIBLE.
  LIKELIHOOD_UNSPECIFIED = 0;

  // Highest chance of a false positive.
  VERY_UNLIKELY = 1;

  // High chance of a false positive.
  UNLIKELY = 2;

  // Some matching signals. The default value.
  POSSIBLE = 3;

  // Low chance of a false positive.
  LIKELY = 4;

  // Confidence level is high. Lowest chance of a false positive.
  VERY_LIKELY = 5;
}

// A reference to a StoredInfoType to use with scanning.
message StoredType {
  // Resource name of the requested `StoredInfoType`, for example
  // `organizations/433245324/storedInfoTypes/432452342` or
  // `projects/project-id/storedInfoTypes/432452342`.
  string name = 1;

  // Timestamp indicating when the version of the `StoredInfoType` used for
  // inspection was created. Output-only field, populated by the system.
  google.protobuf.Timestamp create_time = 2;
}

// Custom information type provided by the user. Used to find domain-specific
// sensitive information configurable to the data in question.
message CustomInfoType {
  // Custom information type based on a dictionary of words or phrases. This can
  // be used to match sensitive information specific to the data, such as a list
  // of employee IDs or job titles.
  //
  // Dictionary words are case-insensitive and all characters other than letters
  // and digits in the unicode [Basic Multilingual
  // Plane](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane)
  // will be replaced with whitespace when scanning for matches, so the
  // dictionary phrase "Sam Johnson" will match all three phrases "sam johnson",
  // "Sam, Johnson", and "Sam (Johnson)". Additionally, the characters
  // surrounding any match must be of a different type than the adjacent
  // characters within the word, so letters must be next to non-letters and
  // digits next to non-digits. For example, the dictionary word "jen" will
  // match the first three letters of the text "jen123" but will return no
  // matches for "jennifer".
  //
  // Dictionary words containing a large number of characters that are not
  // letters or digits may result in unexpected findings because such characters
  // are treated as whitespace. The
  // [limits](https://cloud.google.com/sensitive-data-protection/limits) page
  // contains details about the size limits of dictionaries. For dictionaries
  // that do not fit within these constraints, consider using
  // `LargeCustomDictionaryConfig` in the `StoredInfoType` API.
  message Dictionary {
    // Message defining a list of words or phrases to search for in the data.
    message WordList {
      // Words or phrases defining the dictionary. The dictionary must contain
      // at least one phrase and every phrase must contain at least 2 characters
      // that are letters or digits. [required]
      repeated string words = 1;
    }

    // The potential places the data can be read from.
    oneof source {
      // List of words or phrases to search for.
      WordList word_list = 1;

      // Newline-delimited file of words in Cloud Storage. Only a single file
      // is accepted.
      CloudStoragePath cloud_storage_path = 3;
    }
  }

  // Message defining a custom regular expression.
  message Regex {
    // Pattern defining the regular expression. Its syntax
    // (https://github.com/google/re2/wiki/Syntax) can be found under the
    // google/re2 repository on GitHub.
    string pattern = 1;

    // The index of the submatch to extract as findings. When not
    // specified, the entire match is returned. No more than 3 may be included.
    repeated int32 group_indexes = 2;
  }

  // Message for detecting output from deidentification transformations
  // such as
  // [`CryptoReplaceFfxFpeConfig`](https://cloud.google.com/sensitive-data-protection/docs/reference/rest/v2/organizations.deidentifyTemplates#cryptoreplaceffxfpeconfig).
  // These types of transformations are
  // those that perform pseudonymization, thereby producing a "surrogate" as
  // output. This should be used in conjunction with a field on the
  // transformation such as `surrogate_info_type`. This CustomInfoType does
  // not support the use of `detection_rules`.
  message SurrogateType {}

  // Deprecated; use `InspectionRuleSet` instead. Rule for modifying a
  // `CustomInfoType` to alter behavior under certain circumstances, depending
  // on the specific details of the rule. Not supported for the `surrogate_type`
  // custom infoType.
  message DetectionRule {
    // Message for specifying a window around a finding to apply a detection
    // rule.
    message Proximity {
      // Number of characters before the finding to consider. For tabular data,
      // if you want to modify the likelihood of an entire column of findings,
      // set this to 1. For more information, see
      // [Hotword example: Set the match likelihood of a table column]
      // (https://cloud.google.com/sensitive-data-protection/docs/creating-custom-infotypes-likelihood#match-column-values).
      int32 window_before = 1;

      // Number of characters after the finding to consider.
      int32 window_after = 2;
    }

    // Message for specifying an adjustment to the likelihood of a finding as
    // part of a detection rule.
    message LikelihoodAdjustment {
      // How the likelihood will be modified.
      oneof adjustment {
        // Set the likelihood of a finding to a fixed value.
        Likelihood fixed_likelihood = 1;

        // Increase or decrease the likelihood by the specified number of
        // levels. For example, if a finding would be `POSSIBLE` without the
        // detection rule and `relative_likelihood` is 1, then it is upgraded to
        // `LIKELY`, while a value of -1 would downgrade it to `UNLIKELY`.
        // Likelihood may never drop below `VERY_UNLIKELY` or exceed
        // `VERY_LIKELY`, so applying an adjustment of 1 followed by an
        // adjustment of -1 when base likelihood is `VERY_LIKELY` will result in
        // a final likelihood of `LIKELY`.
        int32 relative_likelihood = 2;
      }
    }

    // The rule that adjusts the likelihood of findings within a certain
    // proximity of hotwords.
    message HotwordRule {
      // Regular expression pattern defining what qualifies as a hotword.
      Regex hotword_regex = 1;

      // Range of characters within which the entire hotword must reside.
      // The total length of the window cannot exceed 1000 characters.
      // The finding itself will be included in the window, so that hotwords can
      // be used to match substrings of the finding itself. Suppose you
      // want Cloud DLP to promote the likelihood of the phone number
      // regex "\(\d{3}\) \d{3}-\d{4}" if the area code is known to be the
      // area code of a company's office. In this case, use the hotword regex
      // "\(xxx\)", where "xxx" is the area code in question.
      //
      // For tabular data, if you want to modify the likelihood of an entire
      // column of findings, see
      // [Hotword example: Set the match likelihood of a table column]
      // (https://cloud.google.com/sensitive-data-protection/docs/creating-custom-infotypes-likelihood#match-column-values).
      Proximity proximity = 2;

      // Likelihood adjustment to apply to all matching findings.
      LikelihoodAdjustment likelihood_adjustment = 3;
    }

    // Type of hotword rule.
    oneof type {
      // Hotword-based detection rule.
      HotwordRule hotword_rule = 1;
    }
  }

  // Type of exclusion rule.
  enum ExclusionType {
    // A finding of this custom info type will not be excluded from results.
    EXCLUSION_TYPE_UNSPECIFIED = 0;

    // A finding of this custom info type will be excluded from final results,
    // but can still affect rule execution.
    EXCLUSION_TYPE_EXCLUDE = 1;
  }

  // CustomInfoType can either be a new infoType, or an extension of built-in
  // infoType, when the name matches one of existing infoTypes and that infoType
  // is specified in `InspectContent.info_types` field. Specifying the latter
  // adds findings to the one detected by the system. If built-in info type is
  // not specified in `InspectContent.info_types` list then the name is treated
  // as a custom info type.
  InfoType info_type = 1;

  // Likelihood to return for this CustomInfoType. This base value can be
  // altered by a detection rule if the finding meets the criteria specified by
  // the rule. Defaults to `VERY_LIKELY` if not specified.
  Likelihood likelihood = 6;

  // Type of custom detector.
  oneof type {
    // A list of phrases to detect as a CustomInfoType.
    Dictionary dictionary = 2;

    // Regular expression based CustomInfoType.
    Regex regex = 3;

    // Message for detecting output from deidentification transformations that
    // support reversing.
    SurrogateType surrogate_type = 4;

    // Load an existing `StoredInfoType` resource for use in
    // `InspectDataSource`. Not currently supported in `InspectContent`.
    StoredType stored_type = 5;
  }

  // Set of detection rules to apply to all findings of this CustomInfoType.
  // Rules are applied in order that they are specified. Not supported for the
  // `surrogate_type` CustomInfoType.
  repeated DetectionRule detection_rules = 7;

  // If set to EXCLUSION_TYPE_EXCLUDE this infoType will not cause a finding
  // to be returned. It still can be used for rules matching.
  ExclusionType exclusion_type = 8;

  // Sensitivity for this CustomInfoType. If this CustomInfoType extends an
  // existing InfoType, the sensitivity here will take precedence over that of
  // the original InfoType. If unset for a CustomInfoType, it will default to
  // HIGH.
  // This only applies to data profiling.
  SensitivityScore sensitivity_score = 9;
}

// General identifier of a data field in a storage service.
message FieldId {
  // Name describing the field.
  string name = 1;
}

// Datastore partition ID.
// A partition ID identifies a grouping of entities. The grouping is always
// by project and namespace, however the namespace ID may be empty.
//
// A partition ID contains several dimensions:
// project ID and namespace ID.
message PartitionId {
  // The ID of the project to which the entities belong.
  string project_id = 2;

  // If not empty, the ID of the namespace to which the entities belong.
  string namespace_id = 4;
}

// A representation of a Datastore kind.
message KindExpression {
  // The name of the kind.
  string name = 1;
}

// Options defining a data set within Google Cloud Datastore.
message DatastoreOptions {
  // A partition ID identifies a grouping of entities. The grouping is always
  // by project and namespace, however the namespace ID may be empty.
  PartitionId partition_id = 1;

  // The kind to process.
  KindExpression kind = 2;
}

// Definitions of file type groups to scan. New types will be added to this
// list.
enum FileType {
  // Includes all files.
  FILE_TYPE_UNSPECIFIED = 0;

  // Includes all file extensions not covered by another entry. Binary
  // scanning attempts to convert the content of the file to utf_8 to scan
  // the file.
  // If you wish to avoid this fall back, specify one or more of the other
  // file types in your storage scan.
  BINARY_FILE = 1;

  // Included file extensions:
  // asc, asp, aspx, brf, c, cc, cfm, cgi, cpp, csv, cxx, c++, cs, css, dart,
  // dat, dot, eml, epbub, ged, go, h, hh, hpp, hxx, h++, hs, html, htm,
  // mkd, markdown, m, ml, mli, perl, pl, plist, pm, php, phtml, pht,
  // properties, py, pyw, rb, rbw, rs, rss, rc, scala, sh, sql, swift, tex,
  // shtml, shtm, xhtml, lhs, ics, ini, java, js, json, jsonl, kix, kml,
  // ocaml, md, txt, text, tsv, vb, vcard, vcs, wml, xcodeproj, xml, xsl, xsd,
  // yml, yaml.
  TEXT_FILE = 2;

  // Included file extensions:
  // bmp, gif, jpg, jpeg, jpe, png. Setting
  // [bytes_limit_per_file][google.privacy.dlp.v2.CloudStorageOptions.bytes_limit_per_file]
  // or
  // [bytes_limit_per_file_percent][google.privacy.dlp.v2.CloudStorageOptions.bytes_limit_per_file_percent]
  // has no effect on image files. Image inspection is restricted to the
  // `global`, `us`, `asia`, and `europe` regions.
  IMAGE = 3;

  // Microsoft Word files larger than 30 MB will be scanned as binary files.
  // Included file extensions:
  // docx, dotx, docm, dotm. Setting `bytes_limit_per_file` or
  // `bytes_limit_per_file_percent` has no effect on Word files.
  WORD = 5;

  // PDF files larger than 30 MB will be scanned as binary files.
  // Included file extensions:
  // pdf. Setting `bytes_limit_per_file` or `bytes_limit_per_file_percent`
  // has no effect on PDF files.
  PDF = 6;

  // Included file extensions:
  // avro
  AVRO = 7;

  // Included file extensions:
  // csv
  CSV = 8;

  // Included file extensions:
  // tsv
  TSV = 9;

  // Microsoft PowerPoint files larger than 30 MB will be scanned as binary
  // files. Included file extensions:
  // pptx, pptm, potx, potm, pot. Setting `bytes_limit_per_file` or
  // `bytes_limit_per_file_percent` has no effect on PowerPoint files.
  POWERPOINT = 11;

  // Microsoft Excel files larger than 30 MB will be scanned as binary files.
  // Included file extensions:
  // xlsx, xlsm, xltx, xltm. Setting `bytes_limit_per_file` or
  // `bytes_limit_per_file_percent` has no effect on Excel files.
  EXCEL = 12;
}

// Message representing a set of files in a Cloud Storage bucket. Regular
// expressions are used to allow fine-grained control over which files in the
// bucket to include.
//
// Included files are those that match at least one item in `include_regex` and
// do not match any items in `exclude_regex`. Note that a file that matches
// items from both lists will _not_ be included. For a match to occur, the
// entire file path (i.e., everything in the url after the bucket name) must
// match the regular expression.
//
// For example, given the input `{bucket_name: "mybucket", include_regex:
// ["directory1/.*"], exclude_regex:
// ["directory1/excluded.*"]}`:
//
// * `gs://mybucket/directory1/myfile` will be included
// * `gs://mybucket/directory1/directory2/myfile` will be included (`.*` matches
//   across `/`)
// * `gs://mybucket/directory0/directory1/myfile` will _not_ be included (the
//   full path doesn't match any items in `include_regex`)
// * `gs://mybucket/directory1/excludedfile` will _not_ be included (the path
//   matches an item in `exclude_regex`)
//
// If `include_regex` is left empty, it will match all files by default
// (this is equivalent to setting `include_regex: [".*"]`).
//
// Some other common use cases:
//
// * `{bucket_name: "mybucket", exclude_regex: [".*\.pdf"]}` will include all
//   files in `mybucket` except for .pdf files
// * `{bucket_name: "mybucket", include_regex: ["directory/[^/]+"]}` will
//   include all files directly under `gs://mybucket/directory/`, without
//   matching across `/`
message CloudStorageRegexFileSet {
  // The name of a Cloud Storage bucket. Required.
  string bucket_name = 1;

  // A list of regular expressions matching file paths to include. All files in
  // the bucket that match at least one of these regular expressions will be
  // included in the set of files, except for those that also match an item in
  // `exclude_regex`. Leaving this field empty will match all files by default
  // (this is equivalent to including `.*` in the list).
  //
  // Regular expressions use RE2
  // [syntax](https://github.com/google/re2/wiki/Syntax); a guide can be found
  // under the google/re2 repository on GitHub.
  repeated string include_regex = 2;

  // A list of regular expressions matching file paths to exclude. All files in
  // the bucket that match at least one of these regular expressions will be
  // excluded from the scan.
  //
  // Regular expressions use RE2
  // [syntax](https://github.com/google/re2/wiki/Syntax); a guide can be found
  // under the google/re2 repository on GitHub.
  repeated string exclude_regex = 3;
}

// Options defining a file or a set of files within a Cloud Storage
// bucket.
message CloudStorageOptions {
  // Set of files to scan.
  message FileSet {
    // The Cloud Storage url of the file(s) to scan, in the format
    // `gs://<bucket>/<path>`. Trailing wildcard in the path is allowed.
    //
    // If the url ends in a trailing slash, the bucket or directory represented
    // by the url will be scanned non-recursively (content in sub-directories
    // will not be scanned). This means that `gs://mybucket/` is equivalent to
    // `gs://mybucket/*`, and `gs://mybucket/directory/` is equivalent to
    // `gs://mybucket/directory/*`.
    //
    // Exactly one of `url` or `regex_file_set` must be set.
    string url = 1;

    // The regex-filtered set of files to scan. Exactly one of `url` or
    // `regex_file_set` must be set.
    CloudStorageRegexFileSet regex_file_set = 2;
  }

  // How to sample bytes if not all bytes are scanned. Meaningful only when used
  // in conjunction with bytes_limit_per_file. If not specified, scanning would
  // start from the top.
  enum SampleMethod {
    // No sampling.
    SAMPLE_METHOD_UNSPECIFIED = 0;

    // Scan from the top (default).
    TOP = 1;

    // For each file larger than bytes_limit_per_file, randomly pick the offset
    // to start scanning. The scanned bytes are contiguous.
    RANDOM_START = 2;
  }

  // The set of one or more files to scan.
  FileSet file_set = 1;

  // Max number of bytes to scan from a file. If a scanned file's size is bigger
  // than this value then the rest of the bytes are omitted. Only one of
  // `bytes_limit_per_file` and `bytes_limit_per_file_percent` can be specified.
  // This field can't be set if de-identification is requested. For certain file
  // types, setting this field has no effect. For more information, see [Limits
  // on bytes scanned per
  // file](https://cloud.google.com/sensitive-data-protection/docs/supported-file-types#max-byte-size-per-file).
  int64 bytes_limit_per_file = 4;

  // Max percentage of bytes to scan from a file. The rest are omitted. The
  // number of bytes scanned is rounded down. Must be between 0 and 100,
  // inclusively. Both 0 and 100 mean no limit. Defaults to 0. Only one of
  // bytes_limit_per_file and bytes_limit_per_file_percent can be specified.
  // This field can't be set if de-identification is requested. For certain file
  // types, setting this field has no effect. For more information, see [Limits
  // on bytes scanned per
  // file](https://cloud.google.com/sensitive-data-protection/docs/supported-file-types#max-byte-size-per-file).
  int32 bytes_limit_per_file_percent = 8;

  // List of file type groups to include in the scan.
  // If empty, all files are scanned and available data format processors
  // are applied. In addition, the binary content of the selected files
  // is always scanned as well.
  // Images are scanned only as binary if the specified region
  // does not support image inspection and no file_types were specified.
  // Image inspection is restricted to 'global', 'us', 'asia', and 'europe'.
  repeated FileType file_types = 5;

  // How to sample the data.
  SampleMethod sample_method = 6;

  // Limits the number of files to scan to this percentage of the input FileSet.
  // Number of files scanned is rounded down. Must be between 0 and 100,
  // inclusively. Both 0 and 100 mean no limit. Defaults to 0.
  int32 files_limit_percent = 7;
}

// Message representing a set of files in Cloud Storage.
message CloudStorageFileSet {
  // The url, in the format `gs://<bucket>/<path>`. Trailing wildcard in the
  // path is allowed.
  string url = 1;
}

// Message representing a single file or path in Cloud Storage.
message CloudStoragePath {
  // A URL representing a file or path (no wildcards) in Cloud Storage.
  // Example: `gs://[BUCKET_NAME]/dictionary.txt`
  string path = 1;
}

// Options defining BigQuery table and row identifiers.
message BigQueryOptions {
  // How to sample rows if not all rows are scanned. Meaningful only when used
  // in conjunction with either rows_limit or rows_limit_percent. If not
  // specified, rows are scanned in the order BigQuery reads them.
  enum SampleMethod {
    // No sampling.
    SAMPLE_METHOD_UNSPECIFIED = 0;

    // Scan groups of rows in the order BigQuery provides (default). Multiple
    // groups of rows may be scanned in parallel, so results may not appear in
    // the same order the rows are read.
    TOP = 1;

    // Randomly pick groups of rows to scan.
    RANDOM_START = 2;
  }

  // Complete BigQuery table reference.
  BigQueryTable table_reference = 1;

  // Table fields that may uniquely identify a row within the table. When
  // `actions.saveFindings.outputConfig.table` is specified, the values of
  // columns specified here are available in the output table under
  // `location.content_locations.record_location.record_key.id_values`. Nested
  // fields such as `person.birthdate.year` are allowed.
  repeated FieldId identifying_fields = 2;

  // Max number of rows to scan. If the table has more rows than this value, the
  // rest of the rows are omitted. If not set, or if set to 0, all rows will be
  // scanned. Only one of rows_limit and rows_limit_percent can be specified.
  // Cannot be used in conjunction with TimespanConfig.
  int64 rows_limit = 3;

  // Max percentage of rows to scan. The rest are omitted. The number of rows
  // scanned is rounded down. Must be between 0 and 100, inclusively. Both 0 and
  // 100 mean no limit. Defaults to 0. Only one of rows_limit and
  // rows_limit_percent can be specified. Cannot be used in conjunction with
  // TimespanConfig.
  //
  // Caution: A [known
  // issue](https://cloud.google.com/sensitive-data-protection/docs/known-issues#bq-sampling)
  // is causing the `rowsLimitPercent` field to behave unexpectedly. We
  // recommend using `rowsLimit` instead.
  int32 rows_limit_percent = 6;

  // How to sample the data.
  SampleMethod sample_method = 4;

  // References to fields excluded from scanning. This allows you to skip
  // inspection of entire columns which you know have no findings.
  // When inspecting a table, we recommend that you inspect all columns.
  // Otherwise, findings might be affected because hints from excluded columns
  // will not be used.
  repeated FieldId excluded_fields = 5;

  // Limit scanning only to these fields.
  // When inspecting a table, we recommend that you inspect all columns.
  // Otherwise, findings might be affected because hints from excluded columns
  // will not be used.
  repeated FieldId included_fields = 7;
}

// Shared message indicating Cloud storage type.
message StorageConfig {
  // Configuration of the timespan of the items to include in scanning.
  // Currently only supported when inspecting Cloud Storage and BigQuery.
  message TimespanConfig {
    // Exclude files, tables, or rows older than this value.
    // If not set, no lower time limit is applied.
    google.protobuf.Timestamp start_time = 1;

    // Exclude files, tables, or rows newer than this value.
    // If not set, no upper time limit is applied.
    google.protobuf.Timestamp end_time = 2;

    // Specification of the field containing the timestamp of scanned items.
    // Used for data sources like Datastore and BigQuery.
    //
    // <b>For BigQuery</b>
    //
    // If this value is not specified and the table was modified between the
    // given start and end times, the entire table will be scanned. If this
    // value is specified, then rows are filtered based on the given start and
    // end times. Rows with a `NULL` value in the provided BigQuery column are
    // skipped.
    // Valid data types of the provided BigQuery column are: `INTEGER`, `DATE`,
    // `TIMESTAMP`, and `DATETIME`.
    //
    // If your BigQuery table is [partitioned at ingestion
    // time](https://cloud.google.com/bigquery/docs/partitioned-tables#ingestion_time),
    // you can use any of the following pseudo-columns as your timestamp field.
    // When used with Cloud DLP, these pseudo-column names are case sensitive.
    //
    // <ul>
    // <li><code>_PARTITIONTIME</code></li>
    // <li><code>_PARTITIONDATE</code></li>
    // <li><code>_PARTITION_LOAD_TIME</code></li>
    // </ul>
    //
    // <b>For Datastore</b>
    //
    // If this value is specified, then entities are filtered based on the given
    // start and end times. If an entity does not contain the provided timestamp
    // property or contains empty or invalid values, then it is included.
    // Valid data types of the provided timestamp property are: `TIMESTAMP`.
    //
    // See the
    // [known
    // issue](https://cloud.google.com/sensitive-data-protection/docs/known-issues#bq-timespan)
    // related to this operation.
    FieldId timestamp_field = 3;

    // When the job is started by a JobTrigger we will automatically figure out
    // a valid start_time to avoid scanning files that have not been modified
    // since the last time the JobTrigger executed. This will be based on the
    // time of the execution of the last run of the JobTrigger or the timespan
    // end_time used in the last run of the JobTrigger.
    bool enable_auto_population_of_timespan_config = 4;
  }

  // Type of storage system to inspect.
  oneof type {
    // Google Cloud Datastore options.
    DatastoreOptions datastore_options = 2;

    // Cloud Storage options.
    CloudStorageOptions cloud_storage_options = 3;

    // BigQuery options.
    BigQueryOptions big_query_options = 4;

    // Hybrid inspection options.
    HybridOptions hybrid_options = 9;
  }

  // Configuration of the timespan of the items to include in scanning.
  TimespanConfig timespan_config = 6;
}

// Configuration to control jobs where the content being inspected is outside
// of Google Cloud Platform.
message HybridOptions {
  // A short description of where the data is coming from. Will be stored once
  // in the job. 256 max length.
  string description = 1;

  // These are labels that each inspection request must include within their
  // 'finding_labels' map. Request may contain others, but any missing one of
  // these will be rejected.
  //
  // Label keys must be between 1 and 63 characters long and must conform
  // to the following regular expression: `[a-z]([-a-z0-9]*[a-z0-9])?`.
  //
  // No more than 10 keys can be required.
  repeated string required_finding_label_keys = 2;

  // To organize findings, these labels will be added to each finding.
  //
  // Label keys must be between 1 and 63 characters long and must conform
  // to the following regular expression: `[a-z]([-a-z0-9]*[a-z0-9])?`.
  //
  // Label values must be between 0 and 63 characters long and must conform
  // to the regular expression `([a-z]([-a-z0-9]*[a-z0-9])?)?`.
  //
  // No more than 10 labels can be associated with a given finding.
  //
  // Examples:
  // * `"environment" : "production"`
  // * `"pipeline" : "etl"`
  map<string, string> labels = 3;

  // If the container is a table, additional information to make findings
  // meaningful such as the columns that are primary keys.
  TableOptions table_options = 4;
}

// Row key for identifying a record in BigQuery table.
message BigQueryKey {
  // Complete BigQuery table reference.
  BigQueryTable table_reference = 1;

  // Row number inferred at the time the table was scanned. This value is
  // nondeterministic, cannot be queried, and may be null for inspection
  // jobs. To locate findings within a table, specify
  // `inspect_job.storage_config.big_query_options.identifying_fields` in
  // `CreateDlpJobRequest`.
  int64 row_number = 2;
}

// Record key for a finding in Cloud Datastore.
message DatastoreKey {
  // Datastore entity key.
  Key entity_key = 1;
}

// A unique identifier for a Datastore entity.
// If a key's partition ID or any of its path kinds or names are
// reserved/read-only, the key is reserved/read-only.
// A reserved/read-only key is forbidden in certain documented contexts.
message Key {
  // A (kind, ID/name) pair used to construct a key path.
  //
  // If either name or ID is set, the element is complete.
  // If neither is set, the element is incomplete.
  message PathElement {
    // The kind of the entity.
    // A kind matching regex `__.*__` is reserved/read-only.
    // A kind must not contain more than 1500 bytes when UTF-8 encoded.
    // Cannot be `""`.
    string kind = 1;

    // The type of ID.
    oneof id_type {
      // The auto-allocated ID of the entity.
      // Never equal to zero. Values less than zero are discouraged and may not
      // be supported in the future.
      int64 id = 2;

      // The name of the entity.
      // A name matching regex `__.*__` is reserved/read-only.
      // A name must not be more than 1500 bytes when UTF-8 encoded.
      // Cannot be `""`.
      string name = 3;
    }
  }

  // Entities are partitioned into subsets, currently identified by a project
  // ID and namespace ID.
  // Queries are scoped to a single partition.
  PartitionId partition_id = 1;

  // The entity path.
  // An entity path consists of one or more elements composed of a kind and a
  // string or numerical identifier, which identify entities. The first
  // element identifies a _root entity_, the second element identifies
  // a _child_ of the root entity, the third element identifies a child of the
  // second entity, and so forth. The entities identified by all prefixes of
  // the path are called the element's _ancestors_.
  //
  // A path can never be empty, and a path can have at most 100 elements.
  repeated PathElement path = 2;
}

// Message for a unique key indicating a record that contains a finding.
message RecordKey {
  // Type of key
  oneof type {
    // Datastore key
    DatastoreKey datastore_key = 2;

    // BigQuery key
    BigQueryKey big_query_key = 3;
  }

  // Values of identifying columns in the given row. Order of values matches
  // the order of `identifying_fields` specified in the scanning request.
  repeated string id_values = 5;
}

// Message defining the location of a BigQuery table. A table is uniquely
// identified by its project_id, dataset_id, and table_id. Within a query
// a table is often referenced with a string in the format of:
// `<project_id>:<dataset_id>.<table_id>` or
// `<project_id>.<dataset_id>.<table_id>`.
message BigQueryTable {
  // The Google Cloud Platform project ID of the project containing the table.
  // If omitted, project ID is inferred from the API call.
  string project_id = 1;

  // Dataset ID of the table.
  string dataset_id = 2;

  // Name of the table.
  string table_id = 3;
}

// Message defining a field of a BigQuery table.
message BigQueryField {
  // Source table of the field.
  BigQueryTable table = 1;

  // Designated field in the BigQuery table.
  FieldId field = 2;
}

// An entity in a dataset is a field or set of fields that correspond to a
// single person. For example, in medical records the `EntityId` might be a
// patient identifier, or for financial records it might be an account
// identifier. This message is used when generalizations or analysis must take
// into account that multiple rows correspond to the same entity.
message EntityId {
  // Composite key indicating which field contains the entity identifier.
  FieldId field = 1;
}

// Instructions regarding the table content being inspected.
message TableOptions {
  // The columns that are the primary keys for table objects included in
  // ContentItem. A copy of this cell's value will be stored alongside
  // each finding so that the finding can be traced to the specific row it came
  // from. No more than 3 may be provided.
  repeated FieldId identifying_fields = 1;
}