// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataQualityProto";
option java_package = "com.google.cloud.dataplex.v1";
option (google.api.resource_definition) = {
  type: "bigquery.googleapis.com/Table"
  pattern: "projects/{project}/datasets/{dataset}/tables/{table}"
};

// DataQualityScan related setting.
message DataQualitySpec {
  // The configuration of post scan actions of DataQualityScan.
  message PostScanActions {
    // The configuration of BigQuery export post scan action.
    message BigQueryExport {
      // Optional. The BigQuery table to export DataQualityScan results to.
      // Format:
      // //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
      string results_table = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // The individuals or groups who are designated to receive notifications
    // upon triggers.
    message Recipients {
      // Optional. The email recipients who will receive the DataQualityScan
      // results report.
      repeated string emails = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // This trigger is triggered when the DQ score in the job result is less
    // than a specified input score.
    message ScoreThresholdTrigger {
      // Optional. The score range is in [0,100].
      float score_threshold = 2 [(google.api.field_behavior) = OPTIONAL];
    }

    // This trigger is triggered when the scan job itself fails, regardless of
    // the result.
    message JobFailureTrigger {}

    // This trigger is triggered whenever a scan job run ends, regardless
    // of the result.
    message JobEndTrigger {}

    // The configuration of notification report post scan action.
    message NotificationReport {
      // Required. The recipients who will receive the notification report.
      Recipients recipients = 1 [(google.api.field_behavior) = REQUIRED];

      // Optional. If set, report will be sent when score threshold is met.
      ScoreThresholdTrigger score_threshold_trigger = 2
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. If set, report will be sent when a scan job fails.
      JobFailureTrigger job_failure_trigger = 4
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. If set, report will be sent when a scan job ends.
      JobEndTrigger job_end_trigger = 5
          [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. If set, results will be exported to the provided BigQuery
    // table.
    BigQueryExport bigquery_export = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. If set, results will be sent to the provided notification
    // recipients upon triggers.
    NotificationReport notification_report = 2
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Required. The list of rules to evaluate against a data source. At least one
  // rule is required.
  repeated DataQualityRule rules = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. The percentage of the records to be selected from the dataset for
  // DataScan.
  //
  // * Value can range between 0.0 and 100.0 with up to 3 significant decimal
  //   digits.
  // * Sampling is not applied if `sampling_percent` is not specified, 0 or
  //   100.
  float sampling_percent = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A filter applied to all rows in a single DataScan job.
  // The filter needs to be a valid SQL expression for a WHERE clause in
  // BigQuery standard SQL syntax.
  // Example: col1 >= 0 AND col2 < 10
  string row_filter = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Actions to take upon job completion.
  PostScanActions post_scan_actions = 6
      [(google.api.field_behavior) = OPTIONAL];
}

// The output of a DataQualityScan.
message DataQualityResult {
  // The result of post scan actions of DataQualityScan job.
  message PostScanActionsResult {
    // The result of BigQuery export post scan action.
    message BigQueryExportResult {
      // Execution state for the exporting.
      enum State {
        // The exporting state is unspecified.
        STATE_UNSPECIFIED = 0;

        // The exporting completed successfully.
        SUCCEEDED = 1;

        // The exporting is no longer running due to an error.
        FAILED = 2;

        // The exporting is skipped due to no valid scan result to export
        // (usually caused by scan failed).
        SKIPPED = 3;
      }

      // Output only. Execution state for the BigQuery exporting.
      State state = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

      // Output only. Additional information about the BigQuery exporting.
      string message = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
    }

    // Output only. The result of BigQuery export post scan action.
    BigQueryExportResult bigquery_export_result = 1
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Overall data quality result -- `true` if all rules passed.
  bool passed = 5;

  // Output only. The overall data quality score.
  //
  // The score ranges between [0, 100] (up to two decimal points).
  optional float score = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // A list of results at the dimension level.
  //
  // A dimension will have a corresponding `DataQualityDimensionResult` if and
  // only if there is at least one rule with the 'dimension' field set to it.
  repeated DataQualityDimensionResult dimensions = 2;

  // Output only. A list of results at the column level.
  //
  // A column will have a corresponding `DataQualityColumnResult` if and only if
  // there is at least one rule with the 'column' field set to it.
  repeated DataQualityColumnResult columns = 10
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // A list of all the rules in a job, and their results.
  repeated DataQualityRuleResult rules = 3;

  // The count of rows processed.
  int64 row_count = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 7;

  // Output only. The result of post scan actions.
  PostScanActionsResult post_scan_actions_result = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// DataQualityRuleResult provides a more detailed, per-rule view of the results.
message DataQualityRuleResult {
  // The rule specified in the DataQualitySpec, as is.
  DataQualityRule rule = 1;

  // Whether the rule passed or failed.
  bool passed = 7;

  // The number of rows a rule was evaluated against.
  //
  // This field is only valid for row-level type rules.
  //
  // Evaluated count can be configured to either
  //
  // * include all rows (default) - with `null` rows automatically failing rule
  //   evaluation, or
  // * exclude `null` rows from the `evaluated_count`, by setting
  //   `ignore_null = true`.
  int64 evaluated_count = 9;

  // The number of rows which passed a rule evaluation.
  //
  // This field is only valid for row-level type rules.
  int64 passed_count = 8;

  // The number of rows with null values in the specified column.
  int64 null_count = 5;

  // The ratio of **passed_count / evaluated_count**.
  //
  // This field is only valid for row-level type rules.
  double pass_ratio = 6;

  // The query to find rows that did not pass this rule.
  //
  // This field is only valid for row-level type rules.
  string failing_rows_query = 10;
}

// DataQualityDimensionResult provides a more detailed, per-dimension view of
// the results.
message DataQualityDimensionResult {
  // Output only. The dimension config specified in the DataQualitySpec, as is.
  DataQualityDimension dimension = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Whether the dimension passed or failed.
  bool passed = 3;

  // Output only. The dimension-level data quality score for this data scan job
  // if and only if the 'dimension' field is set.
  //
  // The score ranges between [0, 100] (up to two decimal
  // points).
  optional float score = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A dimension captures data quality intent about a defined subset of the rules
// specified.
message DataQualityDimension {
  // The dimension name a rule belongs to. Supported dimensions are
  // ["COMPLETENESS", "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS",
  // "INTEGRITY"]
  string name = 1;
}

// A rule captures data quality intent about a data source.
message DataQualityRule {
  // Evaluates whether each column value lies between a specified range.
  message RangeExpectation {
    // Optional. The minimum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` needs to be
    // provided.
    string min_value = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` needs to be
    // provided.
    string max_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly greater than ('>') the
    // minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly less than ('<') the
    // maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value is null.
  message NonNullExpectation {}

  // Evaluates whether each column value is contained by a specified set.
  message SetExpectation {
    // Optional. Expected values for the column value.
    repeated string values = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value matches a specified regex.
  message RegexExpectation {
    // Optional. A regular expression the column value is expected to match.
    string regex = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether the column has duplicates.
  message UniquenessExpectation {}

  // Evaluates whether the column aggregate statistic lies between a specified
  // range.
  message StatisticRangeExpectation {
    // The list of aggregate metrics a rule can be evaluated against.
    enum ColumnStatistic {
      // Unspecified statistic type
      STATISTIC_UNDEFINED = 0;

      // Evaluate the column mean
      MEAN = 1;

      // Evaluate the column min
      MIN = 2;

      // Evaluate the column max
      MAX = 3;
    }

    // Optional. The aggregate metric to evaluate.
    ColumnStatistic statistic = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The minimum column statistic value allowed for a row to pass
    // this validation.
    //
    // At least one of `min_value` and `max_value` needs to be provided.
    string min_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column statistic value allowed for a row to pass
    // this validation.
    //
    // At least one of `min_value` and `max_value` needs to be provided.
    string max_value = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether column statistic needs to be strictly greater than
    // ('>') the minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether column statistic needs to be strictly less than ('<')
    // the maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 5 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each row passes the specified condition.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a boolean value per row as the result.
  //
  // Example: col1 >= 0 AND col2 < 10
  message RowConditionExpectation {
    // Optional. The SQL expression.
    string sql_expression = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether the provided expression is true.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a scalar boolean result.
  //
  // Example: MIN(col1) >= 0
  message TableConditionExpectation {
    // Optional. The SQL expression.
    string sql_expression = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // The rule-specific configuration.
  oneof rule_type {
    // Row-level rule which evaluates whether each column value lies between a
    // specified range.
    RangeExpectation range_expectation = 1;

    // Row-level rule which evaluates whether each column value is null.
    NonNullExpectation non_null_expectation = 2;

    // Row-level rule which evaluates whether each column value is contained by
    // a specified set.
    SetExpectation set_expectation = 3;

    // Row-level rule which evaluates whether each column value matches a
    // specified regex.
    RegexExpectation regex_expectation = 4;

    // Row-level rule which evaluates whether each column value is unique.
    UniquenessExpectation uniqueness_expectation = 100;

    // Aggregate rule which evaluates whether the column aggregate
    // statistic lies between a specified range.
    StatisticRangeExpectation statistic_range_expectation = 101;

    // Row-level rule which evaluates whether each row in a table passes the
    // specified condition.
    RowConditionExpectation row_condition_expectation = 200;

    // Aggregate rule which evaluates whether the provided expression is true
    // for a table.
    TableConditionExpectation table_condition_expectation = 201;
  }

  // Optional. The unnested column which this rule is evaluated against.
  string column = 500 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Rows with `null` values will automatically fail a rule, unless
  // `ignore_null` is `true`. In that case, such `null` rows are trivially
  // considered passing.
  //
  // This field is only valid for the following type of rules:
  //
  // * RangeExpectation
  // * RegexExpectation
  // * SetExpectation
  // * UniquenessExpectation
  bool ignore_null = 501 [(google.api.field_behavior) = OPTIONAL];

  // Required. The dimension a rule belongs to. Results are also aggregated at
  // the dimension level. Supported dimensions are **["COMPLETENESS",
  // "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY"]**
  string dimension = 502 [(google.api.field_behavior) = REQUIRED];

  // Optional. The minimum ratio of **passing_rows / total_rows** required to
  // pass this rule, with a range of [0.0, 1.0].
  //
  // 0 indicates default value (i.e. 1.0).
  //
  // This field is only valid for row-level type rules.
  double threshold = 503 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A mutable name for the rule.
  //
  // * The name must contain only letters (a-z, A-Z), numbers (0-9), or
  //   hyphens (-).
  // * The maximum length is 63 characters.
  // * Must start with a letter.
  // * Must end with a number or a letter.
  string name = 504 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Description of the rule.
  //
  // * The maximum length is 1,024 characters.
  string description = 505 [(google.api.field_behavior) = OPTIONAL];
}

// DataQualityColumnResult provides a more detailed, per-column view of
// the results.
message DataQualityColumnResult {
  // Output only. The column specified in the DataQualityRule.
  string column = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The column-level data quality score for this data scan job if
  // and only if the 'column' field is set.
  //
  // The score ranges between [0, 100] (up to two decimal
  // points).
  optional float score = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}