// xref: /aosp_15_r20/external/googleapis/google/cloud/dataplex/v1/data_quality.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataQualityProto";
option java_package = "com.google.cloud.dataplex.v1";

// File-level declaration of the BigQuery table resource type and its
// resource-name pattern.
option (google.api.resource_definition) = {
  type: "bigquery.googleapis.com/Table"
  pattern: "projects/{project}/datasets/{dataset}/tables/{table}"
};

// DataQualityScan related setting: the rules to evaluate, optional sampling
// and row filtering, and the actions to take after the scan.
message DataQualitySpec {
  // The configuration of post scan actions of DataQualityScan.
  message PostScanActions {
    // The configuration of BigQuery export post scan action.
    message BigQueryExport {
      // Optional. The BigQuery table to export DataQualityScan results to.
      // Format:
      // //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
      string results_table = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // The individuals or groups who are designated to receive notifications
    // upon triggers.
    message Recipients {
      // Optional. The email recipients who will receive the DataQualityScan
      // results report.
      repeated string emails = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // This trigger is triggered when the DQ score in the job result is less
    // than a specified input score.
    message ScoreThresholdTrigger {
      // Optional. The threshold score. The score range is in [0,100].
      float score_threshold = 2 [(google.api.field_behavior) = OPTIONAL];
    }

    // This trigger is triggered when the scan job itself fails, regardless of
    // the result.
    message JobFailureTrigger {}

    // This trigger is triggered whenever a scan job run ends, regardless
    // of the result.
    message JobEndTrigger {}

    // The configuration of notification report post scan action.
    message NotificationReport {
      // Required. The recipients who will receive the notification report.
      Recipients recipients = 1 [(google.api.field_behavior) = REQUIRED];

      // Optional. If set, report will be sent when the DQ score in the job
      // result is less than the configured score threshold.
      ScoreThresholdTrigger score_threshold_trigger = 2
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. If set, report will be sent when a scan job fails.
      JobFailureTrigger job_failure_trigger = 4
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. If set, report will be sent when a scan job ends.
      JobEndTrigger job_end_trigger = 5
          [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. If set, results will be exported to the provided BigQuery
    // table.
    BigQueryExport bigquery_export = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. If set, results will be sent to the provided notification
    // recipients upon triggers.
    NotificationReport notification_report = 2
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Required. The list of rules to evaluate against a data source. At least one
  // rule is required.
  repeated DataQualityRule rules = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. The percentage of the records to be selected from the dataset for
  // DataScan.
  //
  // * Value can range between 0.0 and 100.0 with up to 3 significant decimal
  // digits.
  // * Sampling is not applied if `sampling_percent` is not specified, 0 or
  // 100.
  float sampling_percent = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A filter applied to all rows in a single DataScan job.
  // The filter needs to be a valid SQL expression for a WHERE clause in
  // BigQuery standard SQL syntax.
  // Example: col1 >= 0 AND col2 < 10
  string row_filter = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Actions to take upon job completion.
  PostScanActions post_scan_actions = 6
      [(google.api.field_behavior) = OPTIONAL];
}

// The output of a DataQualityScan.
message DataQualityResult {
  // The result of post scan actions of DataQualityScan job.
  message PostScanActionsResult {
    // The result of BigQuery export post scan action.
    message BigQueryExportResult {
      // Execution state for the exporting.
      enum State {
        // The exporting state is unspecified.
        STATE_UNSPECIFIED = 0;

        // The exporting completed successfully.
        SUCCEEDED = 1;

        // The exporting is no longer running due to an error.
        FAILED = 2;

        // The exporting is skipped because there is no valid scan result to
        // export (usually caused by a failed scan).
        SKIPPED = 3;
      }

      // Output only. Execution state for the BigQuery exporting.
      State state = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

      // Output only. Additional information about the BigQuery exporting.
      string message = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
    }

    // Output only. The result of BigQuery export post scan action.
    BigQueryExportResult bigquery_export_result = 1
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Overall data quality result -- `true` if all rules passed.
  bool passed = 5;

  // Output only. The overall data quality score.
  //
  // The score ranges between [0, 100] (up to two decimal points).
  optional float score = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // A list of results at the dimension level.
  //
  // A dimension will have a corresponding `DataQualityDimensionResult` if and
  // only if there is at least one rule with the 'dimension' field set to it.
  repeated DataQualityDimensionResult dimensions = 2;

  // Output only. A list of results at the column level.
  //
  // A column will have a corresponding `DataQualityColumnResult` if and only if
  // there is at least one rule with the 'column' field set to it.
  repeated DataQualityColumnResult columns = 10
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // A list of all the rules in a job, and their results.
  repeated DataQualityRuleResult rules = 3;

  // The count of rows processed.
  int64 row_count = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 7;

  // Output only. The result of post scan actions.
  PostScanActionsResult post_scan_actions_result = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// DataQualityRuleResult provides a more detailed, per-rule view of the results.
message DataQualityRuleResult {
  // The rule specified in the DataQualitySpec, as is.
  DataQualityRule rule = 1;

  // Whether the rule passed or failed.
  bool passed = 7;

  // The number of rows a rule was evaluated against.
  //
  // This field is only valid for row-level type rules.
  //
  // Evaluated count can be configured to either
  //
  // * include all rows (default) - with `null` rows automatically failing rule
  // evaluation, or
  // * exclude `null` rows from the `evaluated_count`, by setting
  // `ignore_null = true` on the rule.
  int64 evaluated_count = 9;

  // The number of rows which passed a rule evaluation.
  //
  // This field is only valid for row-level type rules.
  int64 passed_count = 8;

  // The number of rows with null values in the specified column.
  int64 null_count = 5;

  // The ratio of **passed_count / evaluated_count**.
  //
  // This field is only valid for row-level type rules.
  double pass_ratio = 6;

  // The query to find rows that did not pass this rule.
  //
  // This field is only valid for row-level type rules.
  string failing_rows_query = 10;
}

// DataQualityDimensionResult provides a more detailed, per-dimension view of
// the results.
message DataQualityDimensionResult {
  // Output only. The dimension config specified in the DataQualitySpec, as is.
  DataQualityDimension dimension = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Whether the dimension passed or failed.
  bool passed = 3;

  // Output only. The dimension-level data quality score for this data scan job
  // if and only if the 'dimension' field is set.
  //
  // The score ranges between [0, 100] (up to two decimal points).
  optional float score = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A dimension captures data quality intent about a defined subset of the rules
// specified.
message DataQualityDimension {
  // The dimension name a rule belongs to. Supported dimensions are
  // ["COMPLETENESS", "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS",
  // "INTEGRITY"].
  string name = 1;
}

// A rule captures data quality intent about a data source.
//
// Exactly one member of the `rule_type` oneof selects the kind of check; the
// remaining fields (`column`, `ignore_null`, `dimension`, `threshold`, `name`,
// `description`) are settings shared across rule types.
message DataQualityRule {
  // Evaluates whether each column value lies between a specified range.
  message RangeExpectation {
    // Optional. The minimum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` need to be
    // provided.
    string min_value = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` need to be
    // provided.
    string max_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly greater than ('>') the
    // minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly less than ('<') the
    // maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value is null.
  //
  // NOTE(review): the message name suggests the rule passes for non-null
  // values; confirm the intended semantics and reword accordingly.
  message NonNullExpectation {}

  // Evaluates whether each column value is contained by a specified set.
  message SetExpectation {
    // Optional. Expected values for the column value.
    repeated string values = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value matches a specified regex.
  message RegexExpectation {
    // Optional. A regular expression the column value is expected to match.
    string regex = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether the column has duplicates.
  message UniquenessExpectation {}

  // Evaluates whether the column aggregate statistic lies between a specified
  // range.
  message StatisticRangeExpectation {
    // The list of aggregate metrics a rule can be evaluated against.
    enum ColumnStatistic {
      // Unspecified statistic type
      STATISTIC_UNDEFINED = 0;

      // Evaluate the column mean
      MEAN = 1;

      // Evaluate the column min
      MIN = 2;

      // Evaluate the column max
      MAX = 3;
    }

    // Optional. The aggregate metric to evaluate.
    ColumnStatistic statistic = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The minimum column statistic value allowed for a row to pass
    // this validation.
    //
    // At least one of `min_value` and `max_value` need to be provided.
    string min_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column statistic value allowed for a row to pass
    // this validation.
    //
    // At least one of `min_value` and `max_value` need to be provided.
    string max_value = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether column statistic needs to be strictly greater than
    // ('>') the minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether column statistic needs to be strictly less than ('<')
    // the maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 5 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each row passes the specified condition.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a boolean value per row as the result.
  //
  // Example: col1 >= 0 AND col2 < 10
  message RowConditionExpectation {
    // Optional. The SQL expression.
    string sql_expression = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether the provided expression is true.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a scalar boolean result.
  //
  // Example: MIN(col1) >= 0
  message TableConditionExpectation {
    // Optional. The SQL expression.
    string sql_expression = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // The rule-specific configuration.
  oneof rule_type {
    // Row-level rule which evaluates whether each column value lies between a
    // specified range.
    RangeExpectation range_expectation = 1;

    // Row-level rule which evaluates whether each column value is null.
    NonNullExpectation non_null_expectation = 2;

    // Row-level rule which evaluates whether each column value is contained by
    // a specified set.
    SetExpectation set_expectation = 3;

    // Row-level rule which evaluates whether each column value matches a
    // specified regex.
    RegexExpectation regex_expectation = 4;

    // Row-level rule which evaluates whether each column value is unique.
    UniquenessExpectation uniqueness_expectation = 100;

    // Aggregate rule which evaluates whether the column aggregate
    // statistic lies between a specified range.
    StatisticRangeExpectation statistic_range_expectation = 101;

    // Row-level rule which evaluates whether each row in a table passes the
    // specified condition.
    RowConditionExpectation row_condition_expectation = 200;

    // Aggregate rule which evaluates whether the provided expression is true
    // for a table.
    TableConditionExpectation table_condition_expectation = 201;
  }

  // Optional. The unnested column which this rule is evaluated against.
  string column = 500 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Rows with `null` values will automatically fail a rule, unless
  // `ignore_null` is `true`. In that case, such `null` rows are trivially
  // considered passing.
  //
  // This field is only valid for the following type of rules:
  //
  // * RangeExpectation
  // * RegexExpectation
  // * SetExpectation
  // * UniquenessExpectation
  bool ignore_null = 501 [(google.api.field_behavior) = OPTIONAL];

  // Required. The dimension a rule belongs to. Results are also aggregated at
  // the dimension level. Supported dimensions are **["COMPLETENESS",
  // "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY"]**
  string dimension = 502 [(google.api.field_behavior) = REQUIRED];

  // Optional. The minimum ratio of **passing_rows / total_rows** required to
  // pass this rule, with a range of [0.0, 1.0].
  //
  // 0 indicates default value (i.e. 1.0).
  //
  // This field is only valid for row-level type rules.
  double threshold = 503 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A mutable name for the rule.
  //
  // * The name must contain only letters (a-z, A-Z), numbers (0-9), or
  // hyphens (-).
  // * The maximum length is 63 characters.
  // * Must start with a letter.
  // * Must end with a number or a letter.
  string name = 504 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Description of the rule.
  //
  // * The maximum length is 1,024 characters.
  string description = 505 [(google.api.field_behavior) = OPTIONAL];
}

// DataQualityColumnResult provides a more detailed, per-column view of
// the results.
message DataQualityColumnResult {
  // Output only. The column specified in the DataQualityRule.
  string column = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The column-level data quality score for this data scan job if
  // and only if the 'column' field is set.
  //
  // The score ranges between [0, 100] (up to two decimal points).
  optional float score = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}