// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataProfileProto";
option java_package = "com.google.cloud.dataplex.v1";

// DataProfileScan related setting.
message DataProfileSpec {
  // The configuration of post scan actions of a DataProfileScan job.
  message PostScanActions {
    // The configuration of the BigQuery export post scan action.
    message BigQueryExport {
      // Optional. The BigQuery table to export DataProfileScan results to.
      // Format:
      // //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
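      //
      // For example (an illustrative value; the project, dataset, and table
      // names are placeholders, not defaults):
      // //bigquery.googleapis.com/projects/my-project/datasets/dataplex_scans/tables/profile_results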
      string results_table = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. If set, results will be exported to the provided BigQuery
    // table.
    BigQueryExport bigquery_export = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // The specification for fields to include or exclude in a data profile scan.
  message SelectedFields {
    // Optional. Expected input is a list of fully qualified names of fields as
    // in the schema.
    //
    // Only top-level field names for nested fields are supported.
    // For instance, if 'x' is of nested field type, listing 'x' is supported
    // but 'x.y.z' is not supported. Here 'y' and 'y.z' are nested fields of
    // 'x'.
    repeated string field_names = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. The percentage of the records to be selected from the dataset for
  // DataScan.
  //
  // * Value can range between 0.0 and 100.0 with up to 3 significant decimal
  // digits.
  // * Sampling is not applied if `sampling_percent` is not specified, 0, or
  // 100.
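  // * For example (illustrative), a `sampling_percent` of 25.5 selects
  // roughly 25.5% of the records.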
  float sampling_percent = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A filter applied to all rows in a single DataScan job.
  // The filter needs to be a valid SQL expression for a WHERE clause in
  // BigQuery standard SQL syntax.
  // Example: col1 >= 0 AND col2 < 10
  string row_filter = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Actions to take upon job completion.
  PostScanActions post_scan_actions = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The fields to include in the data profile.
  //
  // If not specified, all fields at the time of profile scan job execution are
  // included, except for the ones listed in `exclude_fields`.
  SelectedFields include_fields = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The fields to exclude from the data profile.
  //
  // If specified, the fields will be excluded from the data profile,
  // regardless of the `include_fields` value.
  SelectedFields exclude_fields = 6 [(google.api.field_behavior) = OPTIONAL];
}

// DataProfileResult defines the output of DataProfileScan. Each field of the
// table will have a field-type-specific profile result.
message DataProfileResult {
  // Contains the name, type, mode, and field-type-specific profile
  // information.
  message Profile {
    // A field within a table.
    message Field {
      // The profile information for each field type.
      message ProfileInfo {
        // The profile information for a string type field.
        message StringFieldInfo {
          // Minimum length of non-null values in the scanned data.
          int64 min_length = 1;

          // Maximum length of non-null values in the scanned data.
          int64 max_length = 2;

          // Average length of non-null values in the scanned data.
          double average_length = 3;
        }

        // The profile information for an integer type field.
        message IntegerFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. The three main quartiles
          // are: the first quartile (Q1) splits off the lowest 25% of data
          // from the highest 75% and is also known as the lower or 25th
          // empirical quartile, as 25% of the data is below this point; the
          // second quartile (Q2) is the median of the data set, so 50% of
          // the data lies below this point; the third quartile (Q3) splits
          // off the highest 25% of data from the lowest 75% and is known as
          // the upper or 75th empirical quartile, as 75% of the data lies
          // below this point. Here, the quartiles are provided as an ordered
          // list of approximate quartile values for the scanned data,
          // occurring in the order Q1, median, Q3.
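          //
          // For example (illustrative values only): quartiles of
          // [10, 20, 30] would mean that roughly 25% of the scanned values
          // are below 10, 50% are below 20, and 75% are below 30.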
          repeated int64 quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 max = 5;
        }

        // The profile information for a double type field.
        message DoubleFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. The three main quartiles
          // are: the first quartile (Q1) splits off the lowest 25% of data
          // from the highest 75% and is also known as the lower or 25th
          // empirical quartile, as 25% of the data is below this point; the
          // second quartile (Q2) is the median of the data set, so 50% of
          // the data lies below this point; the third quartile (Q3) splits
          // off the highest 25% of data from the lowest 75% and is known as
          // the upper or 75th empirical quartile, as 75% of the data lies
          // below this point. Here, the quartiles are provided as an ordered
          // list of quartile values for the scanned data, occurring in the
          // order Q1, median, Q3.
          repeated double quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double max = 5;
        }

        // Top N non-null values in the scanned data.
        message TopNValue {
          // String value of a top N non-null value.
          string value = 1;

          // Count of the corresponding value in the scanned data.
          int64 count = 2;

          // Ratio of the corresponding value in the field against the total
          // number of rows in the scanned data.
          double ratio = 3;
        }

        // Ratio of rows with a null value against total scanned rows.
        double null_ratio = 2;

        // Ratio of rows with distinct values against total scanned rows.
        // Not available for the complex non-groupable field type RECORD and
        // fields with REPEATED mode.
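        //
        // For example (illustrative): a `distinct_ratio` of 1.0 means every
        // scanned value of the field is unique, while 0.5 means that, on
        // average, each distinct value appears twice.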
        double distinct_ratio = 3;

        // The list of top N non-null values, and the frequency and ratio
        // with which they occur in the scanned data. N is 10 or equal to the
        // number of distinct values in the field, whichever is smaller. Not
        // available for the complex non-groupable field type RECORD and
        // fields with REPEATED mode.
        repeated TopNValue top_n_values = 4;

        // Structural and profile information for the specific field type. Not
        // available if the mode is REPEATED.
        oneof field_info {
          // String type field information.
          StringFieldInfo string_profile = 101;

          // Integer type field information.
          IntegerFieldInfo integer_profile = 102;

          // Double type field information.
          DoubleFieldInfo double_profile = 103;
        }
      }

      // The name of the field.
      string name = 1;

      // The data type retrieved from the schema of the data source. For
      // instance, for a BigQuery native table, it is the [BigQuery Table
      // Schema](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablefieldschema).
      // For a Dataplex Entity, it is the [Entity
      // Schema](https://cloud.google.com/dataplex/docs/reference/rpc/google.cloud.dataplex.v1#type_3).
      string type = 2;

      // The mode of the field. Possible values include:
      //
      // * REQUIRED, if it is a required field.
      // * NULLABLE, if it is an optional field.
      // * REPEATED, if it is a repeated field.
      string mode = 3;

      // Profile information for the corresponding field.
      ProfileInfo profile = 4;
    }

    // List of fields with structural and profile information for each field.
    repeated Field fields = 2;
  }

  // The result of post scan actions of a DataProfileScan job.
  message PostScanActionsResult {
    // The result of the BigQuery export post scan action.
    message BigQueryExportResult {
      // Execution state for the exporting.
      enum State {
        // The exporting state is unspecified.
        STATE_UNSPECIFIED = 0;

        // The exporting completed successfully.
        SUCCEEDED = 1;

        // The exporting is no longer running due to an error.
        FAILED = 2;

        // The exporting is skipped because there was no valid scan result to
        // export (usually because the scan failed).
        SKIPPED = 3;
      }

      // Output only. Execution state for the BigQuery exporting.
      State state = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

      // Output only. Additional information about the BigQuery exporting.
      string message = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
    }

    // Output only. The result of the BigQuery export post scan action.
    BigQueryExportResult bigquery_export_result = 1
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // The count of rows scanned.
  int64 row_count = 3;

  // The profile information per field.
  Profile profile = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 5;

  // Output only. The result of post scan actions.
  PostScanActionsResult post_scan_actions_result = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];
}