// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataProfileProto";
option java_package = "com.google.cloud.dataplex.v1";

// DataProfileScan related setting.
message DataProfileSpec {
  // The configuration of post scan actions of DataProfileScan job.
  message PostScanActions {
    // The configuration of BigQuery export post scan action.
    message BigQueryExport {
      // Optional. The BigQuery table to export DataProfileScan results to.
      // Format:
      // //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
      string results_table = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. If set, results will be exported to the provided BigQuery
    // table.
    BigQueryExport bigquery_export = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // The specification for fields to include or exclude in data profile scan.
  message SelectedFields {
    // Optional. Expected input is a list of fully qualified names of fields as
    // in the schema.
    //
    // Only top-level field names for nested fields are supported.
    // For instance, if 'x' is of nested field type, listing 'x' is supported
    // but 'x.y.z' is not supported. Here 'y' and 'y.z' are nested fields of
    // 'x'.
    repeated string field_names = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. The percentage of the records to be selected from the dataset for
  // DataScan.
  //
  // * Value can range between 0.0 and 100.0 with up to 3 significant decimal
  // digits.
  // * Sampling is not applied if `sampling_percent` is not specified, 0 or
  // 100.
  float sampling_percent = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A filter applied to all rows in a single DataScan job.
  // The filter needs to be a valid SQL expression for a WHERE clause in
  // BigQuery standard SQL syntax.
  // Example: col1 >= 0 AND col2 < 10
  string row_filter = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Actions to take upon job completion.
  PostScanActions post_scan_actions = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The fields to include in data profile.
  //
  // If not specified, all fields at the time of profile scan job execution are
  // included, except for ones listed in `exclude_fields`.
  SelectedFields include_fields = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The fields to exclude from data profile.
  //
  // If specified, the fields will be excluded from data profile, regardless of
  // `include_fields` value.
  SelectedFields exclude_fields = 6 [(google.api.field_behavior) = OPTIONAL];
}

// DataProfileResult defines the output of DataProfileScan. Each field of the
// table will have field type specific profile result.
message DataProfileResult {
  // Contains name, type, mode and field type specific profile information.
  message Profile {
    // A field within a table.
    message Field {
      // The profile information for each field type.
      message ProfileInfo {
        // The profile information for a string type field.
        message StringFieldInfo {
          // Minimum length of non-null values in the scanned data.
          int64 min_length = 1;

          // Maximum length of non-null values in the scanned data.
          int64 max_length = 2;

          // Average length of non-null values in the scanned data.
          double average_length = 3;
        }

        // The profile information for an integer type field.
        message IntegerFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. Three main quartiles used
          // are: The first quartile (Q1) splits off the lowest 25% of data from
          // the highest 75%. It is also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point. The second
          // quartile (Q2) is the median of a data set. So, 50% of the data lies
          // below this point. The third quartile (Q3) splits off the highest
          // 25% of data from the lowest 75%. It is known as the upper or 75th
          // empirical quartile, as 75% of the data lies below this point.
          // Here, the quartiles are provided as an ordered list of approximate
          // quartile values for the scanned data, occurring in order Q1,
          // median, Q3.
          repeated int64 quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 max = 5;
        }

        // The profile information for a double type field.
        message DoubleFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. Three main quartiles used
          // are: The first quartile (Q1) splits off the lowest 25% of data from
          // the highest 75%. It is also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point. The second
          // quartile (Q2) is the median of a data set. So, 50% of the data lies
          // below this point. The third quartile (Q3) splits off the highest
          // 25% of data from the lowest 75%. It is known as the upper or 75th
          // empirical quartile, as 75% of the data lies below this point.
          // Here, the quartiles are provided as an ordered list of quartile
          // values for the scanned data, occurring in order Q1, median, Q3.
          repeated double quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double max = 5;
        }

        // Top N non-null values in the scanned data.
        message TopNValue {
          // String value of a top N non-null value.
          string value = 1;

          // Count of the corresponding value in the scanned data.
          int64 count = 2;

          // Ratio of the corresponding value in the field against the total
          // number of rows in the scanned data.
          double ratio = 3;
        }

        // Ratio of rows with null value against total scanned rows.
        double null_ratio = 2;

        // Ratio of rows with distinct values against total scanned rows.
        // Not available for complex non-groupable field type RECORD and fields
        // with REPEATED mode.
        double distinct_ratio = 3;

        // The list of top N non-null values, frequency and ratio with which
        // they occur in the scanned data. N is 10 or equal to the number of
        // distinct values in the field, whichever is smaller. Not available for
        // complex non-groupable field type RECORD and fields with REPEATED
        // mode.
        repeated TopNValue top_n_values = 4;

        // Structural and profile information for specific field type. Not
        // available if mode is REPEATED.
        oneof field_info {
          // String type field information.
          StringFieldInfo string_profile = 101;

          // Integer type field information.
          IntegerFieldInfo integer_profile = 102;

          // Double type field information.
          DoubleFieldInfo double_profile = 103;
        }
      }

      // The name of the field.
      string name = 1;

      // The data type retrieved from the schema of the data source. For
      // instance, for a BigQuery native table, it is the [BigQuery Table
      // Schema](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablefieldschema).
      // For a Dataplex Entity, it is the [Entity
      // Schema](https://cloud.google.com/dataplex/docs/reference/rpc/google.cloud.dataplex.v1#type_3).
      string type = 2;

      // The mode of the field. Possible values include:
      //
      // * REQUIRED, if it is a required field.
      // * NULLABLE, if it is an optional field.
      // * REPEATED, if it is a repeated field.
      string mode = 3;

      // Profile information for the corresponding field.
      ProfileInfo profile = 4;
    }

    // List of fields with structural and profile information for each field.
    repeated Field fields = 2;
  }

  // The result of post scan actions of DataProfileScan job.
  message PostScanActionsResult {
    // The result of BigQuery export post scan action.
    message BigQueryExportResult {
      // Execution state for the exporting.
      enum State {
        // The exporting state is unspecified.
        STATE_UNSPECIFIED = 0;

        // The exporting completed successfully.
        SUCCEEDED = 1;

        // The exporting is no longer running due to an error.
        FAILED = 2;

        // The exporting is skipped due to no valid scan result to export
        // (usually caused by scan failed).
        SKIPPED = 3;
      }

      // Output only. Execution state for the BigQuery exporting.
      State state = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

      // Output only. Additional information about the BigQuery exporting.
      string message = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
    }

    // Output only. The result of BigQuery export post scan action.
    BigQueryExportResult bigquery_export_result = 1
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // The count of rows scanned.
  int64 row_count = 3;

  // The profile information per field.
  Profile profile = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 5;

  // Output only. The result of post scan actions.
  PostScanActionsResult post_scan_actions_result = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];
}