// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package fcp.client.opstats;

import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "fcp/protos/federated_api.proto";

// Operational stats per run.
message OperationalStats {
  // Population name.
  string population_name = 1;

  // Session name, if applicable.
  string session_name = 2;

  // Name of the task that was executed.
  string task_name = 3;

  // Timestamped training stages and error types.
  message Event {
    // Key training stages and error types.
    enum EventKind {
      // Default (zero) value, used when no event kind has been set.
      EVENT_KIND_UNRECOGNIZED = 0;

      // An eligibility task checkin attempt started. This does not
      // indicate whether the eligibility checkin request was actually sent.
      EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED = 1;

      // An eligibility task checkin response indicated that the client was
      // rejected because the client was incompatible with the population's
      // eligibility task plan.
      EVENT_KIND_ELIGIBILITY_REJECTED = 2;

      // An eligibility task checkin response indicated that eligibility task
      // plans are not configured.
      EVENT_KIND_ELIGIBILITY_DISABLED = 3;

      // An eligibility task checkin response returned an eligibility task
      // plan URI, but the client hasn't downloaded the plan and checkpoint
      // yet. Also logged when the plan/checkpoint resources were actually
      // supplied inline in the protocol response message and no actual HTTP
      // fetch needs to happen anymore. This ensures that this event can
      // always be compared against EVENT_KIND_ELIGIBILITY_ENABLED.
      EVENT_KIND_ELIGIBILITY_PLAN_URI_RECEIVED = 48;

      // An eligibility task checkin response returned an eligibility task
      // plan, and the received plan was parseable.
      EVENT_KIND_ELIGIBILITY_ENABLED = 4;

      // A plan execution started for an eligibility task.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED = 5;

      // A plan execution completed successfully for an eligibility task.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_FINISHED = 6;

      // A checkin attempt started. This does not indicate whether the checkin
      // request was actually sent.
      EVENT_KIND_CHECKIN_STARTED = 7;

      // A checkin response indicated that the client was rejected.
      EVENT_KIND_CHECKIN_REJECTED = 8;

      // A checkin response indicated that the client was accepted for a task,
      // but the client hasn't downloaded the plan and checkpoint yet. Also
      // logged when the plan/checkpoint resources were actually supplied
      // inline in the protocol response message and no actual HTTP fetch
      // needs to happen anymore. This ensures that this event can always be
      // compared against EVENT_KIND_CHECKIN_ACCEPTED.
      EVENT_KIND_CHECKIN_PLAN_URI_RECEIVED = 49;

      // A checkin response indicated that the client was accepted for a task,
      // and the received plan was parseable.
      EVENT_KIND_CHECKIN_ACCEPTED = 9;

      // A plan execution started for a normal task.
      EVENT_KIND_COMPUTATION_STARTED = 10;

      // A plan execution completed successfully for a normal task.
      EVENT_KIND_COMPUTATION_FINISHED = 11;

      // An upload attempt started. This does not indicate whether the upload
      // was actually sent.
      // Deprecated: split into EVENT_KIND_RESULT_UPLOAD_STARTED and
      // EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_UPLOAD_STARTED = 12 [deprecated = true];

      // An upload response indicated that the server successfully received
      // the client's upload. This does not guarantee that the client's
      // results are included in a round update.
      // Deprecated: split into EVENT_KIND_RESULT_UPLOAD_FINISHED and
      // EVENT_KIND_FAILURE_UPLOAD_FINISHED.
      EVENT_KIND_UPLOAD_FINISHED = 13 [deprecated = true];

      // The client interrupted training due to unmet training conditions.
      // This may occur during checkin, training, or upload.
      // Deprecated: split into EVENT_KIND_{phase}_CLIENT_INTERRUPTED, where
      // phase is one of ELIGIBILITY_CHECKIN, ELIGIBILITY_COMPUTATION, CHECKIN,
      // COMPUTATION, RESULT_UPLOAD, FAILURE_UPLOAD.
      EVENT_KIND_CLIENT_INTERRUPTED = 14 [deprecated = true];

      // The server aborted the client's connection. This may occur during
      // checkin or upload.
      // Deprecated: split into EVENT_KIND_{phase}_SERVER_ABORTED, where phase
      // is one of ELIGIBILITY_CHECKIN, CHECKIN, RESULT_UPLOAD, FAILURE_UPLOAD.
      EVENT_KIND_SERVER_ABORTED = 15 [deprecated = true];

      // An error occurred that was related to local storage access,
      // communication with the server, or an invalid plan.
      // Deprecated: split into EVENT_KIND_{phase}_ERROR_IO,
      // EVENT_KIND_{phase}_ERROR_INVALID_ARGUMENT and
      // EVENT_KIND_{phase}_ERROR_INVALID_PAYLOAD, where phase is one of
      // ELIGIBILITY_CHECKIN, CHECKIN, RESULT_UPLOAD, FAILURE_UPLOAD,
      // ELIGIBILITY_COMPUTATION, or COMPUTATION.
      EVENT_KIND_ERROR_IO = 16 [deprecated = true];

      // The TensorFlow library reported an error.
      // Deprecated: split into EVENT_KIND_{phase}_ERROR_TENSORFLOW, where
      // phase is one of ELIGIBILITY_COMPUTATION, COMPUTATION.
      EVENT_KIND_ERROR_TENSORFLOW = 17 [deprecated = true];

      // An error occurred when processing the example selector.
      // Deprecated: split into EVENT_KIND_{phase}_ERROR_EXAMPLE_ITERATOR, where
      // phase is one of ELIGIBILITY_COMPUTATION, COMPUTATION.
      EVENT_KIND_ERROR_EXAMPLE_SELECTOR = 18 [deprecated = true];

      // Indicates that training was scheduled but did not start due to runtime
      // checks (e.g. insufficient battery levels).
      EVENT_KIND_TRAIN_NOT_STARTED = 19;

      // Client issued an eligibility eval checkin request, but an IO error was
      // encountered.
      // Always preceded by EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_ERROR_IO = 20;

      // Client issued an eligibility eval checkin request, but an invalid
      // payload was received.
      // Always preceded by EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_ERROR_INVALID_PAYLOAD = 21;

      // Client issued an eligibility eval checkin request, but got interrupted
      // on the client. Always preceded by
      // EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_CLIENT_INTERRUPTED = 22;

      // Client issued an eligibility eval checkin request, but server aborted.
      // Always preceded by EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_SERVER_ABORTED = 23;

      // Client issued a regular checkin request, but got an IO error.
      // Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_ERROR_IO = 24;

      // Client issued a regular checkin request, but the server returned an
      // invalid payload.
      // Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_ERROR_INVALID_PAYLOAD = 25;

      // Client issued a regular checkin request, but got interrupted on the
      // client. Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_CLIENT_INTERRUPTED = 26;

      // Client issued a regular checkin request, but got aborted by the
      // server. Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_SERVER_ABORTED = 27;

      // Client encountered a TensorFlow error during eligibility eval task
      // computation.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_TENSORFLOW = 28;

      // Reading from disk failed during eligibility eval task computation.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_IO = 29;

      // Input parameters are invalid for eligibility eval task computation.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_INVALID_ARGUMENT = 30;

      // Client encountered an example selector error during eligibility eval
      // task computation. Always preceded by
      // EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_EXAMPLE_ITERATOR = 31;

      // Eligibility eval computation was interrupted by the client.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_CLIENT_INTERRUPTED = 32;

      // A TensorFlow error was encountered during computation, or the output
      // from the computation was missing or of an unexpected type. Always
      // preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_TENSORFLOW = 33;

      // Reading from disk failed during computation.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_IO = 34;

      // Input parameters are invalid for the given computation.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_INVALID_ARGUMENT = 35;

      // An error occurred when processing the example selector.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_EXAMPLE_ITERATOR = 36;

      // Client got interrupted during computation.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_CLIENT_INTERRUPTED = 37;

      // Client starts to upload successfully computed results.
      EVENT_KIND_RESULT_UPLOAD_STARTED = 38;

      // An error occurred during upload.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_ERROR_IO = 39;

      // Upload was interrupted by the client.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_CLIENT_INTERRUPTED = 40;

      // Upload was aborted by the server.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_SERVER_ABORTED = 41;

      // Client uploaded training results to the server.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_FINISHED = 42;

      // Client starts to upload failure report.
      EVENT_KIND_FAILURE_UPLOAD_STARTED = 43;

      // An error occurred during upload.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_ERROR_IO = 44;

      // Upload was interrupted by the client.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_CLIENT_INTERRUPTED = 45;

      // Upload was aborted by the server.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_SERVER_ABORTED = 46;

      // Client uploaded failure report to the server.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_FINISHED = 47;

      // Client failed to initialize a component, but execution was not halted.
      EVENT_KIND_INITIALIZATION_ERROR_NONFATAL = 50;

      // Client failed to initialize a component, and execution was halted.
      EVENT_KIND_INITIALIZATION_ERROR_FATAL = 51;
    }

    // The training stage or error type recorded by this event.
    EventKind event_type = 1;

    // Event time.
    google.protobuf.Timestamp timestamp = 2;
  }

  // History of key training stages and errors encountered during a run. The
  // events are stored in sequential order, with the earliest event first.
  repeated Event events = 4;

  // Stats about the examples read from a given collection, potentially
  // aggregated over multiple iterators.
  message DatasetStats {
    // Total number of examples read.
    int64 num_examples_read = 1;

    // Total number of bytes read.
    int64 num_bytes_read = 2;
  }

  // Map of dataset stats keyed on the collection URI.
  map<string, DatasetStats> dataset_stats = 5;

  // If this execution failed with an error, the message of that error.
  string error_message = 6;

  // The retry window returned by the fl runner.
  google.internal.federatedml.v2.RetryWindow retry_window = 7;

  // The number of bytes downloaded (payload size via the chunking layer, which
  // may be compressed) from the server while executing the task thus far.
  int64 chunking_layer_bytes_downloaded = 10;

  // The number of bytes uploaded (payload size via the chunking layer, which
  // may be compressed) to the server while executing the task thus far.
  int64 chunking_layer_bytes_uploaded = 11;

  // The duration of time spent waiting on the network (but excluding idle time
  // like the time between polling the server).
  google.protobuf.Duration network_duration = 12;

  // Field numbers 8 and 9 are reserved and must not be reused.
  reserved 8, 9;
}

// Top level op stats message.
message OpStatsSequence {
  // The OperationalStats messages are stored in sequential order, with the
  // earliest message first.
  repeated OperationalStats opstats = 1;
  // A timestamp that marks when we can start to trust the data in the
  // OpStatsDb. Any event that happened before this time is missing or has
  // been removed.
  google.protobuf.Timestamp earliest_trustworthy_time = 2;
}

// Selection criteria for op stats data.
// If the selection criteria are not set, all data will be used.
// If start_time is not set but end_time is set, all examples up to end_time
// will be used.
// If end_time is not set, all examples after start_time will be used.
// If neither start_time nor end_time are set, all examples will be used.
// If both start_time and end_time are set, the examples within the time range
// will be used.
// If last_successful_contribution is set, start_time and end_time are ignored,
// and opstats returns a single example containing the entry of the last
// successful contribution (if it exists) of the runtime to the current task. If
// there are no previous successful contributions, returns an empty iterator.
message OpStatsSelectionCriteria {
  // The lower bound (inclusive) of the last updated time for an
  // OperationalStats message.
  google.protobuf.Timestamp start_time = 1;
  // The upper bound (inclusive) of the last updated time for an
  // OperationalStats message.
  google.protobuf.Timestamp end_time = 2;
  // If set, returns the entry of the last successful contribution to the
  // current task, or no entries if there was no successful contribution.
  // `start_time` and `end_time` are ignored.
  bool last_successful_contribution = 3;
}