// xref: /aosp_15_r20/external/federated-compute/fcp/protos/opstats.proto
// (revision 14675a029014e728ec732f129a32e299b2da0601)
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package fcp.client.opstats;

import "fcp/protos/federated_api.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

// Operational stats per run.
message OperationalStats {
  // Population name.
  string population_name = 1;

  // Session name, if applicable.
  string session_name = 2;

  // Name of the task that was executed.
  string task_name = 3;

  // Timestamped training stages and error types.
  message Event {
    // Key training stages and error types.
    enum EventKind {
      // Zero value, which proto3 treats as the default for an unset enum
      // field.
      EVENT_KIND_UNRECOGNIZED = 0;

      // An eligibility task checkin attempt started. This does not
      // indicate whether the eligibility checkin request was actually sent.
      EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED = 1;

      // An eligibility task checkin response indicated that the client was
      // rejected because the client was incompatible with the population's
      // eligibility task plan.
      EVENT_KIND_ELIGIBILITY_REJECTED = 2;

      // An eligibility task checkin response indicated that eligibility task
      // plans are not configured.
      EVENT_KIND_ELIGIBILITY_DISABLED = 3;

      // An eligibility task checkin response returned an eligibility task plan
      // URI, but the client hasn't downloaded the plan and checkpoint yet. Also
      // logged when the plan/checkpoint resources were actually supplied inline
      // in the protocol response message and no actual HTTP fetch needs to
      // happen anymore. This ensures that this event can always be compared
      // against EVENT_KIND_ELIGIBILITY_ENABLED.
      EVENT_KIND_ELIGIBILITY_PLAN_URI_RECEIVED = 48;

      // An eligibility task checkin response returned an eligibility task plan,
      // and the received plan was parseable.
      EVENT_KIND_ELIGIBILITY_ENABLED = 4;

      // A plan execution started for an eligibility task.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED = 5;

      // A plan execution completed successfully for an eligibility task.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_FINISHED = 6;

      // A checkin attempt started. This does not indicate whether the checkin
      // request was actually sent.
      EVENT_KIND_CHECKIN_STARTED = 7;

      // A checkin response indicated that the client was rejected.
      EVENT_KIND_CHECKIN_REJECTED = 8;

      // A checkin response indicated that the client was accepted for a task,
      // but the client hasn't downloaded the plan and checkpoint yet. Also
      // logged when the plan/checkpoint resources were actually supplied inline
      // in the protocol response message and no actual HTTP fetch needs to
      // happen anymore. This ensures that this event can always be compared
      // against EVENT_KIND_CHECKIN_ACCEPTED.
      EVENT_KIND_CHECKIN_PLAN_URI_RECEIVED = 49;

      // A checkin response indicated that the client was accepted for a task,
      // and the received plan was parseable.
      EVENT_KIND_CHECKIN_ACCEPTED = 9;

      // A plan execution started for a normal task.
      EVENT_KIND_COMPUTATION_STARTED = 10;

      // A plan execution completed successfully for a normal task.
      EVENT_KIND_COMPUTATION_FINISHED = 11;

      // An upload attempt started. This does not indicate whether the upload
      // was actually sent.
      // Deprecated: split into EVENT_KIND_RESULT_UPLOAD_STARTED and
      // EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_UPLOAD_STARTED = 12 [deprecated = true];

      // An upload response indicated that the server successfully received the
      // client's upload. This does not guarantee that the client's results are
      // included in a round update.
      // Deprecated: split into EVENT_KIND_RESULT_UPLOAD_FINISHED and
      // EVENT_KIND_FAILURE_UPLOAD_FINISHED.
      EVENT_KIND_UPLOAD_FINISHED = 13 [deprecated = true];

      // The client interrupted training due to unmet training conditions. This
      // may occur during checkin, training, or upload.
      // Deprecated: split into EVENT_KIND_{phase}_CLIENT_INTERRUPTED, where
      // phase is one of ELIGIBILITY_CHECKIN, ELIGIBILITY_COMPUTATION, CHECKIN,
      // COMPUTATION, RESULT_UPLOAD, FAILURE_UPLOAD.
      EVENT_KIND_CLIENT_INTERRUPTED = 14 [deprecated = true];

      // The server aborted the client's connection. This may occur during
      // checkin or upload.
      // Deprecated: split into EVENT_KIND_{phase}_SERVER_ABORTED, where phase
      // is one of ELIGIBILITY_CHECKIN, CHECKIN, RESULT_UPLOAD, FAILURE_UPLOAD.
      EVENT_KIND_SERVER_ABORTED = 15 [deprecated = true];

      // An error occurred that was related to local storage access,
      // communication with the server, or an invalid plan.
      // Deprecated: split into EVENT_KIND_{phase}_ERROR_IO,
      // EVENT_KIND_{phase}_ERROR_INVALID_ARGUMENT and
      // EVENT_KIND_{phase}_ERROR_INVALID_PAYLOAD, where phase is one of
      // ELIGIBILITY_CHECKIN, CHECKIN, RESULT_UPLOAD, FAILURE_UPLOAD,
      // ELIGIBILITY_COMPUTATION, or COMPUTATION.
      EVENT_KIND_ERROR_IO = 16 [deprecated = true];

      // The TensorFlow library reported an error.
      // Deprecated: split into EVENT_KIND_{phase}_ERROR_TENSORFLOW, where phase
      // is one of ELIGIBILITY_COMPUTATION, COMPUTATION.
      EVENT_KIND_ERROR_TENSORFLOW = 17 [deprecated = true];

      // An error occurred when processing the example selector.
      // Deprecated: split into EVENT_KIND_{phase}_ERROR_EXAMPLE_ITERATOR, where
      // phase is one of ELIGIBILITY_COMPUTATION, COMPUTATION.
      EVENT_KIND_ERROR_EXAMPLE_SELECTOR = 18 [deprecated = true];

      // Indicates that training was scheduled but did not start due to runtime
      // checks (e.g. insufficient battery levels).
      EVENT_KIND_TRAIN_NOT_STARTED = 19;

      // Client issued an eligibility eval checkin request, but an IO error was
      // encountered.
      // Always preceded by EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_ERROR_IO = 20;

      // Client issued an eligibility eval checkin request, but an invalid
      // payload was received.
      // Always preceded by EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_ERROR_INVALID_PAYLOAD = 21;

      // Client issued an eligibility eval checkin request, but got interrupted
      // on the client. Always preceded by
      // EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_CLIENT_INTERRUPTED = 22;

      // Client issued an eligibility eval checkin request, but server aborted.
      // Always preceded by EVENT_KIND_ELIGIBILITY_CHECKIN_STARTED.
      EVENT_KIND_ELIGIBILITY_CHECKIN_SERVER_ABORTED = 23;

      // Client issued a regular checkin request, but got an IO error.
      // Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_ERROR_IO = 24;

      // Client issued a regular checkin request, but the server returned an
      // invalid payload.
      // Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_ERROR_INVALID_PAYLOAD = 25;

      // Client issued a regular checkin request, but got interrupted on the
      // client. Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_CLIENT_INTERRUPTED = 26;

      // Client issued a regular checkin request, but got aborted by the server.
      // Always preceded by EVENT_KIND_CHECKIN_STARTED.
      EVENT_KIND_CHECKIN_SERVER_ABORTED = 27;

      // Client encountered a TensorFlow error during eligibility eval task
      // computation.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_TENSORFLOW = 28;

      // Reading from disk failed during eligibility eval task computation.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_IO = 29;

      // Input parameters are invalid for eligibility eval task computation.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_INVALID_ARGUMENT = 30;

      // Client encountered an example selector error during eligibility eval
      // task computation. Always preceded by
      // EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_ERROR_EXAMPLE_ITERATOR = 31;

      // Eligibility eval computation was interrupted by the client.
      // Always preceded by EVENT_KIND_ELIGIBILITY_COMPUTATION_STARTED.
      EVENT_KIND_ELIGIBILITY_COMPUTATION_CLIENT_INTERRUPTED = 32;

      // A TensorFlow error was encountered during computation, or the output
      // from the computation was missing or of an unexpected type. Always
      // preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_TENSORFLOW = 33;

      // Reading from disk failed during computation.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_IO = 34;

      // Input parameters are invalid for the given computation.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_INVALID_ARGUMENT = 35;

      // An error occurred when processing the example selector.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_ERROR_EXAMPLE_ITERATOR = 36;

      // Client got interrupted during computation.
      // Always preceded by EVENT_KIND_COMPUTATION_STARTED.
      EVENT_KIND_COMPUTATION_CLIENT_INTERRUPTED = 37;

      // Client starts to upload successfully computed results.
      EVENT_KIND_RESULT_UPLOAD_STARTED = 38;

      // An error occurred during result upload.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_ERROR_IO = 39;

      // Result upload was interrupted by the client.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_CLIENT_INTERRUPTED = 40;

      // Result upload was aborted by the server.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_SERVER_ABORTED = 41;

      // Client uploaded training results to the server.
      // Always preceded by EVENT_KIND_RESULT_UPLOAD_STARTED.
      EVENT_KIND_RESULT_UPLOAD_FINISHED = 42;

      // Client starts to upload failure report.
      EVENT_KIND_FAILURE_UPLOAD_STARTED = 43;

      // An error occurred during failure report upload.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_ERROR_IO = 44;

      // Failure report upload was interrupted by the client.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_CLIENT_INTERRUPTED = 45;

      // Failure report upload was aborted by the server.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_SERVER_ABORTED = 46;

      // Client uploaded failure report to the server.
      // Always preceded by EVENT_KIND_FAILURE_UPLOAD_STARTED.
      EVENT_KIND_FAILURE_UPLOAD_FINISHED = 47;

      // Client failed to initialize a component, but execution was not halted.
      EVENT_KIND_INITIALIZATION_ERROR_NONFATAL = 50;

      // Client failed to initialize a component, and execution was halted.
      EVENT_KIND_INITIALIZATION_ERROR_FATAL = 51;
    }

    // The training stage or error type this event records.
    EventKind event_type = 1;

    // Event time.
    google.protobuf.Timestamp timestamp = 2;
  }

  // History of key training stages and errors encountered during a run. The
  // events are stored in sequential order, with the earliest event first.
  repeated Event events = 4;

  // Stats about the examples read from a given collection, potentially
  // aggregated over multiple iterators.
  message DatasetStats {
    // Total number of examples read.
    int64 num_examples_read = 1;

    // Total number of bytes read.
    int64 num_bytes_read = 2;
  }

  // Map of dataset stats keyed on the collection URI.
  map<string, DatasetStats> dataset_stats = 5;

  // If this execution failed with an error, the message of that error.
  string error_message = 6;

  // The retry window returned by the fl runner.
  google.internal.federatedml.v2.RetryWindow retry_window = 7;

  // The number of bytes downloaded (payload size via the chunking layer, which
  // may be compressed) from the server while executing the task thus far.
  int64 chunking_layer_bytes_downloaded = 10;

  // The number of bytes uploaded (payload size via the chunking layer, which
  // may be compressed) to the server while executing the task thus far.
  int64 chunking_layer_bytes_uploaded = 11;

  // The duration of time spent waiting on the network (but excluding idle time
  // like the time between polling the server).
  google.protobuf.Duration network_duration = 12;

  // Reserved field numbers (presumably from removed fields); they must not be
  // reused for new fields.
  reserved 8, 9;
}

// Top level op stats message.
message OpStatsSequence {
  // The OperationalStats messages are stored in sequential order, with the
  // earliest message first.
  repeated OperationalStats opstats = 1;

  // A timestamp that marks when we can start to trust the data in the
  // OpStatsDb. Any event that happened before this time is missing or has
  // been removed.
  google.protobuf.Timestamp earliest_trustworthy_time = 2;
}

// Selection criteria for op stats data.
// If the selection criteria are not set, all data will be used.
// If start_time is not set but end_time is set, all examples up to end_time
// will be used.
// If end_time is not set, all examples from start_time onward will be used.
// If neither start_time nor end_time is set, all examples will be used.
// If both start_time and end_time are set, the examples within the time range
// will be used.
// If last_successful_contribution is set, start_time and end_time are ignored,
// and opstats returns a single example containing the entry of the last
// successful contribution (if it exists) of the runtime to the current task. If
// there are no previous successful contributions, returns an empty iterator.
message OpStatsSelectionCriteria {
  // The lower bound (inclusive) of the last updated time for an
  // OperationalStats message.
  google.protobuf.Timestamp start_time = 1;

  // The upper bound (inclusive) of the last updated time for an
  // OperationalStats message.
  google.protobuf.Timestamp end_time = 2;

  // If set, returns the entry of the last successful contribution to the
  // current task, or no entries if there was no successful contribution.
  // `start_time` and `end_time` are ignored.
  bool last_successful_contribution = 3;
}