xref: /aosp_15_r20/external/federated-compute/fcp/client/event_publisher.h (revision 14675a029014e728ec732f129a32e299b2da0601)
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef FCP_CLIENT_EVENT_PUBLISHER_H_
17 #define FCP_CLIENT_EVENT_PUBLISHER_H_
18 
19 #include <cstdint>
20 #include <string>
21 
22 #include "absl/container/flat_hash_map.h"
23 #include "absl/status/status.h"
24 #include "absl/strings/string_view.h"
25 #include "absl/time/time.h"
26 #include "fcp/client/stats.h"
27 
28 namespace fcp {
29 namespace client {
30 
31 class SecAggEventPublisher;
32 
33 // An interface for publishing events that occur during training. This is a
34 // separate interface from LogManager because the reported events will typically
35 // be both reported to a cloud monitoring backend and to the Federated server as
36 // part of publishing results.
37 // All methods in here either succeed with OK, or fail with INVALID_ARGUMENT.
38 class EventPublisher {
39  public:
40   virtual ~EventPublisher() = default;
41 
42   // Publishes that the device is about to issue an eligibility eval check in
43   // with the server.
44   virtual void PublishEligibilityEvalCheckin() = 0;
45 
46   // Publishes that the device has finished its eligibility eval checkin with
47   // the server, and received the URIs to download the eligibility eval plan
48   // with, but hasn't actually downloaded them yet, along with information
49   // how much data was transferred up to this point and how long that took.
50   virtual void PublishEligibilityEvalPlanUriReceived(
51       const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
52 
53   // Publishes that the device has finished its eligibility eval checkin with
54   // the server, and received an eligibility eval plan, along with information
55   // how much data was transferred and how long that took.
56   virtual void PublishEligibilityEvalPlanReceived(
57       const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
58 
59   // Publishes that the server did not return an eligibility eval task to the
60   // client, along with information how much data was transferred and how long
61   // that took.
62   virtual void PublishEligibilityEvalNotConfigured(
63       const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
64 
65   // Publishes that the server rejected the device's eligibility eval checkin,
66   // along with information how much data was downloaded and how long that took.
67   virtual void PublishEligibilityEvalRejected(
68       const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
69 
70   // Publishes that the device is about to check in with the server.
71   virtual void PublishCheckin() = 0;
72 
73   // Publishes that the device has finished checking in with the server, along
74   // with information how much data was downloaded and how long that took.
75   virtual void PublishCheckinFinished(const NetworkStats& network_stats,
76                                       absl::Duration phase_duration) = 0;
77 
78   // Publishes that the server rejected the device.
79   virtual void PublishRejected() = 0;
80 
81   // Publishes that the device is about to report the results of a federated
82   // computation to the server.
83   virtual void PublishReportStarted(int64_t report_size_bytes) = 0;
84 
85   // Publishes that the device has successfully reported its results to the
86   // server and received instructions on when to reconnect.
87   virtual void PublishReportFinished(const NetworkStats& network_stats,
88                                      absl::Duration report_duration) = 0;
89 
90   // Publishes that plan execution has started.
91   virtual void PublishPlanExecutionStarted() = 0;
92 
93   // Publishes a TensorFlow error that happened in the given ClientExecution.
94   virtual void PublishTensorFlowError(int example_count,
95                                       absl::string_view error_message) = 0;
96 
97   // Publishes an I/O error (e.g. disk, network) that happened in the given
98   // ClientExecution.
99   virtual void PublishIoError(absl::string_view error_message) = 0;
100 
101   // Publishes an ExampleSelector error from the given ClientExecution.
102   virtual void PublishExampleSelectorError(int example_count,
103                                            absl::string_view error_message) = 0;
104 
105   // Publishes an interruption event for the given client execution.
106   virtual void PublishInterruption(const ExampleStats& example_stats,
107                                    absl::Time start_time) = 0;
108 
109   // Publishes an event that plan execution is complete.
110   virtual void PublishPlanCompleted(const ExampleStats& example_stats,
111                                     absl::Time start_time) = 0;
112   // Publishes that the task didn't start.
113   virtual void PublishTaskNotStarted(absl::string_view error_message) = 0;
114 
115   // Publishes that the federated compute runtime failed to initialize a
116   // noncritical component, but execution continued.
117   virtual void PublishNonfatalInitializationError(
118       absl::string_view error_message) = 0;
119   // Publishes that the federated compute runtime failed to initialize a
120   // component, and execution was halted.
121   virtual void PublishFatalInitializationError(
122       absl::string_view error_message) = 0;
123 
124   // Publish that an IO error was encountered during eligibility eval check-in.
125   virtual void PublishEligibilityEvalCheckinIoError(
126       absl::string_view error_message, const NetworkStats& network_stats,
127       absl::Duration phase_duration) = 0;
128   // Publish that the eligibility eval check-in is interrupted by the client.
129   virtual void PublishEligibilityEvalCheckinClientInterrupted(
130       absl::string_view error_message, const NetworkStats& network_stats,
131       absl::Duration phase_duration) = 0;
132   // Publish that the eligibility eval check-in is aborted by the server.
133   virtual void PublishEligibilityEvalCheckinServerAborted(
134       absl::string_view error_message, const NetworkStats& network_stats,
135       absl::Duration phase_duration) = 0;
136   // Publish that the eligibility eval check-in returned an invalid payload.
137   virtual void PublishEligibilityEvalCheckinErrorInvalidPayload(
138       absl::string_view error_message, const NetworkStats& network_stats,
139       absl::Duration phase_duration) = 0;
140   // Publish an eligibility eval task starts computation.
141   virtual void PublishEligibilityEvalComputationStarted() = 0;
142   // Publish that the eligibility eval task is invalid.
143   virtual void PublishEligibilityEvalComputationInvalidArgument(
144       absl::string_view error_message, const ExampleStats& example_stats,
145       absl::Duration phase_duration) = 0;
146   // Publish an example iterator error occurred during eligibility eval task.
147   virtual void PublishEligibilityEvalComputationExampleIteratorError(
148       absl::string_view error_message, const ExampleStats& example_stats,
149       absl::Duration phase_duration) = 0;
150   // Publish that a tensorflow error occurred during eligibility eval task.
151   virtual void PublishEligibilityEvalComputationTensorflowError(
152       absl::string_view error_message, const ExampleStats& example_stats,
153       absl::Duration phase_duration) = 0;
154   // Publish that the client has interrupted the eligibility eval task.
155   virtual void PublishEligibilityEvalComputationInterrupted(
156       absl::string_view error_message, const ExampleStats& example_stats,
157       absl::Duration phase_duration) = 0;
158   // Publish an eligibility eval task finished.
159   virtual void PublishEligibilityEvalComputationCompleted(
160       const ExampleStats& example_stats, absl::Duration phase_duration) = 0;
161   // Publish an IO error occurred during regular check-in.
162   virtual void PublishCheckinIoError(absl::string_view error_message,
163                                      const NetworkStats& network_stats,
164                                      absl::Duration phase_duration) = 0;
165   // Publish that the client interrupted the regular check-in.
166   virtual void PublishCheckinClientInterrupted(
167       absl::string_view error_message, const NetworkStats& network_stats,
168       absl::Duration phase_duration) = 0;
169   // Publish that the server aborted the regular check-in.
170   virtual void PublishCheckinServerAborted(absl::string_view error_message,
171                                            const NetworkStats& network_stats,
172                                            absl::Duration phase_duration) = 0;
173   // Publish that an invalid payload was downloaded from the regular check-in.
174   virtual void PublishCheckinInvalidPayload(absl::string_view error_message,
175                                             const NetworkStats& network_stats,
176                                             absl::Duration phase_duration) = 0;
177   // Publishes that the server rejected the device, also logs network stats and
178   // duration.
179   virtual void PublishRejected(const NetworkStats& network_stats,
180                                absl::Duration phase_duration) = 0;
181 
182   // Publishes that the device has finished checking in with the server and
183   // received URIs to download the plan and checkpoint with, but hasn't yet
184   // downloaded those, along with information how much data was transferred up
185   // to this point and how long that took.
186   virtual void PublishCheckinPlanUriReceived(const NetworkStats& network_stats,
187                                              absl::Duration phase_duration) = 0;
188   // Publishes that the device has finished checking in with the server, along
189   // with information how much data was transferred and how long that took.
190   virtual void PublishCheckinFinishedV2(const NetworkStats& network_stats,
191                                         absl::Duration phase_duration) = 0;
192   // Publishes that plan execution has started.
193   virtual void PublishComputationStarted() = 0;
194   // Publish that the task is invalid.
195   virtual void PublishComputationInvalidArgument(
196       absl::string_view error_message, const ExampleStats& example_stats,
197       const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
198   // Publish that an IO error occurred during computation.
199   virtual void PublishComputationIOError(absl::string_view error_message,
200                                          const ExampleStats& example_stats,
201                                          const NetworkStats& network_stats,
202                                          absl::Duration phase_duration) = 0;
203   // Publish that an example iterator error occurred during computation.
204   virtual void PublishComputationExampleIteratorError(
205       absl::string_view error_message, const ExampleStats& example_stats,
206       const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
207   // Publish that an tensorflow error occurred during computation.
208   virtual void PublishComputationTensorflowError(
209       absl::string_view error_message, const ExampleStats& example_stats,
210       const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
211   // Publish that the task computation is interrupted.
212   virtual void PublishComputationInterrupted(absl::string_view error_message,
213                                              const ExampleStats& example_stats,
214                                              const NetworkStats& network_stats,
215                                              absl::Duration phase_duration) = 0;
216   // Publishes an event that plan execution is complete.
217   virtual void PublishComputationCompleted(const ExampleStats& example_stats,
218                                            const NetworkStats& network_stats,
219                                            absl::Duration phase_duration) = 0;
220   // Publish that the client starts to upload result.
221   virtual void PublishResultUploadStarted() = 0;
222   // Publish that an IO error occurred during result upload.
223   virtual void PublishResultUploadIOError(absl::string_view error_message,
224                                           const NetworkStats& network_stats,
225                                           absl::Duration phase_duration) = 0;
226   // Publish that the client has interrupted the result upload.
227   virtual void PublishResultUploadClientInterrupted(
228       absl::string_view error_message, const NetworkStats& network_stats,
229       absl::Duration phase_duration) = 0;
230   // Publish hat the server has aborted the result upload.
231   virtual void PublishResultUploadServerAborted(
232       absl::string_view error_message, const NetworkStats& network_stats,
233       absl::Duration phase_duration) = 0;
234   // Publish that the result upload is completed.
235   virtual void PublishResultUploadCompleted(const NetworkStats& network_stats,
236                                             absl::Duration phase_duration) = 0;
237   // Publish that the task computation has failed, and the client starts to
238   // upload the failure to the server.
239   virtual void PublishFailureUploadStarted() = 0;
240   // Publish that an IO error occurred during failure upload.
241   virtual void PublishFailureUploadIOError(absl::string_view error_message,
242                                            const NetworkStats& network_stats,
243                                            absl::Duration phase_duration) = 0;
244   // Publish that the client has interrupted the failure upload.
245   virtual void PublishFailureUploadClientInterrupted(
246       absl::string_view error_message, const NetworkStats& network_stats,
247       absl::Duration phase_duration) = 0;
248   // Publish that the server has aborted the failure upload.
249   virtual void PublishFailureUploadServerAborted(
250       absl::string_view error_message, const NetworkStats& network_stats,
251       absl::Duration phase_duration) = 0;
252   // Publish that the failure upload completed.
253   virtual void PublishFailureUploadCompleted(const NetworkStats& network_stats,
254                                              absl::Duration phase_duration) = 0;
255 
256   // After calling this function, all subsequently published events will be
257   // annotated with the specified model_identifier. This value is typically
258   // provided by the federated server and used on events resulting from
259   // PublishEligibilityEvalCheckinFinished(), PublishCheckinFinished() and
260   // later.
261   //
262   // Note that this method may be called multiple times with different values,
263   // if over the course of a training session multiple models are executed.
264   virtual void SetModelIdentifier(const std::string& model_identifier) = 0;
265 
266   // Returns a pointer to a publisher which records secure aggregation protocol
267   // events.  The returned value must not be nullptr.
268   virtual SecAggEventPublisher* secagg_event_publisher() = 0;
269 };
270 
271 }  // namespace client
272 }  // namespace fcp
273 
274 #endif  // FCP_CLIENT_EVENT_PUBLISHER_H_
275