/*
 * Copyright 2019 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef FCP_CLIENT_EVENT_PUBLISHER_H_
#define FCP_CLIENT_EVENT_PUBLISHER_H_

#include <cstdint>
#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "absl/time/time.h"
#include "fcp/client/stats.h"

namespace fcp {
namespace client {

// Forward declaration: this header only uses a pointer to the type (see
// EventPublisher::secagg_event_publisher()).
class SecAggEventPublisher;

// An interface for publishing events that occur during training. This is a
// separate interface from LogManager because the reported events will typically
// be both reported to a cloud monitoring backend and to the Federated server as
// part of publishing results.
//
// Every method is pure virtual. All Publish*() methods and SetModelIdentifier()
// return void, so this interface does not report a status back to the caller.
// NOTE(review): this comment previously stated "All methods in here either
// succeed with OK, or fail with INVALID_ARGUMENT", which does not match the
// void return types declared below -- confirm whether that contract is still
// meant to apply to implementations.
class EventPublisher {
 public:
  virtual ~EventPublisher() = default;

  // Publishes that the device is about to issue an eligibility eval check in
  // with the server.
  virtual void PublishEligibilityEvalCheckin() = 0;

  // Publishes that the device has finished its eligibility eval checkin with
  // the server, and received the URIs to download the eligibility eval plan
  // with, but hasn't actually downloaded them yet, along with information on
  // how much data was transferred up to this point and how long that took.
  virtual void PublishEligibilityEvalPlanUriReceived(
      const NetworkStats& network_stats, absl::Duration phase_duration) = 0;

  // Publishes that the device has finished its eligibility eval checkin with
  // the server, and received an eligibility eval plan, along with information
  // on how much data was transferred and how long that took.
  virtual void PublishEligibilityEvalPlanReceived(
      const NetworkStats& network_stats, absl::Duration phase_duration) = 0;

  // Publishes that the server did not return an eligibility eval task to the
  // client, along with information on how much data was transferred and how
  // long that took.
  virtual void PublishEligibilityEvalNotConfigured(
      const NetworkStats& network_stats, absl::Duration phase_duration) = 0;

  // Publishes that the server rejected the device's eligibility eval checkin,
  // along with information on how much data was downloaded and how long that
  // took.
  virtual void PublishEligibilityEvalRejected(
      const NetworkStats& network_stats, absl::Duration phase_duration) = 0;

  // Publishes that the device is about to check in with the server.
  virtual void PublishCheckin() = 0;

  // Publishes that the device has finished checking in with the server, along
  // with information on how much data was downloaded and how long that took.
  virtual void PublishCheckinFinished(const NetworkStats& network_stats,
                                      absl::Duration phase_duration) = 0;

  // Publishes that the server rejected the device. An overload that also
  // takes network stats and a duration is declared further below.
  virtual void PublishRejected() = 0;

  // Publishes that the device is about to report the results of a federated
  // computation to the server.
  // `report_size_bytes` is presumably the size of the report being uploaded,
  // in bytes -- confirm against callers.
  virtual void PublishReportStarted(int64_t report_size_bytes) = 0;

  // Publishes that the device has successfully reported its results to the
  // server and received instructions on when to reconnect.
  virtual void PublishReportFinished(const NetworkStats& network_stats,
                                     absl::Duration report_duration) = 0;

  // Publishes that plan execution has started.
  virtual void PublishPlanExecutionStarted() = 0;

  // Publishes a TensorFlow error that happened in the given ClientExecution.
  // NOTE(review): no ClientExecution is passed to this method; only an example
  // count and an error message are -- confirm what "the given ClientExecution"
  // refers to (same applies to the three methods below).
  virtual void PublishTensorFlowError(int example_count,
                                      absl::string_view error_message) = 0;

  // Publishes an I/O error (e.g. disk, network) that happened in the given
  // ClientExecution.
  virtual void PublishIoError(absl::string_view error_message) = 0;

  // Publishes an ExampleSelector error from the given ClientExecution.
  virtual void PublishExampleSelectorError(int example_count,
                                           absl::string_view error_message) = 0;

  // Publishes an interruption event for the given client execution.
  virtual void PublishInterruption(const ExampleStats& example_stats,
                                   absl::Time start_time) = 0;

  // Publishes an event that plan execution is complete.
  virtual void PublishPlanCompleted(const ExampleStats& example_stats,
                                    absl::Time start_time) = 0;
  // Publishes that the task didn't start.
  virtual void PublishTaskNotStarted(absl::string_view error_message) = 0;

  // Publishes that the federated compute runtime failed to initialize a
  // noncritical component, but execution continued.
  virtual void PublishNonfatalInitializationError(
      absl::string_view error_message) = 0;
  // Publishes that the federated compute runtime failed to initialize a
  // component, and execution was halted.
  virtual void PublishFatalInitializationError(
      absl::string_view error_message) = 0;

  // Publishes that an IO error was encountered during eligibility eval
  // check-in.
  virtual void PublishEligibilityEvalCheckinIoError(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the eligibility eval check-in was interrupted by the
  // client.
  virtual void PublishEligibilityEvalCheckinClientInterrupted(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the eligibility eval check-in was aborted by the server.
  virtual void PublishEligibilityEvalCheckinServerAborted(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the eligibility eval check-in returned an invalid payload.
  virtual void PublishEligibilityEvalCheckinErrorInvalidPayload(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that an eligibility eval task has started computation.
  virtual void PublishEligibilityEvalComputationStarted() = 0;
  // Publishes that the eligibility eval task is invalid.
  virtual void PublishEligibilityEvalComputationInvalidArgument(
      absl::string_view error_message, const ExampleStats& example_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that an example iterator error occurred during the eligibility
  // eval task.
  virtual void PublishEligibilityEvalComputationExampleIteratorError(
      absl::string_view error_message, const ExampleStats& example_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that a TensorFlow error occurred during the eligibility eval
  // task.
  virtual void PublishEligibilityEvalComputationTensorflowError(
      absl::string_view error_message, const ExampleStats& example_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the client has interrupted the eligibility eval task.
  virtual void PublishEligibilityEvalComputationInterrupted(
      absl::string_view error_message, const ExampleStats& example_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that an eligibility eval task finished.
  virtual void PublishEligibilityEvalComputationCompleted(
      const ExampleStats& example_stats, absl::Duration phase_duration) = 0;
  // Publishes that an IO error occurred during regular check-in.
  virtual void PublishCheckinIoError(absl::string_view error_message,
                                     const NetworkStats& network_stats,
                                     absl::Duration phase_duration) = 0;
  // Publishes that the client interrupted the regular check-in.
  virtual void PublishCheckinClientInterrupted(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the server aborted the regular check-in.
  virtual void PublishCheckinServerAborted(absl::string_view error_message,
                                           const NetworkStats& network_stats,
                                           absl::Duration phase_duration) = 0;
  // Publishes that an invalid payload was downloaded from the regular
  // check-in.
  virtual void PublishCheckinInvalidPayload(absl::string_view error_message,
                                            const NetworkStats& network_stats,
                                            absl::Duration phase_duration) = 0;
  // Publishes that the server rejected the device, also logs network stats and
  // duration. Overload of the parameterless PublishRejected() declared above.
  virtual void PublishRejected(const NetworkStats& network_stats,
                               absl::Duration phase_duration) = 0;

  // Publishes that the device has finished checking in with the server and
  // received URIs to download the plan and checkpoint with, but hasn't yet
  // downloaded those, along with information on how much data was transferred
  // up to this point and how long that took.
  virtual void PublishCheckinPlanUriReceived(const NetworkStats& network_stats,
                                             absl::Duration phase_duration) = 0;
  // Publishes that the device has finished checking in with the server, along
  // with information on how much data was transferred and how long that took.
  // NOTE(review): the signature is identical to PublishCheckinFinished();
  // the intended difference between the two is not visible in this header.
  virtual void PublishCheckinFinishedV2(const NetworkStats& network_stats,
                                        absl::Duration phase_duration) = 0;
  // Publishes that plan execution has started.
  virtual void PublishComputationStarted() = 0;
  // Publishes that the task is invalid.
  virtual void PublishComputationInvalidArgument(
      absl::string_view error_message, const ExampleStats& example_stats,
      const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
  // Publishes that an IO error occurred during computation.
  virtual void PublishComputationIOError(absl::string_view error_message,
                                         const ExampleStats& example_stats,
                                         const NetworkStats& network_stats,
                                         absl::Duration phase_duration) = 0;
  // Publishes that an example iterator error occurred during computation.
  virtual void PublishComputationExampleIteratorError(
      absl::string_view error_message, const ExampleStats& example_stats,
      const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
  // Publishes that a TensorFlow error occurred during computation.
  virtual void PublishComputationTensorflowError(
      absl::string_view error_message, const ExampleStats& example_stats,
      const NetworkStats& network_stats, absl::Duration phase_duration) = 0;
  // Publishes that the task computation was interrupted.
  virtual void PublishComputationInterrupted(absl::string_view error_message,
                                             const ExampleStats& example_stats,
                                             const NetworkStats& network_stats,
                                             absl::Duration phase_duration) = 0;
  // Publishes an event that plan execution is complete.
  virtual void PublishComputationCompleted(const ExampleStats& example_stats,
                                           const NetworkStats& network_stats,
                                           absl::Duration phase_duration) = 0;
  // Publishes that the client has started to upload the result.
  virtual void PublishResultUploadStarted() = 0;
  // Publishes that an IO error occurred during result upload.
  virtual void PublishResultUploadIOError(absl::string_view error_message,
                                          const NetworkStats& network_stats,
                                          absl::Duration phase_duration) = 0;
  // Publishes that the client has interrupted the result upload.
  virtual void PublishResultUploadClientInterrupted(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the server has aborted the result upload.
  virtual void PublishResultUploadServerAborted(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the result upload is completed.
  virtual void PublishResultUploadCompleted(const NetworkStats& network_stats,
                                            absl::Duration phase_duration) = 0;
  // Publishes that the task computation has failed, and the client starts to
  // upload the failure to the server.
  virtual void PublishFailureUploadStarted() = 0;
  // Publishes that an IO error occurred during failure upload.
  virtual void PublishFailureUploadIOError(absl::string_view error_message,
                                           const NetworkStats& network_stats,
                                           absl::Duration phase_duration) = 0;
  // Publishes that the client has interrupted the failure upload.
  virtual void PublishFailureUploadClientInterrupted(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the server has aborted the failure upload.
  virtual void PublishFailureUploadServerAborted(
      absl::string_view error_message, const NetworkStats& network_stats,
      absl::Duration phase_duration) = 0;
  // Publishes that the failure upload completed.
  virtual void PublishFailureUploadCompleted(const NetworkStats& network_stats,
                                             absl::Duration phase_duration) = 0;

  // After calling this function, all subsequently published events will be
  // annotated with the specified model_identifier. This value is typically
  // provided by the federated server and used on events resulting from
  // PublishEligibilityEvalCheckinFinished(), PublishCheckinFinished() and
  // later.
  // NOTE(review): no method named PublishEligibilityEvalCheckinFinished() is
  // declared in this interface; presumably
  // PublishEligibilityEvalPlanReceived() is meant -- confirm.
  //
  // Note that this method may be called multiple times with different values,
  // if over the course of a training session multiple models are executed.
  virtual void SetModelIdentifier(const std::string& model_identifier) = 0;

  // Returns a pointer to a publisher which records secure aggregation protocol
  // events. The returned value must not be nullptr.
  // NOTE(review): ownership and lifetime of the returned pointer are not
  // expressed by this signature -- presumably it is non-owning; confirm.
  virtual SecAggEventPublisher* secagg_event_publisher() = 0;
};

}  // namespace client
}  // namespace fcp

#endif  // FCP_CLIENT_EVENT_PUBLISHER_H_