xref: /aosp_15_r20/external/federated-compute/fcp/client/flags.h (revision 14675a029014e728ec732f129a32e299b2da0601)
1 /*
2  * Copyright 2020 Google LLC
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef FCP_CLIENT_FLAGS_H_
17 #define FCP_CLIENT_FLAGS_H_
18 
#include <cstdint>
#include <string>
#include <vector>

#include "absl/status/status.h"
23 
24 namespace fcp {
25 namespace client {
26 
27 // A class for changing runtime behavior with "flags" - typically, server
28 // provided values.
29 class Flags {
30  public:
31   virtual ~Flags() = default;
32 
33   // The period of time in milliseconds between device condition checks. This is
34   // used during potentially long blocking calls such as TensorFlow or network
35   // I/O, as well as for throttling regular condition checks during plan
36   // execution (e.g. before fetching a new example).
37   virtual int64_t condition_polling_period_millis() const = 0;
38 
39   // The period of time in milliseconds allowed for TensorFlow execution to
40   // finish after it's been interrupted.
41   virtual int64_t tf_execution_teardown_grace_period_millis() const = 0;
42 
43   // The period of time in milliseconds allowed for TensorFlow execution to
44   // finish after the grace period. This allows us to decide if we want long
45   // running native execution to be forcibly resolved or continue indefinitely.
46   virtual int64_t tf_execution_teardown_extended_period_millis() const = 0;
47 
48   // The deadline in seconds for the gRPC channel used for communication
49   // between the client and server.
50   virtual int64_t grpc_channel_deadline_seconds() const = 0;
51 
52   // Whether to log the error message strings from TensorFlow exceptions.
53   virtual bool log_tensorflow_error_messages() const = 0;
54 
55   // Whether to enable recording to and querying from the Operational Statistics
56   // db.
enable_opstats()57   virtual bool enable_opstats() const { return true; }
58 
59   // The number of days for data to live in the OpStatsDb without update.
opstats_ttl_days()60   virtual int64_t opstats_ttl_days() const { return 30; }
61 
62   // The maximum size of the data stored by OpStatsDb.
opstats_db_size_limit_bytes()63   virtual int64_t opstats_db_size_limit_bytes() const {
64     return 1 * 1024 * 1024;
65   }
66 
67   // The retry delay to use when encountering a transient error during a
68   // training run before having received a RetryWindow from the server.
federated_training_transient_errors_retry_delay_secs()69   virtual int64_t federated_training_transient_errors_retry_delay_secs() const {
70     // 15 minutes
71     return 15 * 60;
72   }
73 
74   // The amount of jitter to apply when using the
75   // `federated_training_transient_errors_retry_delay_secs` flag. Must be a
76   // value between 0 and 1. E.g. a value of 0.2 means that retry delays will
77   // fall within [0.8 * target period, 1.2 * target period).
federated_training_transient_errors_retry_delay_jitter_percent()78   virtual float federated_training_transient_errors_retry_delay_jitter_percent()
79       const {
80     return 0.2;
81   }
82 
83   // The retry delay to use when encountering a permanent error during a
84   // training run (regardless of whether the client already received a
85   // RetryWindow from the server).
federated_training_permanent_errors_retry_delay_secs()86   virtual int64_t federated_training_permanent_errors_retry_delay_secs() const {
87     // 4 hours
88     return 4 * 60 * 60;
89   }
90 
91   // The amount of jitter to apply when using the
92   // `federated_training_permanent_errors_retry_delay_secs` flag. Must be a
93   // value between 0 and 1. E.g. a value of 0.2 means that retry delays will
94   // fall within [0.8 * target period, 1.2 * target period).
federated_training_permanent_errors_retry_delay_jitter_percent()95   virtual float federated_training_permanent_errors_retry_delay_jitter_percent()
96       const {
97     return 0.2;
98   }
99 
100   // The list of error codes that should be considered 'permanent'.
federated_training_permanent_error_codes()101   virtual std::vector<int32_t> federated_training_permanent_error_codes()
102       const {
103     return {
104         // The server returns NOT_FOUND if the client checks in with an unknown
105         // population name. While this can be resolved without any client
106         // changes by creating the population server-side, it is nevertheless
107         // wise to treat this as a 'permanent' error for which a longer
108         // RetryPeriod is used, because such temporary mismatches in
109         // client/server configuration are fairly common and otherwise cause
110         // clients to check in unnecessarily frequently.
111         static_cast<int32_t>(absl::StatusCode::kNotFound),
112         // INVALID_ARGUMENT generally indicates a client-side issue (e.g. a bug
113         // in the client's protocol implementation), which is unlikely to be
114         // resolved by merely retrying the request.
115         static_cast<int32_t>(absl::StatusCode::kInvalidArgument),
116         // UNIMPLEMENTED similarly could indicate a client-side issue, or a
117         // temporary server issue (e.g. a bug/missing feature implementation in
118         // the server). Either way, it is also unlikely to be resolved by merely
119         // retrying the request soon.
120         static_cast<int32_t>(absl::StatusCode::kUnimplemented)};
121   }
122 
123   // Whether use TFLite for training.
use_tflite_training()124   virtual bool use_tflite_training() const { return false; }
125 
126   // Whether to enable support for downloading plan/initial checkpoint resources
127   // via HTTP, while still using gRPC for the main protocol.
enable_grpc_with_http_resource_support()128   virtual bool enable_grpc_with_http_resource_support() const { return false; }
129 
130   // Whether to enable support for downloading eligibility eval plan/initial
131   // checkpoint resources via HTTP, while still using gRPC for the main
132   // protocol.
enable_grpc_with_eligibility_eval_http_resource_support()133   virtual bool enable_grpc_with_eligibility_eval_http_resource_support() const {
134     return false;
135   }
136 
137   // When true, TFLite interpreter will use dynamic memory allocation, and
138   // release the memory for tensors that are no longer needed.
ensure_dynamic_tensors_are_released()139   virtual bool ensure_dynamic_tensors_are_released() const { return true; }
140 
141   // When the value is above zero, any tensor size (bytes) above the threshold
142   // will be considered as a large tensor, and dynamic allocation is applied on
143   // them.
large_tensor_threshold_for_dynamic_allocation()144   virtual int32_t large_tensor_threshold_for_dynamic_allocation() const {
145     return 1000;
146   }
147 
148   // When true, the TFLite runtime graph-reordering optimization that clusters
149   // delegate nodes together is disabled.
disable_tflite_delegate_clustering()150   virtual bool disable_tflite_delegate_clustering() const { return false; }
151 
152   // When true, http request body won't be compressed.
disable_http_request_body_compression()153   virtual bool disable_http_request_body_compression() const { return false; }
154 
155   // When true, HTTP Federated Compute protocol is used.
use_http_federated_compute_protocol()156   virtual bool use_http_federated_compute_protocol() const { return false; }
157 
158   // When true, the client computes the task identity to pass in
159   // SelectorContext.
enable_computation_id()160   virtual bool enable_computation_id() const { return false; }
161 
162   // The waiting period for issuing cancellation requests before checking
163   // whether the client should be interrupted.
waiting_period_sec_for_cancellation()164   virtual int32_t waiting_period_sec_for_cancellation() const { return 10; }
165 
166   // If true, the client supports the Federated Select feature. If not
167   // then any Federated Select-specific example query will fail with an error
enable_federated_select()168   virtual bool enable_federated_select() const { return false; }
169 
170   // The max size in bytes of resources that the ResourceCache is allowed to
171   // store. If greater than 0, the client will attempt to cache resources sent
172   // by uri via the hybrid grpc-with-http-resources and the full http stack. If
173   // this value is reduced from some previous greater value, the cache dir will
174   // be reduced appropriately the next time it is initialized at the start of
175   // the next run.
max_resource_cache_size_bytes()176   virtual int64_t max_resource_cache_size_bytes() const { return 0; }
177 
178   // If true, an error during the initialization of the resource cache will
179   // publish a fatal initialization error instead of a nonfatal initialization
180   // error and halt execution.
resource_cache_initialization_error_is_fatal()181   virtual bool resource_cache_initialization_error_is_fatal() const {
182     return false;
183   }
184 
185   // The number of threads that TFLite interpreter will use.
num_threads_for_tflite()186   virtual int32_t num_threads_for_tflite() const { return 1; }
187 
188   // If true, Opstats initialization errors will be logged via
189   // PhaseLogger.LogNonfatalInitializationError(). Execution will still be
190   // allowed to continue with a no-op implementation like before.
log_opstats_initialization_errors()191   virtual bool log_opstats_initialization_errors() const { return false; }
192 
193   // If true, enables the last_successful_contribution option in the opstats
194   // selection criteria which returns an opstats entry for the last successful
195   // contribution for the currently executing task.
opstats_last_successful_contribution_criteria()196   virtual bool opstats_last_successful_contribution_criteria() const {
197     return false;
198   }
199 
200   // If true, enables support for the `TensorflowSpec.constant_inputs` field. If
201   // false, then the field will be ignored.
support_constant_tf_inputs()202   virtual bool support_constant_tf_inputs() const { return false; }
203 
204   // If true, enables an Example Query plan engine to be invoked for
205   // non-TensorFlow tasks.
enable_example_query_plan_engine()206   virtual bool enable_example_query_plan_engine() const { return false; }
207 
208   // If true, the HTTP federated protocol supports multiple task assignments.
http_protocol_supports_multiple_task_assignments()209   virtual bool http_protocol_supports_multiple_task_assignments() const {
210     return false;
211   }
212 };
213 }  // namespace client
214 }  // namespace fcp
215 
216 #endif  // FCP_CLIENT_FLAGS_H_
217