1 /* 2 * Copyright 2020 Google LLC 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef FCP_CLIENT_FLAGS_H_ 17 #define FCP_CLIENT_FLAGS_H_ 18 19 #include <cstdint> 20 #include <string> 21 22 #include "absl/status/status.h" 23 24 namespace fcp { 25 namespace client { 26 27 // A class for changing runtime behavior with "flags" - typically, server 28 // provided values. 29 class Flags { 30 public: 31 virtual ~Flags() = default; 32 33 // The period of time in milliseconds between device condition checks. This is 34 // used during potentially long blocking calls such as TensorFlow or network 35 // I/O, as well as for throttling regular condition checks during plan 36 // execution (e.g. before fetching a new example). 37 virtual int64_t condition_polling_period_millis() const = 0; 38 39 // The period of time in milliseconds allowed for TensorFlow execution to 40 // finish after it's been interrupted. 41 virtual int64_t tf_execution_teardown_grace_period_millis() const = 0; 42 43 // The period of time in milliseconds allowed for TensorFlow execution to 44 // finish after the grace period. This allows us to decide if we want long 45 // running native execution to be forcibly resolved or continue indefinitely. 46 virtual int64_t tf_execution_teardown_extended_period_millis() const = 0; 47 48 // The deadline in seconds for the gRPC channel used for communication 49 // between the client and server. 50 virtual int64_t grpc_channel_deadline_seconds() const = 0; 51 52 // Whether to log the error message strings from TensorFlow exceptions. 53 virtual bool log_tensorflow_error_messages() const = 0; 54 55 // Whether to enable recording to and querying from the Operational Statistics 56 // db. enable_opstats()57 virtual bool enable_opstats() const { return true; } 58 59 // The number of days for data to live in the OpStatsDb without update. opstats_ttl_days()60 virtual int64_t opstats_ttl_days() const { return 30; } 61 62 // The maximum size of the data stored by OpStatsDb. opstats_db_size_limit_bytes()63 virtual int64_t opstats_db_size_limit_bytes() const { 64 return 1 * 1024 * 1024; 65 } 66 67 // The retry delay to use when encountering a transient error during a 68 // training run before having received a RetryWindow from the server. federated_training_transient_errors_retry_delay_secs()69 virtual int64_t federated_training_transient_errors_retry_delay_secs() const { 70 // 15 minutes 71 return 15 * 60; 72 } 73 74 // The amount of jitter to apply when using the 75 // `federated_training_transient_errors_retry_delay_secs` flag. Must be a 76 // value between 0 and 1. E.g. a value of 0.2 means that retry delays will 77 // fall within [0.8 * target period, 1.2 * target period). federated_training_transient_errors_retry_delay_jitter_percent()78 virtual float federated_training_transient_errors_retry_delay_jitter_percent() 79 const { 80 return 0.2; 81 } 82 83 // The retry delay to use when encountering a permanent error during a 84 // training run (regardless of whether the client already received a 85 // RetryWindow from the server). federated_training_permanent_errors_retry_delay_secs()86 virtual int64_t federated_training_permanent_errors_retry_delay_secs() const { 87 // 4 hours 88 return 4 * 60 * 60; 89 } 90 91 // The amount of jitter to apply when using the 92 // `federated_training_permanent_errors_retry_delay_secs` flag. Must be a 93 // value between 0 and 1. E.g. a value of 0.2 means that retry delays will 94 // fall within [0.8 * target period, 1.2 * target period). federated_training_permanent_errors_retry_delay_jitter_percent()95 virtual float federated_training_permanent_errors_retry_delay_jitter_percent() 96 const { 97 return 0.2; 98 } 99 100 // The list of error codes that should be considered 'permanent'. federated_training_permanent_error_codes()101 virtual std::vector<int32_t> federated_training_permanent_error_codes() 102 const { 103 return { 104 // The server returns NOT_FOUND if the client checks in with an unknown 105 // population name. While this can be resolved without any client 106 // changes by creating the population server-side, it is nevertheless 107 // wise to treat this as a 'permanent' error for which a longer 108 // RetryPeriod is used, because such temporary mismatches in 109 // client/server configuration are fairly common and otherwise cause 110 // clients to check in unnecessarily frequently. 111 static_cast<int32_t>(absl::StatusCode::kNotFound), 112 // INVALID_ARGUMENT generally indicates a client-side issue (e.g. a bug 113 // in the client's protocol implementation), which is unlikely to be 114 // resolved by merely retrying the request. 115 static_cast<int32_t>(absl::StatusCode::kInvalidArgument), 116 // UNIMPLEMENTED similarly could indicate a client-side issue, or a 117 // temporary server issue (e.g. a bug/missing feature implementation in 118 // the server). Either way, it is also unlikely to be resolved by merely 119 // retrying the request soon. 120 static_cast<int32_t>(absl::StatusCode::kUnimplemented)}; 121 } 122 123 // Whether use TFLite for training. use_tflite_training()124 virtual bool use_tflite_training() const { return false; } 125 126 // Whether to enable support for downloading plan/initial checkpoint resources 127 // via HTTP, while still using gRPC for the main protocol. enable_grpc_with_http_resource_support()128 virtual bool enable_grpc_with_http_resource_support() const { return false; } 129 130 // Whether to enable support for downloading eligibility eval plan/initial 131 // checkpoint resources via HTTP, while still using gRPC for the main 132 // protocol. enable_grpc_with_eligibility_eval_http_resource_support()133 virtual bool enable_grpc_with_eligibility_eval_http_resource_support() const { 134 return false; 135 } 136 137 // When true, TFLite interpreter will use dynamic memory allocation, and 138 // release the memory for tensors that are no longer needed. ensure_dynamic_tensors_are_released()139 virtual bool ensure_dynamic_tensors_are_released() const { return true; } 140 141 // When the value is above zero, any tensor size (bytes) above the threshold 142 // will be considered as a large tensor, and dynamic allocation is applied on 143 // them. large_tensor_threshold_for_dynamic_allocation()144 virtual int32_t large_tensor_threshold_for_dynamic_allocation() const { 145 return 1000; 146 } 147 148 // When true, the TFLite runtime graph-reordering optimization that clusters 149 // delegate nodes together is disabled. disable_tflite_delegate_clustering()150 virtual bool disable_tflite_delegate_clustering() const { return false; } 151 152 // When true, http request body won't be compressed. disable_http_request_body_compression()153 virtual bool disable_http_request_body_compression() const { return false; } 154 155 // When true, HTTP Federated Compute protocol is used. use_http_federated_compute_protocol()156 virtual bool use_http_federated_compute_protocol() const { return false; } 157 158 // When true, the client computes the task identity to pass in 159 // SelectorContext. enable_computation_id()160 virtual bool enable_computation_id() const { return false; } 161 162 // The waiting period for issuing cancellation requests before checking 163 // whether the client should be interrupted. waiting_period_sec_for_cancellation()164 virtual int32_t waiting_period_sec_for_cancellation() const { return 10; } 165 166 // If true, the client supports the Federated Select feature. If not 167 // then any Federated Select-specific example query will fail with an error enable_federated_select()168 virtual bool enable_federated_select() const { return false; } 169 170 // The max size in bytes of resources that the ResourceCache is allowed to 171 // store. If greater than 0, the client will attempt to cache resources sent 172 // by uri via the hybrid grpc-with-http-resources and the full http stack. If 173 // this value is reduced from some previous greater value, the cache dir will 174 // be reduced appropriately the next time it is initialized at the start of 175 // the next run. max_resource_cache_size_bytes()176 virtual int64_t max_resource_cache_size_bytes() const { return 0; } 177 178 // If true, an error during the initialization of the resource cache will 179 // publish a fatal initialization error instead of a nonfatal initialization 180 // error and halt execution. resource_cache_initialization_error_is_fatal()181 virtual bool resource_cache_initialization_error_is_fatal() const { 182 return false; 183 } 184 185 // The number of threads that TFLite interpreter will use. num_threads_for_tflite()186 virtual int32_t num_threads_for_tflite() const { return 1; } 187 188 // If true, Opstats initialization errors will be logged via 189 // PhaseLogger.LogNonfatalInitializationError(). Execution will still be 190 // allowed to continue with a no-op implementation like before. log_opstats_initialization_errors()191 virtual bool log_opstats_initialization_errors() const { return false; } 192 193 // If true, enables the last_successful_contribution option in the opstats 194 // selection criteria which returns an opstats entry for the last successful 195 // contribution for the currently executing task. opstats_last_successful_contribution_criteria()196 virtual bool opstats_last_successful_contribution_criteria() const { 197 return false; 198 } 199 200 // If true, enables support for the `TensorflowSpec.constant_inputs` field. If 201 // false, then the field will be ignored. support_constant_tf_inputs()202 virtual bool support_constant_tf_inputs() const { return false; } 203 204 // If true, enables an Example Query plan engine to be invoked for 205 // non-TensorFlow tasks. enable_example_query_plan_engine()206 virtual bool enable_example_query_plan_engine() const { return false; } 207 208 // If true, the HTTP federated protocol supports multiple task assignments. http_protocol_supports_multiple_task_assignments()209 virtual bool http_protocol_supports_multiple_task_assignments() const { 210 return false; 211 } 212 }; 213 } // namespace client 214 } // namespace fcp 215 216 #endif // FCP_CLIENT_FLAGS_H_ 217