1 // Copyright 2017 gRPC authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include <chrono>
16 #include <string>
17 #include <thread>
18 #include <vector>
19
20 #include <gmock/gmock.h>
21 #include <gtest/gtest.h>
22
23 #include "src/core/client_channel/backup_poller.h"
24 #include "src/core/lib/config/config_vars.h"
25 #include "src/proto/grpc/testing/xds/v3/cluster.grpc.pb.h"
26 #include "src/proto/grpc/testing/xds/v3/fault.grpc.pb.h"
27 #include "src/proto/grpc/testing/xds/v3/outlier_detection.grpc.pb.h"
28 #include "src/proto/grpc/testing/xds/v3/router.grpc.pb.h"
29 #include "test/core/util/resolve_localhost_ip46.h"
30 #include "test/cpp/end2end/xds/xds_end2end_test_lib.h"
31
32 namespace grpc {
33 namespace testing {
34 namespace {
35
36 class OutlierDetectionTest : public XdsEnd2endTest {
37 protected:
CreateMetadataValueThatHashesToBackend(int index)38 std::string CreateMetadataValueThatHashesToBackend(int index) {
39 return absl::StrCat(grpc_core::LocalIp(), ":", backends_[index]->port(),
40 "_0");
41 }
42 };
43
44 INSTANTIATE_TEST_SUITE_P(XdsTest, OutlierDetectionTest,
45 ::testing::Values(XdsTestType()), &XdsTestType::Name);
// TODO(donnadionne): add a non-xDS version of this test in a new file,
// test/cpp/end2end/outlier_detection_end2end_test.cc
48
49 // Tests SuccessRateEjectionAndUnejection:
50 // 1. Use ring hash policy that hashes using a header value to ensure rpcs
51 // go to all backends.
52 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
53 // interval to pass.
54 // 3. We should skip exactly 1 backend due to ejection and all the loads
55 // sticky to that backend should go to 1 other backend.
56 // 4. Let the ejection period pass and verify we can go back to both backends
57 // after the uneject.
TEST_P(OutlierDetectionTest, SuccessRateEjectionAndUnejection) {
  using MetadataList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // CDS: ring hash cluster with success-rate outlier detection. The
  // parameters are chosen so that any failure leads to an ejection with 100%
  // probability, which keeps the test from being flaky.
  auto cds_cluster = default_cluster_;
  cds_cluster.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_success_rate_stdev_factor()->set_value(100);
  od_config->mutable_enforcing_success_rate()->set_value(100);
  od_config->mutable_success_rate_minimum_hosts()->set_value(1);
  od_config->mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_cluster);
  // RDS: hash on the "address_hash" request header.
  auto route_config = default_route_config_;
  auto* first_route =
      route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* header_hash = first_route->mutable_route()->add_hash_policy();
  header_hash->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   route_config);
  // EDS: one locality containing all backends.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each RPC flavor carries a header value equal to the value used for a
  // backend's ring entry, so it always hashes to that backend.
  MetadataList backend0_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  MetadataList backend1_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto to_backend0 = RpcOptions().set_metadata(backend0_md);
  const auto to_backend1 = RpcOptions().set_metadata(std::move(backend1_md));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend0);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend1);
  // Make one RPC aimed at backend 0 fail. With enforcement at 100%, backend 0
  // is ejected the next time the ejection timer fires.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_md))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  // Ejection is observed as backend-0 traffic showing up on backend 1. This
  // should take no more than one ejection timer interval.
  const auto ejection_wait = WaitForBackendOptions().set_timeout_ms(
      3000 * grpc_test_slowdown_factor());
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, ejection_wait,
                 to_backend0);
  // Unejection is observed as backend-0 traffic returning to backend 0.
  const auto unejection_wait = WaitForBackendOptions().set_timeout_ms(
      3000 * grpc_test_slowdown_factor());
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, unejection_wait,
                 to_backend0);
}
117
118 // We don't eject more than max_ejection_percent (default 10%) of the backends
119 // beyond the first one.
TEST_P(OutlierDetectionTest, SuccessRateMaxPercent) {
  CreateAndStartBackends(4);
  auto cluster = default_cluster_;
  cluster.set_lb_policy(Cluster::RING_HASH);
  // Set up success-rate outlier detection so that any failure causes a
  // potential ejection with a probability of 100% (to eliminate flakiness of
  // the test).
  auto* outlier_detection = cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   outlier_detection->mutable_interval());
  outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
  outlier_detection->mutable_enforcing_success_rate()->set_value(100);
  outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
  outlier_detection->mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cluster);
  // Hash RPCs on the "address_hash" header.
  auto new_route_config = default_route_config_;
  auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* hash_policy = route->mutable_route()->add_hash_policy();
  hash_policy->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   new_route_config);
  EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
  // Note each type of RPC will contain a header value that will always be
  // hashed to a specific backend as the header value matches the value used
  // to create the entry in the ring.
  std::vector<std::pair<std::string, std::string>> metadata = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  std::vector<std::pair<std::string, std::string>> metadata1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  std::vector<std::pair<std::string, std::string>> metadata2 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
  std::vector<std::pair<std::string, std::string>> metadata3 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
  const auto rpc_options = RpcOptions().set_metadata(metadata);
  const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
  const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
  const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options1);
  WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options2);
  WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options3);
  // Cause 2 errors (one aimed at backend 0, one at backend 1) and wait until
  // one ejection happens.
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      RpcOptions()
                          .set_metadata(std::move(metadata))
                          .set_server_expected_error(StatusCode::CANCELLED));
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      RpcOptions()
                          .set_metadata(std::move(metadata1))
                          .set_server_expected_error(StatusCode::CANCELLED));
  absl::Time deadline =
      absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
  while (true) {
    ResetBackendCounters();
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
    // As soon as backends 0 and 1 are no longer both seen, one of them has
    // stopped receiving its traffic, i.e. it has been ejected.
    if (!SeenAllBackends(0, 2)) {
      break;
    }
    // Read the clock once so the reported failure and the loop exit always
    // agree with each other.
    const absl::Time now = absl::Now();
    EXPECT_LE(now, deadline);
    if (now >= deadline) break;
  }
  // 1 backend should be ejected, with its traffic picked up by another
  // backend. No other backend should be ejected.
  ResetBackendCounters();
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
  size_t empty_load_backend_count = 0;
  size_t double_load_backend_count = 0;
  size_t regular_load_backend_count = 0;
  for (size_t i = 0; i < backends_.size(); ++i) {
    const auto request_count = backends_[i]->backend_service()->request_count();
    if (request_count == 0) {
      ++empty_load_backend_count;
    } else if (request_count == 200) {
      ++double_load_backend_count;
    } else if (request_count == 100) {
      ++regular_load_backend_count;
    } else {
      // Any other count means traffic was split in an unexpected way. Report
      // it here with the offending backend instead of silently ignoring it.
      ADD_FAILURE() << "backend " << i << " received an unexpected number of "
                    << "requests: " << request_count;
    }
  }
  EXPECT_EQ(1, empty_load_backend_count);
  EXPECT_EQ(1, double_load_backend_count);
  EXPECT_EQ(2, regular_load_backend_count);
}
212
213 // Success rate stdev_factor is honored, a higher value would ensure ejection
214 // does not occur.
TEST_P(OutlierDetectionTest, SuccessRateStdevFactor) {
  using MetadataList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // CDS: ring hash cluster with success-rate outlier detection.
  auto cds_cluster = default_cluster_;
  cds_cluster.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_base_ejection_time());
  // SuccessRateEjectionAndUnejection uses a stdev factor of 100 and expects an
  // ejection. This test is meant to differ from it only in this one value:
  // with the larger factor of 1000, no ejection is expected.
  od_config->mutable_success_rate_stdev_factor()->set_value(1000);
  od_config->mutable_enforcing_success_rate()->set_value(100);
  od_config->mutable_success_rate_minimum_hosts()->set_value(1);
  od_config->mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_cluster);
  // RDS: hash on the "address_hash" request header.
  auto route_config = default_route_config_;
  auto* first_route =
      route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* header_hash = first_route->mutable_route()->add_hash_policy();
  header_hash->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   route_config);
  // EDS: one locality containing all backends.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each RPC flavor carries a header value equal to the value used for a
  // backend's ring entry, so it always hashes to that backend.
  MetadataList backend0_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  MetadataList backend1_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto to_backend0 = RpcOptions().set_metadata(backend0_md);
  const auto to_backend1 = RpcOptions().set_metadata(std::move(backend1_md));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend0);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend1);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_md))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // One backend experienced a failure, but because the stdev factor is high no
  // backend is treated as an outlier. Both backends must still receive the
  // RPCs intended for them.
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend0);
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend1);
  EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
  EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
}
275
// Success rate enforcement percentage is honored: setting it to 0 guarantees
// that the randomized number between 1 and 100 will always be greater than it,
// so nothing will be ejected.
TEST_P(OutlierDetectionTest, SuccessRateEnforcementPercentage) {
  using MetadataList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // CDS: ring hash cluster with success-rate outlier detection.
  auto cds_cluster = default_cluster_;
  cds_cluster.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_success_rate_stdev_factor()->set_value(100);
  // SuccessRateEjectionAndUnejection uses an enforcement percentage of 100 and
  // expects a guaranteed ejection. This test is meant to differ from it only
  // in this one value: with 0, an ejection must never happen.
  od_config->mutable_enforcing_success_rate()->set_value(0);
  od_config->mutable_success_rate_minimum_hosts()->set_value(1);
  od_config->mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_cluster);
  // RDS: hash on the "address_hash" request header.
  auto route_config = default_route_config_;
  auto* first_route =
      route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* header_hash = first_route->mutable_route()->add_hash_policy();
  header_hash->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   route_config);
  // EDS: one locality containing all backends.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each RPC flavor carries a header value equal to the value used for a
  // backend's ring entry, so it always hashes to that backend.
  MetadataList backend0_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  MetadataList backend1_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto to_backend0 = RpcOptions().set_metadata(backend0_md);
  const auto to_backend1 = RpcOptions().set_metadata(std::move(backend1_md));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend0);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend1);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_md))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // One backend experienced a failure, but with an enforcement percentage of 0
  // no backend is ejected. Both backends must still receive the RPCs intended
  // for them.
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend0);
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend1);
  EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
  EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
}
335
336 // Success rate does not eject if there are less than minimum_hosts backends
337 // Set success_rate_minimum_hosts to 3 when we only have 2 backends
TEST_P(OutlierDetectionTest, SuccessRateMinimumHosts) {
  using MetadataList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // CDS: ring hash cluster with success-rate outlier detection. Apart from
  // minimum_hosts, the parameters are chosen so that any failure would lead to
  // an ejection with 100% probability (to avoid flakiness).
  auto cds_cluster = default_cluster_;
  cds_cluster.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  od_config->mutable_success_rate_stdev_factor()->set_value(100);
  od_config->mutable_enforcing_success_rate()->set_value(100);
  // Require 3 hosts although only 2 backends exist.
  // SuccessRateEjectionAndUnejection uses 1 here and expects an ejection; this
  // test is meant to differ from it only in this one value, which is expected
  // to prevent the ejection.
  od_config->mutable_success_rate_minimum_hosts()->set_value(3);
  od_config->mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_cluster);
  // RDS: hash on the "address_hash" request header.
  auto route_config = default_route_config_;
  auto* first_route =
      route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* header_hash = first_route->mutable_route()->add_hash_policy();
  header_hash->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   route_config);
  // EDS: one locality containing all backends.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each RPC flavor carries a header value equal to the value used for a
  // backend's ring entry, so it always hashes to that backend.
  MetadataList backend0_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  MetadataList backend1_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto to_backend0 = RpcOptions().set_metadata(backend0_md);
  const auto to_backend1 = RpcOptions().set_metadata(std::move(backend1_md));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend0);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend1);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_md))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // No backend is ejected, so all traffic must still reach the backend it was
  // originally hashed to.
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend0);
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend1);
  EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
  EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
}
394
395 // Success rate does not eject if there are less than request_volume requests
396 // Set success_rate_request_volume to 4 when we only send 3 RPC in the
397 // interval.
TEST_P(OutlierDetectionTest, SuccessRateRequestVolume) {
  using MetadataList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // CDS: ring hash cluster with success-rate outlier detection. Apart from
  // request_volume, the parameters are chosen so that any failure would lead
  // to an ejection with 100% probability (to avoid flakiness).
  auto cds_cluster = default_cluster_;
  cds_cluster.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  od_config->mutable_success_rate_stdev_factor()->set_value(100);
  od_config->mutable_enforcing_success_rate()->set_value(100);
  od_config->mutable_success_rate_minimum_hosts()->set_value(1);
  // Require a request volume of 4 while, per the test's description, only 3
  // RPCs are sent in the interval. SuccessRateEjectionAndUnejection uses 1
  // here and expects an ejection; this test is meant to differ from it only in
  // this one value, which is expected to prevent the ejection.
  od_config->mutable_success_rate_request_volume()->set_value(4);
  balancer_->ads_service()->SetCdsResource(cds_cluster);
  // RDS: hash on the "address_hash" request header.
  auto route_config = default_route_config_;
  auto* first_route =
      route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* header_hash = first_route->mutable_route()->add_hash_policy();
  header_hash->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   route_config);
  // EDS: one locality containing all backends.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each RPC flavor carries a header value equal to the value used for a
  // backend's ring entry, so it always hashes to that backend.
  MetadataList backend0_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  MetadataList backend1_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto to_backend0 = RpcOptions().set_metadata(backend0_md);
  const auto to_backend1 = RpcOptions().set_metadata(std::move(backend1_md));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend0);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend1);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_md))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // No backend is ejected, so all traffic must still reach the backend it was
  // originally hashed to.
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend0);
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend1);
  EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
  EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
}
455
456 // Tests FailurePercentageEjectionAndUnejection:
457 // 1. Use ring hash policy that hashes using a header value to ensure RPCs
458 // go to all backends.
459 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
460 // interval to pass.
461 // 3. We should skip exactly 1 backend due to ejection and all the loads
462 // sticky to that backend should go to 1 other backend.
// 4. Let the ejection period pass and verify that traffic will again go to
// both backends as we have unejected the backend.
TEST_P(OutlierDetectionTest, FailurePercentageEjectionAndUnejection) {
  using MetadataList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // CDS: ring hash cluster with failure-percentage outlier detection. The
  // parameters are chosen so that any failure leads to an ejection with 100%
  // probability, which keeps the test from being flaky.
  auto cds_cluster = default_cluster_;
  cds_cluster.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(3),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_cluster);
  // RDS: hash on the "address_hash" request header.
  auto route_config = default_route_config_;
  auto* first_route =
      route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* header_hash = first_route->mutable_route()->add_hash_policy();
  header_hash->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   route_config);
  // EDS: one locality containing all backends.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each RPC flavor carries a header value equal to the value used for a
  // backend's ring entry, so it always hashes to that backend.
  MetadataList backend0_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  MetadataList backend1_md = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto to_backend0 = RpcOptions().set_metadata(backend0_md);
  const auto to_backend1 = RpcOptions().set_metadata(std::move(backend1_md));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend0);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), to_backend1);
  // Make one RPC aimed at backend 0 fail, then wait until traffic aimed at
  // backend 0 starts arriving at backend 1.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_md))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto ejection_wait = WaitForBackendOptions().set_timeout_ms(
      3000 * grpc_test_slowdown_factor());
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, ejection_wait,
                 to_backend0);
  // One backend is ejected, so all traffic aimed at it should now go to the
  // other backend. That the ejection happens consistently shows that the
  // failure-percentage enforcement percentage of 100% is honored.
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend0);
  EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
  // Unejection is observed as backend-0 traffic returning to backend 0.
  const auto unejection_wait = WaitForBackendOptions().set_timeout_ms(
      30000 * grpc_test_slowdown_factor());
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, unejection_wait,
                 to_backend0);
  // RPCs must once again go to the backends they hash to.
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend0);
  CheckRpcSendOk(DEBUG_LOCATION, 100, to_backend1);
  EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
  EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
}
531
532 // We don't eject more than max_ejection_percent (default 10%) of the backends
533 // beyond the first one.
TEST_P(OutlierDetectionTest, FailurePercentageMaxPercentage) {
  CreateAndStartBackends(4);
  auto cluster = default_cluster_;
  cluster.set_lb_policy(Cluster::RING_HASH);
  // Set up failure-percentage outlier detection so that any failure causes a
  // potential ejection with a probability of 100% (to eliminate flakiness of
  // the test).
  auto* outlier_detection = cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   outlier_detection->mutable_interval());
  outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
  outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
  outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
  outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cluster);
  // Hash RPCs on the "address_hash" header.
  auto new_route_config = default_route_config_;
  auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* hash_policy = route->mutable_route()->add_hash_policy();
  hash_policy->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   new_route_config);
  EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
  // Note each type of RPC will contain a header value that will always be
  // hashed to a specific backend as the header value matches the value used
  // to create the entry in the ring.
  std::vector<std::pair<std::string, std::string>> metadata = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  std::vector<std::pair<std::string, std::string>> metadata1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  std::vector<std::pair<std::string, std::string>> metadata2 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
  std::vector<std::pair<std::string, std::string>> metadata3 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
  const auto rpc_options = RpcOptions().set_metadata(metadata);
  const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
  const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
  const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options1);
  WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options2);
  WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options3);
  // Cause 2 errors (one aimed at backend 0, one at backend 1) and wait until
  // one ejection happens.
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      RpcOptions()
                          .set_metadata(std::move(metadata))
                          .set_server_expected_error(StatusCode::CANCELLED));
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      RpcOptions()
                          .set_metadata(std::move(metadata1))
                          .set_server_expected_error(StatusCode::CANCELLED));
  absl::Time deadline =
      absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
  while (true) {
    ResetBackendCounters();
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
    // As soon as backends 0 and 1 are no longer both seen, one of them has
    // stopped receiving its traffic, i.e. it has been ejected.
    if (!SeenAllBackends(0, 2)) {
      break;
    }
    // Read the clock once so the reported failure and the loop exit always
    // agree with each other.
    const absl::Time now = absl::Now();
    EXPECT_LE(now, deadline);
    if (now >= deadline) break;
  }
  // 1 backend should be ejected, with its traffic picked up by another
  // backend. No other backend should be ejected.
  ResetBackendCounters();
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
  size_t empty_load_backend_count = 0;
  size_t double_load_backend_count = 0;
  size_t regular_load_backend_count = 0;
  for (size_t i = 0; i < backends_.size(); ++i) {
    const auto request_count = backends_[i]->backend_service()->request_count();
    if (request_count == 0) {
      ++empty_load_backend_count;
    } else if (request_count == 200) {
      ++double_load_backend_count;
    } else if (request_count == 100) {
      ++regular_load_backend_count;
    } else {
      // Any other count means traffic was split in an unexpected way. Report
      // it here with the offending backend instead of silently ignoring it.
      ADD_FAILURE() << "backend " << i << " received an unexpected number of "
                    << "requests: " << request_count;
    }
  }
  EXPECT_EQ(1, empty_load_backend_count);
  EXPECT_EQ(1, double_load_backend_count);
  EXPECT_EQ(2, regular_load_backend_count);
}
626
627 // Failure percentage threshold is honored, a higher value would ensure ejection
628 // does not occur
TEST_P(OutlierDetectionTest,FailurePercentageThreshold)629 TEST_P(OutlierDetectionTest, FailurePercentageThreshold) {
630 CreateAndStartBackends(2);
631 auto cluster = default_cluster_;
632 cluster.set_lb_policy(Cluster::RING_HASH);
633 auto* outlier_detection = cluster.mutable_outlier_detection();
634 SetProtoDuration(grpc_core::Duration::Seconds(1),
635 outlier_detection->mutable_interval());
636 SetProtoDuration(grpc_core::Duration::Seconds(1),
637 outlier_detection->mutable_base_ejection_time());
638 // Setup outlier failure percentage parameter to 50
639 // Note this parameter is the only difference between this test and
640 // FailurePercentageEjectionAndUnejection (ejection portion, value set to 0)
641 // and this one value changes means the difference between not ejecting in
642 // this test and ejecting in the other test.
643 outlier_detection->mutable_failure_percentage_threshold()->set_value(50);
644 outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
645 outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
646 outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
647 balancer_->ads_service()->SetCdsResource(cluster);
648 auto new_route_config = default_route_config_;
649 auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
650 auto* hash_policy = route->mutable_route()->add_hash_policy();
651 hash_policy->mutable_header()->set_header_name("address_hash");
652 SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
653 new_route_config);
654 EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
655 balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
656 // Note each type of RPC will contain a header value that will always be
657 // hashed to a specific backend as the header value matches the value used
658 // to create the entry in the ring.
659 std::vector<std::pair<std::string, std::string>> metadata = {
660 {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
661 std::vector<std::pair<std::string, std::string>> metadata1 = {
662 {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
663 const auto rpc_options = RpcOptions().set_metadata(metadata);
664 const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
665 WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
666 WaitForBackendOptions(), rpc_options);
667 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
668 WaitForBackendOptions(), rpc_options1);
669 // Cause an error and wait for 1 outlier detection interval to pass to cause
670 // the backend to be ejected.
671 CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
672 RpcOptions()
673 .set_metadata(std::move(metadata))
674 .set_server_expected_error(StatusCode::CANCELLED));
675 gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
676 3000 * grpc_test_slowdown_factor()));
677 ResetBackendCounters();
678 // 1 backend experenced 1 failure, but since the threshold is 50 % no
679 // backend will be noticed as an outlier so no ejection.
680 // Both backends are still getting the RPCs intended for them.
681 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
682 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
683 EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
684 EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
685 }
686
687 // Failure percentage enforcement percentage is honored, setting it to 0 so
688 // guarantee the randomized number between 1 to 100 will always be great, so
689 // nothing will be ejected.
TEST_P(OutlierDetectionTest,FailurePercentageEnforcementPercentage)690 TEST_P(OutlierDetectionTest, FailurePercentageEnforcementPercentage) {
691 CreateAndStartBackends(2);
692 auto cluster = default_cluster_;
693 cluster.set_lb_policy(Cluster::RING_HASH);
694 auto* outlier_detection = cluster.mutable_outlier_detection();
695 SetProtoDuration(grpc_core::Duration::Seconds(1),
696 outlier_detection->mutable_interval());
697 SetProtoDuration(grpc_core::Duration::Seconds(1),
698 outlier_detection->mutable_base_ejection_time());
699 outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
700 // Setting enforcing_success_rate to 0 to ensure we will never eject.
701 // Note this parameter is the only difference between this test and
702 // FailurePercentageEjectionAndUnejection (ejection portion, value set to 100)
703 // and this one value changes means the difference between guaranteed not
704 // ejecting in this test and guaranteed ejecting in the other test.
705 outlier_detection->mutable_enforcing_failure_percentage()->set_value(0);
706 outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
707 outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
708 balancer_->ads_service()->SetCdsResource(cluster);
709 auto new_route_config = default_route_config_;
710 auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
711 auto* hash_policy = route->mutable_route()->add_hash_policy();
712 hash_policy->mutable_header()->set_header_name("address_hash");
713 SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
714 new_route_config);
715 EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
716 balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
717 // Note each type of RPC will contain a header value that will always be
718 // hashed to a specific backend as the header value matches the value used
719 // to create the entry in the ring.
720 std::vector<std::pair<std::string, std::string>> metadata = {
721 {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
722 std::vector<std::pair<std::string, std::string>> metadata1 = {
723 {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
724 const auto rpc_options = RpcOptions().set_metadata(metadata);
725 const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
726 WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
727 WaitForBackendOptions(), rpc_options);
728 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
729 WaitForBackendOptions(), rpc_options1);
730 // Cause an error and wait for 1 outlier detection interval to pass to cause
731 // the backend to be ejected.
732 CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
733 RpcOptions()
734 .set_metadata(std::move(metadata))
735 .set_server_expected_error(StatusCode::CANCELLED));
736 gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
737 3000 * grpc_test_slowdown_factor()));
738 ResetBackendCounters();
739 // 1 backend experenced failure, but since the enforcement percentage is 0, no
740 // backend will be ejected.
741 // Both backends are still getting the RPCs intended for them.
742 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
743 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
744 EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
745 EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
746 }
747
748 // Failure percentage does not eject if there are less than minimum_hosts
749 // backends Set success_rate_minimum_hosts to 3 when we only have 2 backends
TEST_P(OutlierDetectionTest,FailurePercentageMinimumHosts)750 TEST_P(OutlierDetectionTest, FailurePercentageMinimumHosts) {
751 CreateAndStartBackends(2);
752 auto cluster = default_cluster_;
753 cluster.set_lb_policy(Cluster::RING_HASH);
754 // Setup outlier failure percentage parameters.
755 // Any failure will cause an potential ejection with the probability of 100%
756 // (to eliminate flakiness of the test).
757 auto* outlier_detection = cluster.mutable_outlier_detection();
758 SetProtoDuration(grpc_core::Duration::Seconds(1),
759 outlier_detection->mutable_interval());
760 outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
761 outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
762 // Set failure_percentage_minimum_hosts to 3 when we only have 2 backends
763 // Note this parameter is the only difference between this test and
764 // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1)
765 // and this one value changes means the difference between not ejecting in
766 // this test and ejecting in the other test.
767 cluster.mutable_outlier_detection()
768 ->mutable_failure_percentage_minimum_hosts()
769 ->set_value(3);
770 cluster.mutable_outlier_detection()
771 ->mutable_failure_percentage_request_volume()
772 ->set_value(1);
773 balancer_->ads_service()->SetCdsResource(cluster);
774 auto new_route_config = default_route_config_;
775 auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
776 auto* hash_policy = route->mutable_route()->add_hash_policy();
777 hash_policy->mutable_header()->set_header_name("address_hash");
778 SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
779 new_route_config);
780 EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
781 balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
782 // Note each type of RPC will contain a header value that will always be
783 // hashed to a specific backend as the header value matches the value used
784 // to create the entry in the ring.
785 std::vector<std::pair<std::string, std::string>> metadata = {
786 {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
787 std::vector<std::pair<std::string, std::string>> metadata1 = {
788 {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
789 const auto rpc_options = RpcOptions().set_metadata(metadata);
790 const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
791 WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
792 WaitForBackendOptions(), rpc_options);
793 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
794 WaitForBackendOptions(), rpc_options1);
795 // Cause an error and wait for 1 outlier detection interval to pass to cause
796 // the backend to be ejected.
797 CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
798 RpcOptions()
799 .set_metadata(std::move(metadata))
800 .set_server_expected_error(StatusCode::CANCELLED));
801 gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
802 3000 * grpc_test_slowdown_factor()));
803 ResetBackendCounters();
804 // All traffic still reaching the original backends and no backends are
805 // ejected.
806 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
807 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
808 EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
809 EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
810 }
811
812 // Failure percentage does not eject if there are less than request_volume
813 // requests
814 // Set success_rate_request_volume to 4 when we only send 3 RPC in the
815 // interval.
TEST_P(OutlierDetectionTest,FailurePercentageRequestVolume)816 TEST_P(OutlierDetectionTest, FailurePercentageRequestVolume) {
817 CreateAndStartBackends(2);
818 auto cluster = default_cluster_;
819 cluster.set_lb_policy(Cluster::RING_HASH);
820 // Setup outlier failure percentage parameters.
821 // Any failure will cause an potential ejection with the probability of 100%
822 // (to eliminate flakiness of the test).
823 auto* outlier_detection = cluster.mutable_outlier_detection();
824 SetProtoDuration(grpc_core::Duration::Seconds(1),
825 outlier_detection->mutable_interval());
826 outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
827 outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
828 outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
829 // Set failure_percentage_request_volume to 4 when we only send 3 RPC in the
830 // interval.
831 // // Note this parameter is the only difference between this test and
832 // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1)
833 // and this one value changes means the difference between not ejecting in
834 // this test and ejecting in the other test.
835 outlier_detection->mutable_failure_percentage_request_volume()->set_value(4);
836 balancer_->ads_service()->SetCdsResource(cluster);
837 auto new_route_config = default_route_config_;
838 auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
839 auto* hash_policy = route->mutable_route()->add_hash_policy();
840 hash_policy->mutable_header()->set_header_name("address_hash");
841 SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
842 new_route_config);
843 EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
844 balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
845 // Note each type of RPC will contain a header value that will always be
846 // hashed to a specific backend as the header value matches the value used
847 // to create the entry in the ring.
848 std::vector<std::pair<std::string, std::string>> metadata = {
849 {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
850 std::vector<std::pair<std::string, std::string>> metadata1 = {
851 {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
852 const auto rpc_options = RpcOptions().set_metadata(metadata);
853 const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
854 WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
855 WaitForBackendOptions(), rpc_options);
856 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
857 WaitForBackendOptions(), rpc_options1);
858 // Cause an error and wait for 1 outlier detection interval to pass to cause
859 // the backend to be ejected.
860 CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
861 RpcOptions()
862 .set_metadata(std::move(metadata))
863 .set_server_expected_error(StatusCode::CANCELLED));
864 gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
865 3000 * grpc_test_slowdown_factor()));
866 ResetBackendCounters();
867 // All traffic still reaching the original backends and no backends are
868 // ejected.
869 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
870 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
871 EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
872 EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
873 }
874
875 // Tests SuccessRate and FailurePercentage both configured
876 // Configure max_ejection_percent to 50% which means max 2/4 backends can be
877 // ejected.
878 // Configure success rate to eject 1 and failure percentage to eject 2.
879 // Verify that maximum 2 backends are ejected, not 3!
TEST_P(OutlierDetectionTest,SuccessRateAndFailurePercentage)880 TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentage) {
881 CreateAndStartBackends(4);
882 auto cluster = default_cluster_;
883 cluster.set_lb_policy(Cluster::RING_HASH);
884 // Setup outlier failure percentage parameters.
885 // Any failure will cause an potential ejection with the probability of 100%
886 // (to eliminate flakiness of the test).
887 auto* outlier_detection = cluster.mutable_outlier_detection();
888 SetProtoDuration(grpc_core::Duration::Seconds(1),
889 outlier_detection->mutable_interval());
890 outlier_detection->mutable_max_ejection_percent()->set_value(50);
891 // This stdev of 500 will ensure the number of ok RPC and error RPC we send
892 // will make 1 outlier out of the 4 backends.
893 outlier_detection->mutable_success_rate_stdev_factor()->set_value(500);
894 outlier_detection->mutable_enforcing_success_rate()->set_value(100);
895 outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
896 outlier_detection->mutable_success_rate_request_volume()->set_value(1);
897 outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
898 outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
899 outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
900 outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
901 balancer_->ads_service()->SetCdsResource(cluster);
902 auto new_route_config = default_route_config_;
903 auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
904 auto* hash_policy = route->mutable_route()->add_hash_policy();
905 hash_policy->mutable_header()->set_header_name("address_hash");
906 SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
907 new_route_config);
908 EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
909 balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
910 // Note each type of RPC will contain a header value that will always be
911 // hashed to a specific backend as the header value matches the value used
912 // to create the entry in the ring.
913 std::vector<std::pair<std::string, std::string>> metadata = {
914 {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
915 std::vector<std::pair<std::string, std::string>> metadata1 = {
916 {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
917 std::vector<std::pair<std::string, std::string>> metadata2 = {
918 {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
919 std::vector<std::pair<std::string, std::string>> metadata3 = {
920 {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
921 const auto rpc_options = RpcOptions().set_metadata(metadata);
922 const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
923 const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
924 const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
925 WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
926 WaitForBackendOptions(), rpc_options);
927 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
928 WaitForBackendOptions(), rpc_options1);
929 WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
930 WaitForBackendOptions(), rpc_options2);
931 WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
932 WaitForBackendOptions(), rpc_options3);
933 // Cause 2 errors on 1 backend and 1 error on 2 backends and wait for 2
934 // backends to be ejected. The 2 errors to the 1 backend will make exactly 1
935 // outlier from the success rate algorithm; all 4 errors will make 3 outliers
936 // from the failure pecentage algorithm because the threahold is set to 0. I
937 // have verified through debug logs we eject 1 backend because of success
938 // rate, 1 backend because of failure percentage; but as we attempt to eject
939 // another backend because of failure percentage we will stop as we have
940 // reached our 50% limit.
941 CheckRpcSendFailure(
942 DEBUG_LOCATION, StatusCode::CANCELLED, "",
943 RpcOptions().set_metadata(metadata).set_server_expected_error(
944 StatusCode::CANCELLED));
945 CheckRpcSendFailure(
946 DEBUG_LOCATION, StatusCode::CANCELLED, "",
947 RpcOptions().set_metadata(metadata).set_server_expected_error(
948 StatusCode::CANCELLED));
949 CheckRpcSendFailure(
950 DEBUG_LOCATION, StatusCode::CANCELLED, "",
951 RpcOptions().set_metadata(metadata1).set_server_expected_error(
952 StatusCode::CANCELLED));
953 CheckRpcSendFailure(
954 DEBUG_LOCATION, StatusCode::CANCELLED, "",
955 RpcOptions().set_metadata(metadata2).set_server_expected_error(
956 StatusCode::CANCELLED));
957 absl::Time deadline =
958 absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
959 std::vector<size_t> idx = {0, 1, 2, 3};
960 while (true) {
961 ResetBackendCounters();
962 CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
963 CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
964 CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options2);
965 CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options3);
966 if (std::count_if(idx.begin(), idx.end(),
967 [this](size_t i) { return SeenBackend(i); }) == 2) {
968 break;
969 }
970 EXPECT_LE(absl::Now(), deadline);
971 if (absl::Now() >= deadline) break;
972 }
973 ResetBackendCounters();
974 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
975 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
976 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
977 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
978 size_t empty_load_backend_count = 0;
979 size_t double_load_backend_count = 0;
980 for (size_t i = 0; i < backends_.size(); ++i) {
981 if (backends_[i]->backend_service()->request_count() == 0) {
982 ++empty_load_backend_count;
983 } else if (backends_[i]->backend_service()->request_count() >= 100) {
984 // The extra load could go to 2 remaining backends or just 1 of them.
985 ++double_load_backend_count;
986 } else if (backends_[i]->backend_service()->request_count() > 300) {
987 GPR_ASSERT(1);
988 }
989 }
990 EXPECT_EQ(2, empty_load_backend_count);
991 EXPECT_EQ(2, double_load_backend_count);
992 }
993
994 // Tests SuccessRate and FailurePercentage both unconfigured;
995 // This is the case where according to the gRFC we need to instruct the picker
996 // not to do counting or even start the timer. The result of not counting is
997 // that there will be no ejection taking place since we can't do any
998 // calculations.
TEST_P(OutlierDetectionTest,SuccessRateAndFailurePercentageBothDisabled)999 TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentageBothDisabled) {
1000 CreateAndStartBackends(2);
1001 auto cluster = default_cluster_;
1002 cluster.set_lb_policy(Cluster::RING_HASH);
1003 auto* outlier_detection = cluster.mutable_outlier_detection();
1004 SetProtoDuration(grpc_core::Duration::Seconds(1),
1005 outlier_detection->mutable_interval());
1006 SetProtoDuration(grpc_core::Duration::Seconds(1),
1007 outlier_detection->mutable_base_ejection_time());
1008 balancer_->ads_service()->SetCdsResource(cluster);
1009 auto new_route_config = default_route_config_;
1010 auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
1011 auto* hash_policy = route->mutable_route()->add_hash_policy();
1012 hash_policy->mutable_header()->set_header_name("address_hash");
1013 SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
1014 new_route_config);
1015 EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
1016 balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
1017 // Note each type of RPC will contain a header value that will always be
1018 // hashed to a specific backend as the header value matches the value used
1019 // to create the entry in the ring.
1020 std::vector<std::pair<std::string, std::string>> metadata = {
1021 {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
1022 std::vector<std::pair<std::string, std::string>> metadata1 = {
1023 {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
1024 const auto rpc_options = RpcOptions().set_metadata(metadata);
1025 const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
1026 WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
1027 WaitForBackendOptions(), rpc_options);
1028 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1029 WaitForBackendOptions(), rpc_options1);
1030 // Cause an error and wait for 1 outlier detection interval to pass
1031 CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
1032 RpcOptions()
1033 .set_metadata(std::move(metadata))
1034 .set_server_expected_error(StatusCode::CANCELLED));
1035 gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
1036 3000 * grpc_test_slowdown_factor()));
1037 ResetBackendCounters();
1038 // 1 backend experenced failure, but since there is no counting there is no
1039 // ejection. Both backends are still getting the RPCs intended for them.
1040 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
1041 CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
1042 EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
1043 EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
1044 }
1045
1046 // Tests that we uneject any ejected addresses when the OD policy is
1047 // disabled.
TEST_P(OutlierDetectionTest,DisableOutlierDetectionWhileAddressesAreEjected)1048 TEST_P(OutlierDetectionTest, DisableOutlierDetectionWhileAddressesAreEjected) {
1049 CreateAndStartBackends(2);
1050 auto cluster = default_cluster_;
1051 cluster.set_lb_policy(Cluster::RING_HASH);
1052 // Setup outlier failure percentage parameters.
1053 // Any failure will cause an potential ejection with the probability of 100%
1054 // (to eliminate flakiness of the test).
1055 auto* outlier_detection = cluster.mutable_outlier_detection();
1056 SetProtoDuration(grpc_core::Duration::Seconds(1),
1057 outlier_detection->mutable_interval());
1058 SetProtoDuration(grpc_core::Duration::Seconds(3),
1059 outlier_detection->mutable_base_ejection_time());
1060 outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
1061 outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
1062 outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
1063 outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
1064 balancer_->ads_service()->SetCdsResource(cluster);
1065 auto new_route_config = default_route_config_;
1066 auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
1067 auto* hash_policy = route->mutable_route()->add_hash_policy();
1068 hash_policy->mutable_header()->set_header_name("address_hash");
1069 SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
1070 new_route_config);
1071 EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
1072 balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
1073 // Note each type of RPC will contain a header value that will always be
1074 // hashed to a specific backend as the header value matches the value used
1075 // to create the entry in the ring.
1076 std::vector<std::pair<std::string, std::string>> metadata = {
1077 {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
1078 std::vector<std::pair<std::string, std::string>> metadata1 = {
1079 {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
1080 const auto rpc_options = RpcOptions().set_metadata(metadata);
1081 const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
1082 WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
1083 WaitForBackendOptions(), rpc_options);
1084 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1085 WaitForBackendOptions(), rpc_options1);
1086 // Cause an error and wait for traffic aimed at backend 0 to start going to
1087 // backend 1.
1088 CheckRpcSendFailure(
1089 DEBUG_LOCATION, StatusCode::CANCELLED, "",
1090 RpcOptions().set_metadata(metadata).set_server_expected_error(
1091 StatusCode::CANCELLED));
1092 WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1093 WaitForBackendOptions().set_timeout_ms(
1094 3000 * grpc_test_slowdown_factor()),
1095 rpc_options);
1096 // 1 backend is ejected all traffic going to the ejected backend should now
1097 // all be going to the other backend.
1098 // failure percentage enforcement_percentage of 100% is honored as this test
1099 // will consistently reject 1 backend.
1100 CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
1101 EXPECT_EQ(1, backends_[1]->backend_service()->request_count());
1102 // Send an update that disables outlier detection.
1103 cluster.clear_outlier_detection();
1104 balancer_->ads_service()->SetCdsResource(cluster);
1105 // Wait for the backend to start being used again.
1106 WaitForBackend(
1107 DEBUG_LOCATION, 0,
1108 [](const RpcResult& result) {
1109 EXPECT_EQ(result.status.error_code(), StatusCode::CANCELLED)
1110 << "Error: " << result.status.error_message();
1111 },
1112 WaitForBackendOptions(),
1113 RpcOptions()
1114 .set_metadata(std::move(metadata))
1115 .set_server_expected_error(StatusCode::CANCELLED));
1116 }
1117
// Verifies that an ejection is retained when a later EDS update moves the
// ejected backend to a different priority.
TEST_P(OutlierDetectionTest, EjectionRetainedAcrossPriorities) {
  CreateAndStartBackends(3);
  auto cds_cluster = default_cluster_;
  // Failure-percentage settings under which any failure is a potential
  // ejection, enforced 100% of the time (to eliminate flakiness of the test).
  // The 10-minute base ejection time is far longer than the waits in this
  // test.
  auto* od_config = cds_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Minutes(10),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_cluster);
  // Priority 0: backend 0 and a non-existent backend.
  // Priority 1: backend 1.
  EdsResourceArgs initial_args({
      {"locality0", {CreateEndpoint(0), MakeNonExistantEndpoint()}},
      {"locality1", {CreateEndpoint(1)}, kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(initial_args));
  WaitForBackend(DEBUG_LOCATION, 0);
  // Trigger an error to backend 0.
  // The failure percentage enforcement percentage is 100%, so this will cause
  // the backend to be ejected when the ejection timer fires.
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_server_expected_error(StatusCode::CANCELLED));
  // Wait for traffic aimed at backend 0 to start going to backend 1, which
  // tells us that backend 0 has been ejected. It should take no more than one
  // ejection timer interval.
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions().set_timeout_ms(
                     3000 * grpc_test_slowdown_factor()));
  // Now send an EDS update that moves backend 0 to priority 1.
  // Backend 2 is added as well, so that we know when the client has seen the
  // update.
  EdsResourceArgs updated_args({
      {"locality0", {MakeNonExistantEndpoint()}},
      {"locality1", CreateEndpointsForBackends(), kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(updated_args));
  WaitForBackend(DEBUG_LOCATION, 2);
  // Send 100 RPCs and make sure they are split between backends 1 and 2,
  // because backend 0 should still be ejected.
  CheckRpcSendOk(DEBUG_LOCATION, 100);
  const int expected_requests[] = {0, 50, 50};
  for (size_t i = 0; i < 3; ++i) {
    EXPECT_EQ(expected_requests[i],
              backends_[i]->backend_service()->request_count())
        << "backend " << i;
  }
}
1169
1170 } // namespace
1171 } // namespace testing
1172 } // namespace grpc
1173
main(int argc,char ** argv)1174 int main(int argc, char** argv) {
1175 grpc::testing::TestEnvironment env(&argc, argv);
1176 ::testing::InitGoogleTest(&argc, argv);
1177 // Make the backup poller poll very frequently in order to pick up
1178 // updates from all the subchannels's FDs.
1179 grpc_core::ConfigVars::Overrides overrides;
1180 overrides.client_channel_backup_poll_interval_ms = 1;
1181 grpc_core::ConfigVars::SetOverrides(overrides);
1182 #if TARGET_OS_IPHONE
1183 // Workaround Apple CFStream bug
1184 grpc_core::SetEnv("grpc_cfstream", "0");
1185 #endif
1186 grpc_init();
1187 const auto result = RUN_ALL_TESTS();
1188 grpc_shutdown();
1189 return result;
1190 }
1191