xref: /aosp_15_r20/external/grpc-grpc/test/cpp/end2end/xds/xds_outlier_detection_end2end_test.cc (revision cc02d7e222339f7a4f6ba5f422e6413f4bd931f2)
1 // Copyright 2017 gRPC authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <chrono>
16 #include <string>
17 #include <thread>
18 #include <vector>
19 
20 #include <gmock/gmock.h>
21 #include <gtest/gtest.h>
22 
23 #include "src/core/client_channel/backup_poller.h"
24 #include "src/core/lib/config/config_vars.h"
25 #include "src/proto/grpc/testing/xds/v3/cluster.grpc.pb.h"
26 #include "src/proto/grpc/testing/xds/v3/fault.grpc.pb.h"
27 #include "src/proto/grpc/testing/xds/v3/outlier_detection.grpc.pb.h"
28 #include "src/proto/grpc/testing/xds/v3/router.grpc.pb.h"
29 #include "test/core/util/resolve_localhost_ip46.h"
30 #include "test/cpp/end2end/xds/xds_end2end_test_lib.h"
31 
32 namespace grpc {
33 namespace testing {
34 namespace {
35 
36 class OutlierDetectionTest : public XdsEnd2endTest {
37  protected:
CreateMetadataValueThatHashesToBackend(int index)38   std::string CreateMetadataValueThatHashesToBackend(int index) {
39     return absl::StrCat(grpc_core::LocalIp(), ":", backends_[index]->port(),
40                         "_0");
41   }
42 };
43 
44 INSTANTIATE_TEST_SUITE_P(XdsTest, OutlierDetectionTest,
45                          ::testing::Values(XdsTestType()), &XdsTestType::Name);
// TODO(donnadionne): add a non-xds test in a new
// test/cpp/end2end/outlier_detection_end2end_test.cc
48 
49 // Tests SuccessRateEjectionAndUnejection:
50 // 1. Use ring hash policy that hashes using a header value to ensure rpcs
51 //    go to all backends.
52 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
53 //    interval to pass.
54 // 3. We should skip exactly 1 backend due to ejection and all the loads
55 //    sticky to that backend should go to 1 other backend.
56 // 4. Let the ejection period pass and verify we can go back to both backends
57 //    after the uneject.
TEST_P(OutlierDetectionTest,SuccessRateEjectionAndUnejection)58 TEST_P(OutlierDetectionTest, SuccessRateEjectionAndUnejection) {
59   CreateAndStartBackends(2);
60   auto cluster = default_cluster_;
61   cluster.set_lb_policy(Cluster::RING_HASH);
62   // Setup outlier failure percentage parameters.
63   // Any failure will cause an potential ejection with the probability of 100%
64   // (to eliminate flakiness of the test).
65   auto* outlier_detection = cluster.mutable_outlier_detection();
66   SetProtoDuration(grpc_core::Duration::Seconds(1),
67                    outlier_detection->mutable_interval());
68   SetProtoDuration(grpc_core::Duration::Seconds(1),
69                    outlier_detection->mutable_base_ejection_time());
70   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
71   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
72   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
73   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
74   balancer_->ads_service()->SetCdsResource(cluster);
75   auto new_route_config = default_route_config_;
76   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
77   auto* hash_policy = route->mutable_route()->add_hash_policy();
78   hash_policy->mutable_header()->set_header_name("address_hash");
79   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
80                                    new_route_config);
81   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
82   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
83   // Note each type of RPC will contain a header value that will always be
84   // hashed to a specific backend as the header value matches the value used
85   // to create the entry in the ring.
86   std::vector<std::pair<std::string, std::string>> metadata = {
87       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
88   std::vector<std::pair<std::string, std::string>> metadata1 = {
89       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
90   const auto rpc_options = RpcOptions().set_metadata(metadata);
91   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
92   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
93                  WaitForBackendOptions(), rpc_options);
94   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
95                  WaitForBackendOptions(), rpc_options1);
96   // Trigger an error to backend 0.
97   // The success rate enforcement_percentage is 100%, so this will cause
98   // the backend to be ejected when the ejection timer fires.
99   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
100                       RpcOptions()
101                           .set_metadata(std::move(metadata))
102                           .set_server_expected_error(StatusCode::CANCELLED));
103   // Wait for traffic aimed at backend 0 to start going to backend 1.
104   // This tells us that backend 0 has been ejected.
105   // It should take no more than one ejection timer interval.
106   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
107                  WaitForBackendOptions().set_timeout_ms(
108                      3000 * grpc_test_slowdown_factor()),
109                  rpc_options);
110   // Now wait for traffic aimed at backend 0 to switch back to backend 0.
111   // This tells us that backend 0 has been unejected.
112   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
113                  WaitForBackendOptions().set_timeout_ms(
114                      3000 * grpc_test_slowdown_factor()),
115                  rpc_options);
116 }
117 
118 // We don't eject more than max_ejection_percent (default 10%) of the backends
119 // beyond the first one.
TEST_P(OutlierDetectionTest,SuccessRateMaxPercent)120 TEST_P(OutlierDetectionTest, SuccessRateMaxPercent) {
121   CreateAndStartBackends(4);
122   auto cluster = default_cluster_;
123   cluster.set_lb_policy(Cluster::RING_HASH);
124   // Setup outlier failure percentage parameters.
125   // Any failure will cause an potential ejection with the probability of 100%
126   // (to eliminate flakiness of the test).
127   auto* outlier_detection = cluster.mutable_outlier_detection();
128   SetProtoDuration(grpc_core::Duration::Seconds(1),
129                    outlier_detection->mutable_interval());
130   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
131   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
132   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
133   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
134   balancer_->ads_service()->SetCdsResource(cluster);
135   auto new_route_config = default_route_config_;
136   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
137   auto* hash_policy = route->mutable_route()->add_hash_policy();
138   hash_policy->mutable_header()->set_header_name("address_hash");
139   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
140                                    new_route_config);
141   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
142   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
143   // Note each type of RPC will contain a header value that will always be
144   // hashed to a specific backend as the header value matches the value used
145   // to create the entry in the ring.
146   std::vector<std::pair<std::string, std::string>> metadata = {
147       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
148   std::vector<std::pair<std::string, std::string>> metadata1 = {
149       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
150   std::vector<std::pair<std::string, std::string>> metadata2 = {
151       {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
152   std::vector<std::pair<std::string, std::string>> metadata3 = {
153       {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
154   const auto rpc_options = RpcOptions().set_metadata(metadata);
155   const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
156   const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
157   const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
158   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
159                  WaitForBackendOptions(), rpc_options);
160   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
161                  WaitForBackendOptions(), rpc_options1);
162   WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
163                  WaitForBackendOptions(), rpc_options2);
164   WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
165                  WaitForBackendOptions(), rpc_options3);
166   // Cause 2 errors and wait until one ejection happens.
167   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
168                       RpcOptions()
169                           .set_metadata(std::move(metadata))
170                           .set_server_expected_error(StatusCode::CANCELLED));
171   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
172                       RpcOptions()
173                           .set_metadata(std::move(metadata1))
174                           .set_server_expected_error(StatusCode::CANCELLED));
175   absl::Time deadline =
176       absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
177   while (true) {
178     ResetBackendCounters();
179     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
180     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
181     if (!SeenAllBackends(0, 2)) {
182       break;
183     }
184     EXPECT_LE(absl::Now(), deadline);
185     if (absl::Now() >= deadline) break;
186   }
187   // 1 backend should be ejected, trafficed picked up by another backend.
188   // No other backend should be ejected.
189   ResetBackendCounters();
190   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
191   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
192   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
193   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
194   size_t empty_load_backend_count = 0;
195   size_t double_load_backend_count = 0;
196   size_t regular_load_backend_count = 0;
197   for (size_t i = 0; i < backends_.size(); ++i) {
198     if (backends_[i]->backend_service()->request_count() == 0) {
199       ++empty_load_backend_count;
200     } else if (backends_[i]->backend_service()->request_count() == 200) {
201       ++double_load_backend_count;
202     } else if (backends_[i]->backend_service()->request_count() == 100) {
203       ++regular_load_backend_count;
204     } else {
205       GPR_ASSERT(1);
206     }
207   }
208   EXPECT_EQ(1, empty_load_backend_count);
209   EXPECT_EQ(1, double_load_backend_count);
210   EXPECT_EQ(2, regular_load_backend_count);
211 }
212 
213 // Success rate stdev_factor is honored, a higher value would ensure ejection
214 // does not occur.
TEST_P(OutlierDetectionTest,SuccessRateStdevFactor)215 TEST_P(OutlierDetectionTest, SuccessRateStdevFactor) {
216   CreateAndStartBackends(2);
217   auto cluster = default_cluster_;
218   cluster.set_lb_policy(Cluster::RING_HASH);
219   // Setup outlier failure percentage parameters.
220   // Any failure will cause an potential ejection with the probability of 100%
221   // (to eliminate flakiness of the test).
222   auto* outlier_detection = cluster.mutable_outlier_detection();
223   SetProtoDuration(grpc_core::Duration::Seconds(1),
224                    outlier_detection->mutable_interval());
225   SetProtoDuration(grpc_core::Duration::Seconds(1),
226                    outlier_detection->mutable_base_ejection_time());
227   // We know a stdev factor of 100 will ensure the ejection occurs, so setting
228   // it to something higher like 1000 to test that ejection will not occur.
229   // Note this parameter is the only difference between this test and
230   // SuccessRateEjectionAndUnejection (ejection portion, value set to 100) and
231   // this one value changes means the difference between not ejecting in this
232   // test and ejecting in the other test.
233   outlier_detection->mutable_success_rate_stdev_factor()->set_value(1000);
234   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
235   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
236   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
237   balancer_->ads_service()->SetCdsResource(cluster);
238   auto new_route_config = default_route_config_;
239   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
240   auto* hash_policy = route->mutable_route()->add_hash_policy();
241   hash_policy->mutable_header()->set_header_name("address_hash");
242   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
243                                    new_route_config);
244   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
245   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
246   // Note each type of RPC will contain a header value that will always be
247   // hashed to a specific backend as the header value matches the value used
248   // to create the entry in the ring.
249   std::vector<std::pair<std::string, std::string>> metadata = {
250       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
251   std::vector<std::pair<std::string, std::string>> metadata1 = {
252       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
253   const auto rpc_options = RpcOptions().set_metadata(metadata);
254   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
255   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
256                  WaitForBackendOptions(), rpc_options);
257   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
258                  WaitForBackendOptions(), rpc_options1);
259   // Cause an error and wait for 1 outlier detection interval to pass
260   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
261                       RpcOptions()
262                           .set_metadata(std::move(metadata))
263                           .set_server_expected_error(StatusCode::CANCELLED));
264   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
265       3000 * grpc_test_slowdown_factor()));
266   ResetBackendCounters();
267   // 1 backend experenced failure, but since the stdev_factor is high, no
268   // backend will be noticed as an outlier so no ejection.
269   // Both backends are still getting the RPCs intended for them.
270   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
271   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
272   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
273   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
274 }
275 
// Success rate enforcement percentage is honored: setting it to 0 guarantees
// that the random number drawn between 1 and 100 is always greater than it,
// so nothing will be ejected.
TEST_P(OutlierDetectionTest,SuccessRateEnforcementPercentage)279 TEST_P(OutlierDetectionTest, SuccessRateEnforcementPercentage) {
280   CreateAndStartBackends(2);
281   auto cluster = default_cluster_;
282   cluster.set_lb_policy(Cluster::RING_HASH);
283   auto* outlier_detection = cluster.mutable_outlier_detection();
284   SetProtoDuration(grpc_core::Duration::Seconds(1),
285                    outlier_detection->mutable_interval());
286   SetProtoDuration(grpc_core::Duration::Seconds(1),
287                    outlier_detection->mutable_base_ejection_time());
288   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
289   // Setting enforcing_success_rate to 0 to ensure we will never eject.
290   // Note this parameter is the only difference between this test and
291   // SuccessRateEjectionAndUnejection (ejection portion, value set to 100) and
292   // this one value changes means the difference between guaranteed not ejecting
293   // in this test and guaranteed ejecting in the other test.
294   outlier_detection->mutable_enforcing_success_rate()->set_value(0);
295   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
296   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
297   balancer_->ads_service()->SetCdsResource(cluster);
298   auto new_route_config = default_route_config_;
299   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
300   auto* hash_policy = route->mutable_route()->add_hash_policy();
301   hash_policy->mutable_header()->set_header_name("address_hash");
302   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
303                                    new_route_config);
304   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
305   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
306   // Note each type of RPC will contain a header value that will always be
307   // hashed to a specific backend as the header value matches the value used
308   // to create the entry in the ring.
309   std::vector<std::pair<std::string, std::string>> metadata = {
310       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
311   std::vector<std::pair<std::string, std::string>> metadata1 = {
312       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
313   const auto rpc_options = RpcOptions().set_metadata(metadata);
314   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
315   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
316                  WaitForBackendOptions(), rpc_options);
317   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
318                  WaitForBackendOptions(), rpc_options1);
319   // Cause an error and wait for 1 outlier detection interval to pass
320   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
321                       RpcOptions()
322                           .set_metadata(std::move(metadata))
323                           .set_server_expected_error(StatusCode::CANCELLED));
324   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
325       3000 * grpc_test_slowdown_factor()));
326   ResetBackendCounters();
327   // 1 backend experenced failure, but since the enforcement percentage is 0, no
328   // backend will be ejected.
329   // Both backends are still getting the RPCs intended for them.
330   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
331   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
332   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
333   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
334 }
335 
336 // Success rate does not eject if there are less than minimum_hosts backends
337 // Set success_rate_minimum_hosts to 3 when we only have 2 backends
TEST_P(OutlierDetectionTest,SuccessRateMinimumHosts)338 TEST_P(OutlierDetectionTest, SuccessRateMinimumHosts) {
339   CreateAndStartBackends(2);
340   auto cluster = default_cluster_;
341   cluster.set_lb_policy(Cluster::RING_HASH);
342   // Setup outlier failure percentage parameters.
343   // Any failure will cause an potential ejection with the probability of 100%
344   // (to eliminate flakiness of the test).
345   auto* outlier_detection = cluster.mutable_outlier_detection();
346   SetProtoDuration(grpc_core::Duration::Seconds(1),
347                    outlier_detection->mutable_interval());
348   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
349   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
350   // Set success_rate_minimum_hosts to 3 when we only have 2 backends
351   // Note this parameter is the only difference between this test and
352   // SuccessRateEjectionAndUnejection (ejection portion, value set to 1) and
353   // this one value changes means the difference between not ejecting in this
354   // test and ejecting in the other test.
355   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(3);
356   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
357   balancer_->ads_service()->SetCdsResource(cluster);
358   auto new_route_config = default_route_config_;
359   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
360   auto* hash_policy = route->mutable_route()->add_hash_policy();
361   hash_policy->mutable_header()->set_header_name("address_hash");
362   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
363                                    new_route_config);
364   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
365   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
366   // Note each type of RPC will contain a header value that will always be
367   // hashed to a specific backend as the header value matches the value used
368   // to create the entry in the ring.
369   std::vector<std::pair<std::string, std::string>> metadata = {
370       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
371   std::vector<std::pair<std::string, std::string>> metadata1 = {
372       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
373   const auto rpc_options = RpcOptions().set_metadata(metadata);
374   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
375   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
376                  WaitForBackendOptions(), rpc_options);
377   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
378                  WaitForBackendOptions(), rpc_options1);
379   // Cause an error and wait for 1 outlier detection interval to pass
380   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
381                       RpcOptions()
382                           .set_metadata(std::move(metadata))
383                           .set_server_expected_error(StatusCode::CANCELLED));
384   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
385       3000 * grpc_test_slowdown_factor()));
386   ResetBackendCounters();
387   // All traffic still reaching the original backends and no backends are
388   // ejected.
389   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
390   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
391   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
392   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
393 }
394 
395 // Success rate does not eject if there are less than request_volume requests
396 // Set success_rate_request_volume to 4 when we only send 3 RPC in the
397 // interval.
TEST_P(OutlierDetectionTest,SuccessRateRequestVolume)398 TEST_P(OutlierDetectionTest, SuccessRateRequestVolume) {
399   CreateAndStartBackends(2);
400   auto cluster = default_cluster_;
401   cluster.set_lb_policy(Cluster::RING_HASH);
402   // Setup outlier failure percentage parameters.
403   // Any failure will cause an potential ejection with the probability of 100%
404   // (to eliminate flakiness of the test).
405   auto* outlier_detection = cluster.mutable_outlier_detection();
406   SetProtoDuration(grpc_core::Duration::Seconds(1),
407                    outlier_detection->mutable_interval());
408   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
409   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
410   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
411   // Set success_rate_request_volume to 4 when we only send 3 RPC in the
412   // interval.
413   // Note this parameter is the only difference between this test and
414   // SuccessRateEjectionAndUnejection (ejection portion, value set to 1) and
415   // this one value changes means the difference between not ejecting in this
416   // test and ejecting in the other test.
417   outlier_detection->mutable_success_rate_request_volume()->set_value(4);
418   balancer_->ads_service()->SetCdsResource(cluster);
419   auto new_route_config = default_route_config_;
420   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
421   auto* hash_policy = route->mutable_route()->add_hash_policy();
422   hash_policy->mutable_header()->set_header_name("address_hash");
423   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
424                                    new_route_config);
425   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
426   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
427   // Note each type of RPC will contain a header value that will always be
428   // hashed to a specific backend as the header value matches the value used
429   // to create the entry in the ring.
430   std::vector<std::pair<std::string, std::string>> metadata = {
431       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
432   std::vector<std::pair<std::string, std::string>> metadata1 = {
433       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
434   const auto rpc_options = RpcOptions().set_metadata(metadata);
435   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
436   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
437                  WaitForBackendOptions(), rpc_options);
438   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
439                  WaitForBackendOptions(), rpc_options1);
440   // Cause an error and wait for 1 outlier detection interval to pass
441   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
442                       RpcOptions()
443                           .set_metadata(std::move(metadata))
444                           .set_server_expected_error(StatusCode::CANCELLED));
445   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
446       3000 * grpc_test_slowdown_factor()));
447   ResetBackendCounters();
448   // All traffic still reaching the original backends and no backends are
449   // ejected.
450   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
451   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
452   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
453   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
454 }
455 
456 // Tests FailurePercentageEjectionAndUnejection:
457 // 1. Use ring hash policy that hashes using a header value to ensure RPCs
458 //    go to all backends.
459 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
460 //    interval to pass.
461 // 3. We should skip exactly 1 backend due to ejection and all the loads
462 //    sticky to that backend should go to 1 other backend.
// 4. Let the ejection period pass and verify that traffic again goes to both
//    backends once the ejected backend has been unejected.
TEST_P(OutlierDetectionTest,FailurePercentageEjectionAndUnejection)465 TEST_P(OutlierDetectionTest, FailurePercentageEjectionAndUnejection) {
466   CreateAndStartBackends(2);
467   auto cluster = default_cluster_;
468   cluster.set_lb_policy(Cluster::RING_HASH);
469   // Setup outlier failure percentage parameters.
470   // Any failure will cause an potential ejection with the probability of 100%
471   // (to eliminate flakiness of the test).
472   auto* outlier_detection = cluster.mutable_outlier_detection();
473   SetProtoDuration(grpc_core::Duration::Seconds(1),
474                    outlier_detection->mutable_interval());
475   SetProtoDuration(grpc_core::Duration::Seconds(3),
476                    outlier_detection->mutable_base_ejection_time());
477   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
478   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
479   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
480   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
481   balancer_->ads_service()->SetCdsResource(cluster);
482   auto new_route_config = default_route_config_;
483   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
484   auto* hash_policy = route->mutable_route()->add_hash_policy();
485   hash_policy->mutable_header()->set_header_name("address_hash");
486   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
487                                    new_route_config);
488   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
489   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
490   // Note each type of RPC will contain a header value that will always be
491   // hashed to a specific backend as the header value matches the value used
492   // to create the entry in the ring.
493   std::vector<std::pair<std::string, std::string>> metadata = {
494       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
495   std::vector<std::pair<std::string, std::string>> metadata1 = {
496       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
497   const auto rpc_options = RpcOptions().set_metadata(metadata);
498   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
499   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
500                  WaitForBackendOptions(), rpc_options);
501   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
502                  WaitForBackendOptions(), rpc_options1);
503   // Cause an error and wait for traffic aimed at backend 0 to start going to
504   // backend 1.
505   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
506                       RpcOptions()
507                           .set_metadata(std::move(metadata))
508                           .set_server_expected_error(StatusCode::CANCELLED));
509   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
510                  WaitForBackendOptions().set_timeout_ms(
511                      3000 * grpc_test_slowdown_factor()),
512                  rpc_options);
513   // 1 backend is ejected all traffic going to the ejected backend should now
514   // all be going to the other backend.
515   // failure percentage enforcement_percentage of 100% is honored as this test
516   // will consistently reject 1 backend.
517   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
518   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
519   // Now wait for traffic aimed at backend 0 to switch back to backend 0.
520   // This tells us that backend 0 has been unejected.
521   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
522                  WaitForBackendOptions().set_timeout_ms(
523                      30000 * grpc_test_slowdown_factor()),
524                  rpc_options);
525   // Verify that rpcs go to their expectedly hashed backends.
526   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
527   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
528   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
529   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
530 }
531 
532 // We don't eject more than max_ejection_percent (default 10%) of the backends
533 // beyond the first one.
TEST_P(OutlierDetectionTest,FailurePercentageMaxPercentage)534 TEST_P(OutlierDetectionTest, FailurePercentageMaxPercentage) {
535   CreateAndStartBackends(4);
536   auto cluster = default_cluster_;
537   cluster.set_lb_policy(Cluster::RING_HASH);
538   // Setup outlier failure percentage parameters.
539   // Any failure will cause an potential ejection with the probability of 100%
540   // (to eliminate flakiness of the test).
541   auto* outlier_detection = cluster.mutable_outlier_detection();
542   SetProtoDuration(grpc_core::Duration::Seconds(1),
543                    outlier_detection->mutable_interval());
544   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
545   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
546   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
547   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
548   balancer_->ads_service()->SetCdsResource(cluster);
549   auto new_route_config = default_route_config_;
550   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
551   auto* hash_policy = route->mutable_route()->add_hash_policy();
552   hash_policy->mutable_header()->set_header_name("address_hash");
553   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
554                                    new_route_config);
555   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
556   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
557   // Note each type of RPC will contain a header value that will always be
558   // hashed to a specific backend as the header value matches the value used
559   // to create the entry in the ring.
560   std::vector<std::pair<std::string, std::string>> metadata = {
561       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
562   std::vector<std::pair<std::string, std::string>> metadata1 = {
563       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
564   std::vector<std::pair<std::string, std::string>> metadata2 = {
565       {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
566   std::vector<std::pair<std::string, std::string>> metadata3 = {
567       {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
568   const auto rpc_options = RpcOptions().set_metadata(metadata);
569   const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
570   const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
571   const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
572   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
573                  WaitForBackendOptions(), rpc_options);
574   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
575                  WaitForBackendOptions(), rpc_options1);
576   WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
577                  WaitForBackendOptions(), rpc_options2);
578   WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
579                  WaitForBackendOptions(), rpc_options3);
580   // Cause 2 errors and wait until one ejection happens.
581   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
582                       RpcOptions()
583                           .set_metadata(std::move(metadata))
584                           .set_server_expected_error(StatusCode::CANCELLED));
585   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
586                       RpcOptions()
587                           .set_metadata(std::move(metadata1))
588                           .set_server_expected_error(StatusCode::CANCELLED));
589   absl::Time deadline =
590       absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
591   while (true) {
592     ResetBackendCounters();
593     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
594     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
595     if (!SeenAllBackends(0, 2)) {
596       break;
597     }
598     EXPECT_LE(absl::Now(), deadline);
599     if (absl::Now() >= deadline) break;
600   }
601   // 1 backend should be ejected, trafficed picked up by another backend.
602   // No other backend should be ejected.
603   ResetBackendCounters();
604   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
605   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
606   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
607   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
608   size_t empty_load_backend_count = 0;
609   size_t double_load_backend_count = 0;
610   size_t regular_load_backend_count = 0;
611   for (size_t i = 0; i < backends_.size(); ++i) {
612     if (backends_[i]->backend_service()->request_count() == 0) {
613       ++empty_load_backend_count;
614     } else if (backends_[i]->backend_service()->request_count() == 200) {
615       ++double_load_backend_count;
616     } else if (backends_[i]->backend_service()->request_count() == 100) {
617       ++regular_load_backend_count;
618     } else {
619       GPR_ASSERT(1);
620     }
621   }
622   EXPECT_EQ(1, empty_load_backend_count);
623   EXPECT_EQ(1, double_load_backend_count);
624   EXPECT_EQ(2, regular_load_backend_count);
625 }
626 
627 // Failure percentage threshold is honored, a higher value would ensure ejection
628 // does not occur
TEST_P(OutlierDetectionTest,FailurePercentageThreshold)629 TEST_P(OutlierDetectionTest, FailurePercentageThreshold) {
630   CreateAndStartBackends(2);
631   auto cluster = default_cluster_;
632   cluster.set_lb_policy(Cluster::RING_HASH);
633   auto* outlier_detection = cluster.mutable_outlier_detection();
634   SetProtoDuration(grpc_core::Duration::Seconds(1),
635                    outlier_detection->mutable_interval());
636   SetProtoDuration(grpc_core::Duration::Seconds(1),
637                    outlier_detection->mutable_base_ejection_time());
638   // Setup outlier failure percentage parameter to 50
639   // Note this parameter is the only difference between this test and
640   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 0)
641   // and this one value changes means the difference between not ejecting in
642   // this test and ejecting in the other test.
643   outlier_detection->mutable_failure_percentage_threshold()->set_value(50);
644   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
645   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
646   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
647   balancer_->ads_service()->SetCdsResource(cluster);
648   auto new_route_config = default_route_config_;
649   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
650   auto* hash_policy = route->mutable_route()->add_hash_policy();
651   hash_policy->mutable_header()->set_header_name("address_hash");
652   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
653                                    new_route_config);
654   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
655   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
656   // Note each type of RPC will contain a header value that will always be
657   // hashed to a specific backend as the header value matches the value used
658   // to create the entry in the ring.
659   std::vector<std::pair<std::string, std::string>> metadata = {
660       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
661   std::vector<std::pair<std::string, std::string>> metadata1 = {
662       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
663   const auto rpc_options = RpcOptions().set_metadata(metadata);
664   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
665   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
666                  WaitForBackendOptions(), rpc_options);
667   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
668                  WaitForBackendOptions(), rpc_options1);
669   // Cause an error and wait for 1 outlier detection interval to pass to cause
670   // the backend to be ejected.
671   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
672                       RpcOptions()
673                           .set_metadata(std::move(metadata))
674                           .set_server_expected_error(StatusCode::CANCELLED));
675   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
676       3000 * grpc_test_slowdown_factor()));
677   ResetBackendCounters();
678   // 1 backend experenced 1 failure, but since the threshold is 50 % no
679   // backend will be noticed as an outlier so no ejection.
680   // Both backends are still getting the RPCs intended for them.
681   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
682   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
683   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
684   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
685 }
686 
687 // Failure percentage enforcement percentage is honored, setting it to 0 so
688 // guarantee the randomized number between 1 to 100 will always be great, so
689 // nothing will be ejected.
TEST_P(OutlierDetectionTest,FailurePercentageEnforcementPercentage)690 TEST_P(OutlierDetectionTest, FailurePercentageEnforcementPercentage) {
691   CreateAndStartBackends(2);
692   auto cluster = default_cluster_;
693   cluster.set_lb_policy(Cluster::RING_HASH);
694   auto* outlier_detection = cluster.mutable_outlier_detection();
695   SetProtoDuration(grpc_core::Duration::Seconds(1),
696                    outlier_detection->mutable_interval());
697   SetProtoDuration(grpc_core::Duration::Seconds(1),
698                    outlier_detection->mutable_base_ejection_time());
699   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
700   // Setting enforcing_success_rate to 0 to ensure we will never eject.
701   // Note this parameter is the only difference between this test and
702   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 100)
703   // and this one value changes means the difference between guaranteed not
704   // ejecting in this test and guaranteed ejecting in the other test.
705   outlier_detection->mutable_enforcing_failure_percentage()->set_value(0);
706   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
707   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
708   balancer_->ads_service()->SetCdsResource(cluster);
709   auto new_route_config = default_route_config_;
710   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
711   auto* hash_policy = route->mutable_route()->add_hash_policy();
712   hash_policy->mutable_header()->set_header_name("address_hash");
713   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
714                                    new_route_config);
715   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
716   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
717   // Note each type of RPC will contain a header value that will always be
718   // hashed to a specific backend as the header value matches the value used
719   // to create the entry in the ring.
720   std::vector<std::pair<std::string, std::string>> metadata = {
721       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
722   std::vector<std::pair<std::string, std::string>> metadata1 = {
723       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
724   const auto rpc_options = RpcOptions().set_metadata(metadata);
725   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
726   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
727                  WaitForBackendOptions(), rpc_options);
728   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
729                  WaitForBackendOptions(), rpc_options1);
730   // Cause an error and wait for 1 outlier detection interval to pass to cause
731   // the backend to be ejected.
732   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
733                       RpcOptions()
734                           .set_metadata(std::move(metadata))
735                           .set_server_expected_error(StatusCode::CANCELLED));
736   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
737       3000 * grpc_test_slowdown_factor()));
738   ResetBackendCounters();
739   // 1 backend experenced failure, but since the enforcement percentage is 0, no
740   // backend will be ejected.
741   // Both backends are still getting the RPCs intended for them.
742   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
743   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
744   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
745   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
746 }
747 
748 // Failure percentage does not eject if there are less than minimum_hosts
749 // backends Set success_rate_minimum_hosts to 3 when we only have 2 backends
TEST_P(OutlierDetectionTest,FailurePercentageMinimumHosts)750 TEST_P(OutlierDetectionTest, FailurePercentageMinimumHosts) {
751   CreateAndStartBackends(2);
752   auto cluster = default_cluster_;
753   cluster.set_lb_policy(Cluster::RING_HASH);
754   // Setup outlier failure percentage parameters.
755   // Any failure will cause an potential ejection with the probability of 100%
756   // (to eliminate flakiness of the test).
757   auto* outlier_detection = cluster.mutable_outlier_detection();
758   SetProtoDuration(grpc_core::Duration::Seconds(1),
759                    outlier_detection->mutable_interval());
760   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
761   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
762   // Set failure_percentage_minimum_hosts to 3 when we only have 2 backends
763   // Note this parameter is the only difference between this test and
764   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1)
765   // and this one value changes means the difference between not ejecting in
766   // this test and ejecting in the other test.
767   cluster.mutable_outlier_detection()
768       ->mutable_failure_percentage_minimum_hosts()
769       ->set_value(3);
770   cluster.mutable_outlier_detection()
771       ->mutable_failure_percentage_request_volume()
772       ->set_value(1);
773   balancer_->ads_service()->SetCdsResource(cluster);
774   auto new_route_config = default_route_config_;
775   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
776   auto* hash_policy = route->mutable_route()->add_hash_policy();
777   hash_policy->mutable_header()->set_header_name("address_hash");
778   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
779                                    new_route_config);
780   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
781   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
782   // Note each type of RPC will contain a header value that will always be
783   // hashed to a specific backend as the header value matches the value used
784   // to create the entry in the ring.
785   std::vector<std::pair<std::string, std::string>> metadata = {
786       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
787   std::vector<std::pair<std::string, std::string>> metadata1 = {
788       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
789   const auto rpc_options = RpcOptions().set_metadata(metadata);
790   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
791   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
792                  WaitForBackendOptions(), rpc_options);
793   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
794                  WaitForBackendOptions(), rpc_options1);
795   // Cause an error and wait for 1 outlier detection interval to pass to cause
796   // the backend to be ejected.
797   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
798                       RpcOptions()
799                           .set_metadata(std::move(metadata))
800                           .set_server_expected_error(StatusCode::CANCELLED));
801   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
802       3000 * grpc_test_slowdown_factor()));
803   ResetBackendCounters();
804   // All traffic still reaching the original backends and no backends are
805   // ejected.
806   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
807   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
808   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
809   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
810 }
811 
812 // Failure percentage does not eject if there are less than request_volume
813 // requests
814 // Set success_rate_request_volume to 4 when we only send 3 RPC in the
815 // interval.
TEST_P(OutlierDetectionTest,FailurePercentageRequestVolume)816 TEST_P(OutlierDetectionTest, FailurePercentageRequestVolume) {
817   CreateAndStartBackends(2);
818   auto cluster = default_cluster_;
819   cluster.set_lb_policy(Cluster::RING_HASH);
820   // Setup outlier failure percentage parameters.
821   // Any failure will cause an potential ejection with the probability of 100%
822   // (to eliminate flakiness of the test).
823   auto* outlier_detection = cluster.mutable_outlier_detection();
824   SetProtoDuration(grpc_core::Duration::Seconds(1),
825                    outlier_detection->mutable_interval());
826   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
827   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
828   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
829   // Set failure_percentage_request_volume to 4 when we only send 3 RPC in the
830   // interval.
831   // // Note this parameter is the only difference between this test and
832   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1)
833   // and this one value changes means the difference between not ejecting in
834   // this test and ejecting in the other test.
835   outlier_detection->mutable_failure_percentage_request_volume()->set_value(4);
836   balancer_->ads_service()->SetCdsResource(cluster);
837   auto new_route_config = default_route_config_;
838   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
839   auto* hash_policy = route->mutable_route()->add_hash_policy();
840   hash_policy->mutable_header()->set_header_name("address_hash");
841   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
842                                    new_route_config);
843   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
844   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
845   // Note each type of RPC will contain a header value that will always be
846   // hashed to a specific backend as the header value matches the value used
847   // to create the entry in the ring.
848   std::vector<std::pair<std::string, std::string>> metadata = {
849       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
850   std::vector<std::pair<std::string, std::string>> metadata1 = {
851       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
852   const auto rpc_options = RpcOptions().set_metadata(metadata);
853   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
854   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
855                  WaitForBackendOptions(), rpc_options);
856   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
857                  WaitForBackendOptions(), rpc_options1);
858   // Cause an error and wait for 1 outlier detection interval to pass to cause
859   // the backend to be ejected.
860   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
861                       RpcOptions()
862                           .set_metadata(std::move(metadata))
863                           .set_server_expected_error(StatusCode::CANCELLED));
864   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
865       3000 * grpc_test_slowdown_factor()));
866   ResetBackendCounters();
867   // All traffic still reaching the original backends and no backends are
868   // ejected.
869   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
870   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
871   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
872   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
873 }
874 
875 // Tests SuccessRate and FailurePercentage both configured
876 // Configure max_ejection_percent to 50% which means max 2/4 backends can be
877 // ejected.
878 // Configure success rate to eject 1 and failure percentage to eject 2.
879 // Verify that maximum 2 backends are ejected, not 3!
TEST_P(OutlierDetectionTest,SuccessRateAndFailurePercentage)880 TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentage) {
881   CreateAndStartBackends(4);
882   auto cluster = default_cluster_;
883   cluster.set_lb_policy(Cluster::RING_HASH);
884   // Setup outlier failure percentage parameters.
885   // Any failure will cause an potential ejection with the probability of 100%
886   // (to eliminate flakiness of the test).
887   auto* outlier_detection = cluster.mutable_outlier_detection();
888   SetProtoDuration(grpc_core::Duration::Seconds(1),
889                    outlier_detection->mutable_interval());
890   outlier_detection->mutable_max_ejection_percent()->set_value(50);
891   // This stdev of 500 will ensure the number of ok RPC and error RPC we send
892   // will make 1 outlier out of the 4 backends.
893   outlier_detection->mutable_success_rate_stdev_factor()->set_value(500);
894   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
895   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
896   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
897   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
898   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
899   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
900   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
901   balancer_->ads_service()->SetCdsResource(cluster);
902   auto new_route_config = default_route_config_;
903   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
904   auto* hash_policy = route->mutable_route()->add_hash_policy();
905   hash_policy->mutable_header()->set_header_name("address_hash");
906   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
907                                    new_route_config);
908   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
909   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
910   // Note each type of RPC will contain a header value that will always be
911   // hashed to a specific backend as the header value matches the value used
912   // to create the entry in the ring.
913   std::vector<std::pair<std::string, std::string>> metadata = {
914       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
915   std::vector<std::pair<std::string, std::string>> metadata1 = {
916       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
917   std::vector<std::pair<std::string, std::string>> metadata2 = {
918       {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
919   std::vector<std::pair<std::string, std::string>> metadata3 = {
920       {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
921   const auto rpc_options = RpcOptions().set_metadata(metadata);
922   const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
923   const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
924   const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
925   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
926                  WaitForBackendOptions(), rpc_options);
927   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
928                  WaitForBackendOptions(), rpc_options1);
929   WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
930                  WaitForBackendOptions(), rpc_options2);
931   WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
932                  WaitForBackendOptions(), rpc_options3);
933   // Cause 2 errors on 1 backend and 1 error on 2 backends and wait for 2
934   // backends to be ejected. The 2 errors to the 1 backend will make exactly 1
935   // outlier from the success rate algorithm; all 4 errors will make 3 outliers
936   // from the failure pecentage algorithm because the threahold is set to 0. I
937   // have verified through debug logs we eject 1 backend because of success
938   // rate, 1 backend because of failure percentage; but as we attempt to eject
939   // another backend because of failure percentage we will stop as we have
940   // reached our 50% limit.
941   CheckRpcSendFailure(
942       DEBUG_LOCATION, StatusCode::CANCELLED, "",
943       RpcOptions().set_metadata(metadata).set_server_expected_error(
944           StatusCode::CANCELLED));
945   CheckRpcSendFailure(
946       DEBUG_LOCATION, StatusCode::CANCELLED, "",
947       RpcOptions().set_metadata(metadata).set_server_expected_error(
948           StatusCode::CANCELLED));
949   CheckRpcSendFailure(
950       DEBUG_LOCATION, StatusCode::CANCELLED, "",
951       RpcOptions().set_metadata(metadata1).set_server_expected_error(
952           StatusCode::CANCELLED));
953   CheckRpcSendFailure(
954       DEBUG_LOCATION, StatusCode::CANCELLED, "",
955       RpcOptions().set_metadata(metadata2).set_server_expected_error(
956           StatusCode::CANCELLED));
957   absl::Time deadline =
958       absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
959   std::vector<size_t> idx = {0, 1, 2, 3};
960   while (true) {
961     ResetBackendCounters();
962     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
963     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
964     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options2);
965     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options3);
966     if (std::count_if(idx.begin(), idx.end(),
967                       [this](size_t i) { return SeenBackend(i); }) == 2) {
968       break;
969     }
970     EXPECT_LE(absl::Now(), deadline);
971     if (absl::Now() >= deadline) break;
972   }
973   ResetBackendCounters();
974   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
975   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
976   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
977   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
978   size_t empty_load_backend_count = 0;
979   size_t double_load_backend_count = 0;
980   for (size_t i = 0; i < backends_.size(); ++i) {
981     if (backends_[i]->backend_service()->request_count() == 0) {
982       ++empty_load_backend_count;
983     } else if (backends_[i]->backend_service()->request_count() >= 100) {
984       // The extra load could go to 2 remaining backends or just 1 of them.
985       ++double_load_backend_count;
986     } else if (backends_[i]->backend_service()->request_count() > 300) {
987       GPR_ASSERT(1);
988     }
989   }
990   EXPECT_EQ(2, empty_load_backend_count);
991   EXPECT_EQ(2, double_load_backend_count);
992 }
993 
994 // Tests SuccessRate and FailurePercentage both unconfigured;
995 // This is the case where according to the gRFC we need to instruct the picker
996 // not to do counting or even start the timer. The result of not counting is
997 // that there will be no ejection taking place since we can't do any
998 // calculations.
TEST_P(OutlierDetectionTest,SuccessRateAndFailurePercentageBothDisabled)999 TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentageBothDisabled) {
1000   CreateAndStartBackends(2);
1001   auto cluster = default_cluster_;
1002   cluster.set_lb_policy(Cluster::RING_HASH);
1003   auto* outlier_detection = cluster.mutable_outlier_detection();
1004   SetProtoDuration(grpc_core::Duration::Seconds(1),
1005                    outlier_detection->mutable_interval());
1006   SetProtoDuration(grpc_core::Duration::Seconds(1),
1007                    outlier_detection->mutable_base_ejection_time());
1008   balancer_->ads_service()->SetCdsResource(cluster);
1009   auto new_route_config = default_route_config_;
1010   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
1011   auto* hash_policy = route->mutable_route()->add_hash_policy();
1012   hash_policy->mutable_header()->set_header_name("address_hash");
1013   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
1014                                    new_route_config);
1015   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
1016   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
1017   // Note each type of RPC will contain a header value that will always be
1018   // hashed to a specific backend as the header value matches the value used
1019   // to create the entry in the ring.
1020   std::vector<std::pair<std::string, std::string>> metadata = {
1021       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
1022   std::vector<std::pair<std::string, std::string>> metadata1 = {
1023       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
1024   const auto rpc_options = RpcOptions().set_metadata(metadata);
1025   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
1026   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
1027                  WaitForBackendOptions(), rpc_options);
1028   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1029                  WaitForBackendOptions(), rpc_options1);
1030   // Cause an error and wait for 1 outlier detection interval to pass
1031   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
1032                       RpcOptions()
1033                           .set_metadata(std::move(metadata))
1034                           .set_server_expected_error(StatusCode::CANCELLED));
1035   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
1036       3000 * grpc_test_slowdown_factor()));
1037   ResetBackendCounters();
1038   // 1 backend experenced failure, but since there is no counting there is no
1039   // ejection.  Both backends are still getting the RPCs intended for them.
1040   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
1041   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
1042   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
1043   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
1044 }
1045 
1046 // Tests that we uneject any ejected addresses when the OD policy is
1047 // disabled.
TEST_P(OutlierDetectionTest,DisableOutlierDetectionWhileAddressesAreEjected)1048 TEST_P(OutlierDetectionTest, DisableOutlierDetectionWhileAddressesAreEjected) {
1049   CreateAndStartBackends(2);
1050   auto cluster = default_cluster_;
1051   cluster.set_lb_policy(Cluster::RING_HASH);
1052   // Setup outlier failure percentage parameters.
1053   // Any failure will cause an potential ejection with the probability of 100%
1054   // (to eliminate flakiness of the test).
1055   auto* outlier_detection = cluster.mutable_outlier_detection();
1056   SetProtoDuration(grpc_core::Duration::Seconds(1),
1057                    outlier_detection->mutable_interval());
1058   SetProtoDuration(grpc_core::Duration::Seconds(3),
1059                    outlier_detection->mutable_base_ejection_time());
1060   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
1061   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
1062   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
1063   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
1064   balancer_->ads_service()->SetCdsResource(cluster);
1065   auto new_route_config = default_route_config_;
1066   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
1067   auto* hash_policy = route->mutable_route()->add_hash_policy();
1068   hash_policy->mutable_header()->set_header_name("address_hash");
1069   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
1070                                    new_route_config);
1071   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
1072   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
1073   // Note each type of RPC will contain a header value that will always be
1074   // hashed to a specific backend as the header value matches the value used
1075   // to create the entry in the ring.
1076   std::vector<std::pair<std::string, std::string>> metadata = {
1077       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
1078   std::vector<std::pair<std::string, std::string>> metadata1 = {
1079       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
1080   const auto rpc_options = RpcOptions().set_metadata(metadata);
1081   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
1082   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
1083                  WaitForBackendOptions(), rpc_options);
1084   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1085                  WaitForBackendOptions(), rpc_options1);
1086   // Cause an error and wait for traffic aimed at backend 0 to start going to
1087   // backend 1.
1088   CheckRpcSendFailure(
1089       DEBUG_LOCATION, StatusCode::CANCELLED, "",
1090       RpcOptions().set_metadata(metadata).set_server_expected_error(
1091           StatusCode::CANCELLED));
1092   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1093                  WaitForBackendOptions().set_timeout_ms(
1094                      3000 * grpc_test_slowdown_factor()),
1095                  rpc_options);
1096   // 1 backend is ejected all traffic going to the ejected backend should now
1097   // all be going to the other backend.
1098   // failure percentage enforcement_percentage of 100% is honored as this test
1099   // will consistently reject 1 backend.
1100   CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
1101   EXPECT_EQ(1, backends_[1]->backend_service()->request_count());
1102   // Send an update that disables outlier detection.
1103   cluster.clear_outlier_detection();
1104   balancer_->ads_service()->SetCdsResource(cluster);
1105   // Wait for the backend to start being used again.
1106   WaitForBackend(
1107       DEBUG_LOCATION, 0,
1108       [](const RpcResult& result) {
1109         EXPECT_EQ(result.status.error_code(), StatusCode::CANCELLED)
1110             << "Error: " << result.status.error_message();
1111       },
1112       WaitForBackendOptions(),
1113       RpcOptions()
1114           .set_metadata(std::move(metadata))
1115           .set_server_expected_error(StatusCode::CANCELLED));
1116 }
1117 
// Verifies that a backend ejected by outlier detection stays ejected after
// an EDS update moves it from one priority to another.
TEST_P(OutlierDetectionTest, EjectionRetainedAcrossPriorities) {
  CreateAndStartBackends(3);
  // Failure-percentage ejection with a threshold of 0 and an enforcement
  // percentage of 100, so any failure leads to an ejection (this keeps the
  // test from being flaky).  The base ejection time is 10 minutes.
  auto od_cluster = default_cluster_;
  auto* od_config = od_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Minutes(10),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(od_cluster);
  // Priority 0: backend 0 and a non-existent backend.
  // Priority 1: backend 1.
  const EdsResourceArgs initial_eds_args({
      {"locality0", {CreateEndpoint(0), MakeNonExistantEndpoint()}},
      {"locality1", {CreateEndpoint(1)}, kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(initial_eds_args));
  WaitForBackend(DEBUG_LOCATION, 0);
  // Make one RPC to backend 0 fail.  With the failure percentage enforcement
  // at 100%, backend 0 gets ejected the next time the ejection timer fires.
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_server_expected_error(StatusCode::CANCELLED));
  // Once RPCs start reaching backend 1 we know backend 0 has been ejected.
  // This should take no more than one ejection timer interval.
  const auto ejection_timeout_ms = 3000 * grpc_test_slowdown_factor();
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions().set_timeout_ms(ejection_timeout_ms));
  // EDS update that moves backend 0 into priority 1.  Backend 2 is added as
  // well so that seeing it tells us the client has processed the update.
  const EdsResourceArgs updated_eds_args({
      {"locality0", {MakeNonExistantEndpoint()}},
      {"locality1", CreateEndpointsForBackends(), kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(updated_eds_args));
  WaitForBackend(DEBUG_LOCATION, 2);
  // Backend 0 should still be ejected, so 100 RPCs must be split between
  // backends 1 and 2 only.
  CheckRpcSendOk(DEBUG_LOCATION, 100);
  EXPECT_EQ(0, backends_[0]->backend_service()->request_count());
  EXPECT_EQ(50, backends_[1]->backend_service()->request_count());
  EXPECT_EQ(50, backends_[2]->backend_service()->request_count());
}
1169 
1170 }  // namespace
1171 }  // namespace testing
1172 }  // namespace grpc
1173 
main(int argc,char ** argv)1174 int main(int argc, char** argv) {
1175   grpc::testing::TestEnvironment env(&argc, argv);
1176   ::testing::InitGoogleTest(&argc, argv);
1177   // Make the backup poller poll very frequently in order to pick up
1178   // updates from all the subchannels's FDs.
1179   grpc_core::ConfigVars::Overrides overrides;
1180   overrides.client_channel_backup_poll_interval_ms = 1;
1181   grpc_core::ConfigVars::SetOverrides(overrides);
1182 #if TARGET_OS_IPHONE
1183   // Workaround Apple CFStream bug
1184   grpc_core::SetEnv("grpc_cfstream", "0");
1185 #endif
1186   grpc_init();
1187   const auto result = RUN_ALL_TESTS();
1188   grpc_shutdown();
1189   return result;
1190 }
1191