1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "testing/base/public/benchmark.h"
16 #include "gmock/gmock.h"
17 #include "third_party/absl/flags/flag.h"
18 #include "icing/document-builder.h"
19 #include "icing/feature-flags.h"
20 #include "icing/file/filesystem.h"
21 #include "icing/proto/schema.pb.h"
22 #include "icing/proto/search.pb.h"
23 #include "icing/result/snippet-retriever.h"
24 #include "icing/schema-builder.h"
25 #include "icing/schema/schema-store.h"
26 #include "icing/schema/section.h"
27 #include "icing/testing/common-matchers.h"
28 #include "icing/testing/random-string.h"
29 #include "icing/testing/test-data.h"
30 #include "icing/testing/test-feature-flags.h"
31 #include "icing/testing/tmp-directory.h"
32 #include "icing/tokenization/language-segmenter-factory.h"
33 #include "icing/transform/normalizer-factory.h"
34 #include "icing/util/clock.h"
35 #include "icing/util/icu-data-file-helper.h"
36 #include "icing/util/logging.h"
37 #include "unicode/uloc.h"
38
39 // Run on a Linux workstation:
40 // $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
41 // //icing/result:snippet-retriever_benchmark
42 //
43 // $ blaze-bin/icing/result/snippet-retriever_benchmark
44 // --benchmark_filter=all
45 //
46 // Run on an Android device:
47 // Make target //icing/tokenization:language-segmenter depend on
48 // //third_party/icu
49 //
50 // Make target //icing/transform:normalizer depend on
51 // //third_party/icu
52 //
53 // $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
54 // --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
55 // //icing/result:snippet-retriever_benchmark
56 //
57 // $ adb push blaze-bin/icing/result/snippet-retriever_benchmark
58 // /data/local/tmp/
59 //
60 // $ adb shell /data/local/tmp/snippet-retriever_benchmark
61 // --benchmark_filter=all --adb
62
// Flag to tell the benchmark that it will be run on an Android device via adb,
// so that the benchmark can set up its data files accordingly.
65 ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
66
67 namespace icing {
68 namespace lib {
69
70 namespace {
71
72 using ::testing::SizeIs;
73
BM_SnippetOneProperty(benchmark::State & state)74 void BM_SnippetOneProperty(benchmark::State& state) {
75 bool run_via_adb = absl::GetFlag(FLAGS_adb);
76 if (!run_via_adb) {
77 ICING_ASSERT_OK(icu_data_file_helper::SetUpIcuDataFile(
78 GetTestFilePath("icing/icu.dat")));
79 }
80
81 FeatureFlags feature_flags = GetTestFeatureFlags();
82 const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
83 const std::string schema_dir = base_dir + "/schema";
84 Filesystem filesystem;
85 filesystem.DeleteDirectoryRecursively(base_dir.c_str());
86 if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
87 ICING_LOG(ERROR) << "Failed to create test directories";
88 }
89
90 language_segmenter_factory::SegmenterOptions options(ULOC_US);
91 std::unique_ptr<LanguageSegmenter> language_segmenter =
92 language_segmenter_factory::Create(std::move(options)).ValueOrDie();
93 std::unique_ptr<Normalizer> normalizer =
94 normalizer_factory::Create(
95 /*max_term_byte_size=*/std::numeric_limits<int>::max())
96 .ValueOrDie();
97
98 SchemaProto schema =
99 SchemaBuilder()
100 .AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
101 PropertyConfigBuilder()
102 .SetName("prop1")
103 .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
104 .SetCardinality(CARDINALITY_OPTIONAL)))
105 .Build();
106 Clock clock;
107 ICING_ASSERT_OK_AND_ASSIGN(
108 std::unique_ptr<SchemaStore> schema_store,
109 SchemaStore::Create(&filesystem, schema_dir, &clock, &feature_flags));
110 ICING_ASSERT_OK(schema_store->SetSchema(
111 schema, /*ignore_errors_and_delete_documents=*/false,
112 /*allow_circular_schema_definitions=*/false));
113
114 auto snippet_retriever =
115 SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
116 normalizer.get())
117 .ValueOrDie();
118
119 int num_matches = state.range(0);
120 int total_terms = state.range(1);
121
122 std::default_random_engine random;
123 std::vector<std::string> language =
124 CreateLanguages(/*language_size=*/1000, &random);
125 std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
126 std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
127
128 std::string text;
129 int num_actual_matches = 0;
130 double match_chance;
131 while (total_terms-- > 0) {
132 std::string term;
133 match_chance = static_cast<double>(num_matches) / total_terms;
134 if (uniform_double(random) <= match_chance) {
135 --num_matches;
136 ++num_actual_matches;
137 term = "foo";
138 } else {
139 term = language.at(uniform(random));
140 }
141 absl_ports::StrAppend(&text, " ", term);
142 }
143 DocumentProto document = DocumentBuilder()
144 .SetKey("icing", "uri1")
145 .SetSchema("type1")
146 .AddStringProperty("prop1", text)
147 .Build();
148 SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
149 ResultSpecProto::SnippetSpecProto snippet_spec;
150 snippet_spec.set_num_to_snippet(100000);
151 snippet_spec.set_num_matches_per_property(100000);
152 snippet_spec.set_max_window_utf32_length(64);
153
154 SectionIdMask section_id_mask = 0x01;
155 SnippetProto snippet_proto;
156 for (auto _ : state) {
157 snippet_proto = snippet_retriever->RetrieveSnippet(
158 query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
159 section_id_mask);
160 ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
161 ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
162 SizeIs(num_actual_matches));
163 }
164
165 // Destroy the schema store before the whole directory is removed because they
166 // persist data in destructor.
167 schema_store.reset();
168 filesystem.DeleteDirectoryRecursively(base_dir.c_str());
169 }
170 BENCHMARK(BM_SnippetOneProperty)
171 // Arguments: num_matches, total_terms
172 ->ArgPair(1, 1)
173 ->ArgPair(1, 16) // single match
174 ->ArgPair(2, 16) // ~10% matches
175 ->ArgPair(3, 16) // ~20% matches
176 ->ArgPair(8, 16) // 50% matches
177 ->ArgPair(16, 16) // 100% matches
178 ->ArgPair(1, 128) // single match
179 ->ArgPair(13, 128) // ~10% matches
180 ->ArgPair(26, 128) // ~20% matches
181 ->ArgPair(64, 128) // 50% matches
182 ->ArgPair(128, 128) // 100% matches
183 ->ArgPair(1, 512) // single match
184 ->ArgPair(51, 512) // ~10% matches
185 ->ArgPair(102, 512) // ~20% matches
186 ->ArgPair(256, 512) // 50% matches
187 ->ArgPair(512, 512) // 100% matches
188 ->ArgPair(1, 1024) // single match
189 ->ArgPair(102, 1024) // ~10% matches
190 ->ArgPair(205, 1024) // ~20% matches
191 ->ArgPair(512, 1024) // 50% matches
192 ->ArgPair(1024, 1024) // 100% matches
193 ->ArgPair(1, 4096) // single match
194 ->ArgPair(410, 4096) // ~10% matches
195 ->ArgPair(819, 4096) // ~20% matches
196 ->ArgPair(2048, 4096) // 50% matches
197 ->ArgPair(4096, 4096) // 100% matches
198 ->ArgPair(1, 16384) // single match
199 ->ArgPair(1638, 16384) // ~10% matches
200 ->ArgPair(3277, 16384) // ~20% matches
201 ->ArgPair(8192, 16384) // 50% matches
202 ->ArgPair(16384, 16384); // 100% matches
203
BM_SnippetRfcOneProperty(benchmark::State & state)204 void BM_SnippetRfcOneProperty(benchmark::State& state) {
205 bool run_via_adb = absl::GetFlag(FLAGS_adb);
206 if (!run_via_adb) {
207 ICING_ASSERT_OK(icu_data_file_helper::SetUpIcuDataFile(
208 GetTestFilePath("icing/icu.dat")));
209 }
210
211 FeatureFlags feature_flags = GetTestFeatureFlags();
212 const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
213 const std::string schema_dir = base_dir + "/schema";
214 Filesystem filesystem;
215 filesystem.DeleteDirectoryRecursively(base_dir.c_str());
216 if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
217 ICING_LOG(ERROR) << "Failed to create test directories";
218 }
219
220 language_segmenter_factory::SegmenterOptions options(ULOC_US);
221 std::unique_ptr<LanguageSegmenter> language_segmenter =
222 language_segmenter_factory::Create(std::move(options)).ValueOrDie();
223 std::unique_ptr<Normalizer> normalizer =
224 normalizer_factory::Create(
225 /*max_term_byte_size=*/std::numeric_limits<int>::max())
226 .ValueOrDie();
227
228 SchemaProto schema =
229 SchemaBuilder()
230 .AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
231 PropertyConfigBuilder()
232 .SetName("prop1")
233 .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
234 .SetCardinality(CARDINALITY_OPTIONAL)))
235 .Build();
236 Clock clock;
237 ICING_ASSERT_OK_AND_ASSIGN(
238 std::unique_ptr<SchemaStore> schema_store,
239 SchemaStore::Create(&filesystem, schema_dir, &clock, &feature_flags));
240 ICING_ASSERT_OK(schema_store->SetSchema(
241 schema, /*ignore_errors_and_delete_documents=*/false,
242 /*allow_circular_schema_definitions=*/false));
243
244 auto snippet_retriever =
245 SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
246 normalizer.get())
247 .ValueOrDie();
248
249 int num_matches = state.range(0);
250 int total_terms = state.range(1);
251
252 std::default_random_engine random;
253 std::vector<std::string> language =
254 CreateLanguages(/*language_size=*/1000, &random);
255 std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
256 std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
257
258 std::string text;
259 int num_actual_matches = 0;
260 double match_chance;
261 while (total_terms-- > 0) {
262 std::string term;
263 match_chance = static_cast<double>(num_matches) / total_terms;
264 if (uniform_double(random) <= match_chance) {
265 --num_matches;
266 ++num_actual_matches;
267 term = "[email protected]";
268 } else {
269 term = absl_ports::StrCat(language.at(uniform(random)), "@google.com");
270 }
271 absl_ports::StrAppend(&text, ",", term);
272 }
273 DocumentProto document = DocumentBuilder()
274 .SetKey("icing", "uri1")
275 .SetSchema("type1")
276 .AddStringProperty("prop1", text)
277 .Build();
278 SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
279 ResultSpecProto::SnippetSpecProto snippet_spec;
280 snippet_spec.set_num_to_snippet(100000);
281 snippet_spec.set_num_matches_per_property(100000);
282 snippet_spec.set_max_window_utf32_length(64);
283
284 SectionIdMask section_id_mask = 0x01;
285 SnippetProto snippet_proto;
286 for (auto _ : state) {
287 snippet_proto = snippet_retriever->RetrieveSnippet(
288 query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
289 section_id_mask);
290 ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
291 ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
292 SizeIs(num_actual_matches));
293 }
294
295 // Destroy the schema store before the whole directory is removed because they
296 // persist data in destructor.
297 schema_store.reset();
298 filesystem.DeleteDirectoryRecursively(base_dir.c_str());
299 }
300 BENCHMARK(BM_SnippetRfcOneProperty)
301 // Arguments: num_matches, total_terms
302 ->ArgPair(1, 1)
303 ->ArgPair(1, 16) // single match
304 ->ArgPair(2, 16) // ~10% matches
305 ->ArgPair(3, 16) // ~20% matches
306 ->ArgPair(8, 16) // 50% matches
307 ->ArgPair(16, 16) // 100% matches
308 ->ArgPair(1, 128) // single match
309 ->ArgPair(13, 128) // ~10% matches
310 ->ArgPair(26, 128) // ~20% matches
311 ->ArgPair(64, 128) // 50% matches
312 ->ArgPair(128, 128) // 100% matches
313 ->ArgPair(1, 512) // single match
314 ->ArgPair(51, 512) // ~10% matches
315 ->ArgPair(102, 512) // ~20% matches
316 ->ArgPair(256, 512) // 50% matches
317 ->ArgPair(512, 512) // 100% matches
318 ->ArgPair(1, 1024) // single match
319 ->ArgPair(102, 1024) // ~10% matches
320 ->ArgPair(205, 1024) // ~20% matches
321 ->ArgPair(512, 1024) // 50% matches
322 ->ArgPair(1024, 1024) // 100% matches
323 ->ArgPair(1, 4096) // single match
324 ->ArgPair(410, 4096) // ~10% matches
325 ->ArgPair(819, 4096) // ~20% matches
326 ->ArgPair(2048, 4096) // 50% matches
327 ->ArgPair(4096, 4096) // 100% matches
328 ->ArgPair(1, 16384) // single match
329 ->ArgPair(1638, 16384) // ~10% matches
330 ->ArgPair(3277, 16384) // ~20% matches
331 ->ArgPair(8192, 16384) // 50% matches
332 ->ArgPair(16384, 16384); // 100% matches
333
334 } // namespace
335
336 } // namespace lib
337 } // namespace icing
338