xref: /aosp_15_r20/external/icing/icing/result/snippet-retriever_benchmark.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "testing/base/public/benchmark.h"
16 #include "gmock/gmock.h"
17 #include "third_party/absl/flags/flag.h"
18 #include "icing/document-builder.h"
19 #include "icing/feature-flags.h"
20 #include "icing/file/filesystem.h"
21 #include "icing/proto/schema.pb.h"
22 #include "icing/proto/search.pb.h"
23 #include "icing/result/snippet-retriever.h"
24 #include "icing/schema-builder.h"
25 #include "icing/schema/schema-store.h"
26 #include "icing/schema/section.h"
27 #include "icing/testing/common-matchers.h"
28 #include "icing/testing/random-string.h"
29 #include "icing/testing/test-data.h"
30 #include "icing/testing/test-feature-flags.h"
31 #include "icing/testing/tmp-directory.h"
32 #include "icing/tokenization/language-segmenter-factory.h"
33 #include "icing/transform/normalizer-factory.h"
34 #include "icing/util/clock.h"
35 #include "icing/util/icu-data-file-helper.h"
36 #include "icing/util/logging.h"
37 #include "unicode/uloc.h"
38 
39 // Run on a Linux workstation:
40 //    $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
41 //    //icing/result:snippet-retriever_benchmark
42 //
43 //    $ blaze-bin/icing/result/snippet-retriever_benchmark
44 //    --benchmark_filter=all
45 //
46 // Run on an Android device:
47 //    Make target //icing/tokenization:language-segmenter depend on
48 //    //third_party/icu
49 //
50 //    Make target //icing/transform:normalizer depend on
51 //    //third_party/icu
52 //
53 //    $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
54 //    --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
55 //    //icing/result:snippet-retriever_benchmark
56 //
57 //    $ adb push blaze-bin/icing/result/snippet-retriever_benchmark
58 //    /data/local/tmp/
59 //
60 //    $ adb shell /data/local/tmp/snippet-retriever_benchmark
61 //    --benchmark_filter=all --adb
62 
// Flag to tell the benchmark that it'll be run on an Android device via adb,
// the benchmark will set up data files accordingly: when this flag is set, the
// benchmarks in this file skip loading the ICU data file from the test data
// path ("icing/icu.dat").
ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
66 
67 namespace icing {
68 namespace lib {
69 
70 namespace {
71 
72 using ::testing::SizeIs;
73 
BM_SnippetOneProperty(benchmark::State & state)74 void BM_SnippetOneProperty(benchmark::State& state) {
75   bool run_via_adb = absl::GetFlag(FLAGS_adb);
76   if (!run_via_adb) {
77     ICING_ASSERT_OK(icu_data_file_helper::SetUpIcuDataFile(
78         GetTestFilePath("icing/icu.dat")));
79   }
80 
81   FeatureFlags feature_flags = GetTestFeatureFlags();
82   const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
83   const std::string schema_dir = base_dir + "/schema";
84   Filesystem filesystem;
85   filesystem.DeleteDirectoryRecursively(base_dir.c_str());
86   if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
87     ICING_LOG(ERROR) << "Failed to create test directories";
88   }
89 
90   language_segmenter_factory::SegmenterOptions options(ULOC_US);
91   std::unique_ptr<LanguageSegmenter> language_segmenter =
92       language_segmenter_factory::Create(std::move(options)).ValueOrDie();
93   std::unique_ptr<Normalizer> normalizer =
94       normalizer_factory::Create(
95           /*max_term_byte_size=*/std::numeric_limits<int>::max())
96           .ValueOrDie();
97 
98   SchemaProto schema =
99       SchemaBuilder()
100           .AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
101               PropertyConfigBuilder()
102                   .SetName("prop1")
103                   .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
104                   .SetCardinality(CARDINALITY_OPTIONAL)))
105           .Build();
106   Clock clock;
107   ICING_ASSERT_OK_AND_ASSIGN(
108       std::unique_ptr<SchemaStore> schema_store,
109       SchemaStore::Create(&filesystem, schema_dir, &clock, &feature_flags));
110   ICING_ASSERT_OK(schema_store->SetSchema(
111       schema, /*ignore_errors_and_delete_documents=*/false,
112       /*allow_circular_schema_definitions=*/false));
113 
114   auto snippet_retriever =
115       SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
116                                normalizer.get())
117           .ValueOrDie();
118 
119   int num_matches = state.range(0);
120   int total_terms = state.range(1);
121 
122   std::default_random_engine random;
123   std::vector<std::string> language =
124       CreateLanguages(/*language_size=*/1000, &random);
125   std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
126   std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
127 
128   std::string text;
129   int num_actual_matches = 0;
130   double match_chance;
131   while (total_terms-- > 0) {
132     std::string term;
133     match_chance = static_cast<double>(num_matches) / total_terms;
134     if (uniform_double(random) <= match_chance) {
135       --num_matches;
136       ++num_actual_matches;
137       term = "foo";
138     } else {
139       term = language.at(uniform(random));
140     }
141     absl_ports::StrAppend(&text, " ", term);
142   }
143   DocumentProto document = DocumentBuilder()
144                                .SetKey("icing", "uri1")
145                                .SetSchema("type1")
146                                .AddStringProperty("prop1", text)
147                                .Build();
148   SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
149   ResultSpecProto::SnippetSpecProto snippet_spec;
150   snippet_spec.set_num_to_snippet(100000);
151   snippet_spec.set_num_matches_per_property(100000);
152   snippet_spec.set_max_window_utf32_length(64);
153 
154   SectionIdMask section_id_mask = 0x01;
155   SnippetProto snippet_proto;
156   for (auto _ : state) {
157     snippet_proto = snippet_retriever->RetrieveSnippet(
158         query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
159         section_id_mask);
160     ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
161     ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
162                 SizeIs(num_actual_matches));
163   }
164 
165   // Destroy the schema store before the whole directory is removed because they
166   // persist data in destructor.
167   schema_store.reset();
168   filesystem.DeleteDirectoryRecursively(base_dir.c_str());
169 }
// Registers BM_SnippetOneProperty over a grid of (num_matches, total_terms)
// pairs. For each document size the match share grows from a single match up
// to every term matching; the percentages in the trailing comments are
// approximate.
BENCHMARK(BM_SnippetOneProperty)
    // Arguments: num_matches, total_terms
    ->ArgPair(1, 1)
    ->ArgPair(1, 16)          // single match
    ->ArgPair(2, 16)          // ~10% matches
    ->ArgPair(3, 16)          // ~20% matches
    ->ArgPair(8, 16)          // 50% matches
    ->ArgPair(16, 16)         // 100% matches
    ->ArgPair(1, 128)         // single match
    ->ArgPair(13, 128)        // ~10% matches
    ->ArgPair(26, 128)        // ~20% matches
    ->ArgPair(64, 128)        // 50% matches
    ->ArgPair(128, 128)       // 100% matches
    ->ArgPair(1, 512)         // single match
    ->ArgPair(51, 512)        // ~10% matches
    ->ArgPair(102, 512)       // ~20% matches
    ->ArgPair(256, 512)       // 50% matches
    ->ArgPair(512, 512)       // 100% matches
    ->ArgPair(1, 1024)        // single match
    ->ArgPair(102, 1024)      // ~10% matches
    ->ArgPair(205, 1024)      // ~20% matches
    ->ArgPair(512, 1024)      // 50% matches
    ->ArgPair(1024, 1024)     // 100% matches
    ->ArgPair(1, 4096)        // single match
    ->ArgPair(410, 4096)      // ~10% matches
    ->ArgPair(819, 4096)      // ~20% matches
    ->ArgPair(2048, 4096)     // 50% matches
    ->ArgPair(4096, 4096)     // 100% matches
    ->ArgPair(1, 16384)       // single match
    ->ArgPair(1638, 16384)    // ~10% matches
    ->ArgPair(3277, 16384)    // ~20% matches
    ->ArgPair(8192, 16384)    // 50% matches
    ->ArgPair(16384, 16384);  // 100% matches
203 
BM_SnippetRfcOneProperty(benchmark::State & state)204 void BM_SnippetRfcOneProperty(benchmark::State& state) {
205   bool run_via_adb = absl::GetFlag(FLAGS_adb);
206   if (!run_via_adb) {
207     ICING_ASSERT_OK(icu_data_file_helper::SetUpIcuDataFile(
208         GetTestFilePath("icing/icu.dat")));
209   }
210 
211   FeatureFlags feature_flags = GetTestFeatureFlags();
212   const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
213   const std::string schema_dir = base_dir + "/schema";
214   Filesystem filesystem;
215   filesystem.DeleteDirectoryRecursively(base_dir.c_str());
216   if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
217     ICING_LOG(ERROR) << "Failed to create test directories";
218   }
219 
220   language_segmenter_factory::SegmenterOptions options(ULOC_US);
221   std::unique_ptr<LanguageSegmenter> language_segmenter =
222       language_segmenter_factory::Create(std::move(options)).ValueOrDie();
223   std::unique_ptr<Normalizer> normalizer =
224       normalizer_factory::Create(
225           /*max_term_byte_size=*/std::numeric_limits<int>::max())
226           .ValueOrDie();
227 
228   SchemaProto schema =
229       SchemaBuilder()
230           .AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
231               PropertyConfigBuilder()
232                   .SetName("prop1")
233                   .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
234                   .SetCardinality(CARDINALITY_OPTIONAL)))
235           .Build();
236   Clock clock;
237   ICING_ASSERT_OK_AND_ASSIGN(
238       std::unique_ptr<SchemaStore> schema_store,
239       SchemaStore::Create(&filesystem, schema_dir, &clock, &feature_flags));
240   ICING_ASSERT_OK(schema_store->SetSchema(
241       schema, /*ignore_errors_and_delete_documents=*/false,
242       /*allow_circular_schema_definitions=*/false));
243 
244   auto snippet_retriever =
245       SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
246                                normalizer.get())
247           .ValueOrDie();
248 
249   int num_matches = state.range(0);
250   int total_terms = state.range(1);
251 
252   std::default_random_engine random;
253   std::vector<std::string> language =
254       CreateLanguages(/*language_size=*/1000, &random);
255   std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
256   std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
257 
258   std::string text;
259   int num_actual_matches = 0;
260   double match_chance;
261   while (total_terms-- > 0) {
262     std::string term;
263     match_chance = static_cast<double>(num_matches) / total_terms;
264     if (uniform_double(random) <= match_chance) {
265       --num_matches;
266       ++num_actual_matches;
267       term = "[email protected]";
268     } else {
269       term = absl_ports::StrCat(language.at(uniform(random)), "@google.com");
270     }
271     absl_ports::StrAppend(&text, ",", term);
272   }
273   DocumentProto document = DocumentBuilder()
274                                .SetKey("icing", "uri1")
275                                .SetSchema("type1")
276                                .AddStringProperty("prop1", text)
277                                .Build();
278   SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
279   ResultSpecProto::SnippetSpecProto snippet_spec;
280   snippet_spec.set_num_to_snippet(100000);
281   snippet_spec.set_num_matches_per_property(100000);
282   snippet_spec.set_max_window_utf32_length(64);
283 
284   SectionIdMask section_id_mask = 0x01;
285   SnippetProto snippet_proto;
286   for (auto _ : state) {
287     snippet_proto = snippet_retriever->RetrieveSnippet(
288         query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
289         section_id_mask);
290     ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
291     ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
292                 SizeIs(num_actual_matches));
293   }
294 
295   // Destroy the schema store before the whole directory is removed because they
296   // persist data in destructor.
297   schema_store.reset();
298   filesystem.DeleteDirectoryRecursively(base_dir.c_str());
299 }
// Registers BM_SnippetRfcOneProperty over a grid of (num_matches, total_terms)
// pairs. For each document size the match share grows from a single match up
// to every term matching; the percentages in the trailing comments are
// approximate.
BENCHMARK(BM_SnippetRfcOneProperty)
    // Arguments: num_matches, total_terms
    ->ArgPair(1, 1)
    ->ArgPair(1, 16)          // single match
    ->ArgPair(2, 16)          // ~10% matches
    ->ArgPair(3, 16)          // ~20% matches
    ->ArgPair(8, 16)          // 50% matches
    ->ArgPair(16, 16)         // 100% matches
    ->ArgPair(1, 128)         // single match
    ->ArgPair(13, 128)        // ~10% matches
    ->ArgPair(26, 128)        // ~20% matches
    ->ArgPair(64, 128)        // 50% matches
    ->ArgPair(128, 128)       // 100% matches
    ->ArgPair(1, 512)         // single match
    ->ArgPair(51, 512)        // ~10% matches
    ->ArgPair(102, 512)       // ~20% matches
    ->ArgPair(256, 512)       // 50% matches
    ->ArgPair(512, 512)       // 100% matches
    ->ArgPair(1, 1024)        // single match
    ->ArgPair(102, 1024)      // ~10% matches
    ->ArgPair(205, 1024)      // ~20% matches
    ->ArgPair(512, 1024)      // 50% matches
    ->ArgPair(1024, 1024)     // 100% matches
    ->ArgPair(1, 4096)        // single match
    ->ArgPair(410, 4096)      // ~10% matches
    ->ArgPair(819, 4096)      // ~20% matches
    ->ArgPair(2048, 4096)     // 50% matches
    ->ArgPair(4096, 4096)     // 100% matches
    ->ArgPair(1, 16384)       // single match
    ->ArgPair(1638, 16384)    // ~10% matches
    ->ArgPair(3277, 16384)    // ~20% matches
    ->ArgPair(8192, 16384)    // 50% matches
    ->ArgPair(16384, 16384);  // 100% matches
333 
334 }  // namespace
335 
336 }  // namespace lib
337 }  // namespace icing
338