xref: /aosp_15_r20/external/stg/deduplication.cc (revision 9e3b08ae94a55201065475453d799e8b1378bea6)
1 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
2 // -*- mode: C++ -*-
3 //
4 // Copyright 2022 Google LLC
5 //
6 // Licensed under the Apache License v2.0 with LLVM Exceptions (the
7 // "License"); you may not use this file except in compliance with the
8 // License.  You may obtain a copy of the License at
9 //
10 //     https://llvm.org/LICENSE.txt
11 //
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 // Author: Giuliano Procida
19 
20 #include "deduplication.h"
21 
22 #include <cstddef>
23 #include <unordered_map>
24 #include <utility>
25 #include <vector>
26 
27 #include "equality.h"
28 #include "equality_cache.h"
29 #include "graph.h"
30 #include "hashing.h"
31 #include "runtime.h"
32 #include "substitution.h"
33 
34 namespace stg {
35 
Deduplicate(Runtime & runtime,Graph & graph,Id root,const Hashes & hashes)36 Id Deduplicate(Runtime& runtime, Graph& graph, Id root, const Hashes& hashes) {
37   // Partition the nodes by hash.
38   std::unordered_map<HashValue, std::vector<Id>> partitions;
39   {
40     const Time x(runtime, "partition nodes");
41     for (const auto& [id, fp] : hashes) {
42       partitions[fp].push_back(id);
43     }
44   }
45   Counter(runtime, "deduplicate.nodes") = hashes.size();
46   Counter(runtime, "deduplicate.hashes") = partitions.size();
47 
48   Histogram hash_partition_size(runtime, "deduplicate.hash_partition_size");
49   Counter min_comparisons(runtime, "deduplicate.min_comparisons");
50   Counter max_comparisons(runtime, "deduplicate.max_comparisons");
51   for (const auto& [fp, ids] : partitions) {
52     const auto n = ids.size();
53     hash_partition_size.Add(n);
54     min_comparisons += n - 1;
55     max_comparisons += n * (n - 1) / 2;
56   }
57 
58   // Refine partitions of nodes with the same fingerprints.
59   EqualityCache cache(runtime, hashes);
60   Equals<EqualityCache> equals(graph, cache);
61   Counter equalities(runtime, "deduplicate.equalities");
62   Counter inequalities(runtime, "deduplicate.inequalities");
63   {
64     const Time x(runtime, "find duplicates");
65     for (auto& [fp, ids] : partitions) {
66       while (ids.size() > 1) {
67         std::vector<Id> todo;
68         const Id candidate = ids[0];
69         for (size_t i = 1; i < ids.size(); ++i) {
70           if (equals(ids[i], candidate)) {
71             ++equalities;
72           } else {
73             todo.push_back(ids[i]);
74             ++inequalities;
75           }
76         }
77         std::swap(todo, ids);
78       }
79     }
80   }
81 
82   // Keep one representative of each set of duplicates.
83   Counter unique(runtime, "deduplicate.unique");
84   Counter duplicate(runtime, "deduplicate.duplicate");
85   const auto remap = [&cache](Id& id) {
86     // update id to representative id, avoiding silent stores
87     const Id fid = cache.Find(id);
88     if (fid != id) {
89       id = fid;
90     }
91   };
92   const Substitute substitute(graph, remap);
93   {
94     const Time x(runtime, "rewrite");
95     for (const auto& [id, fp] : hashes) {
96       const Id fid = cache.Find(id);
97       if (fid != id) {
98         graph.Remove(id);
99         ++duplicate;
100       } else {
101         substitute(id);
102         ++unique;
103       }
104     }
105   }
106 
107   // In case the root node was remapped.
108   substitute.Update(root);
109   return root;
110 }
111 
112 }  // namespace stg
113