xref: /aosp_15_r20/external/cronet/third_party/brotli/enc/cluster_inc.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 /* NOLINT(build/header_guard) */
2 /* Copyright 2013 Google Inc. All Rights Reserved.
3 
4    Distributed under MIT license.
5    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6 */
7 
8 /* template parameters: FN, CODE */
9 
10 #define HistogramType FN(Histogram)
11 
12 /* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
13    it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
14 BROTLI_INTERNAL void FN(BrotliCompareAndPushToQueue)(
15     const HistogramType* out, HistogramType* tmp, const uint32_t* cluster_size,
16     uint32_t idx1, uint32_t idx2, size_t max_num_pairs, HistogramPair* pairs,
17     size_t* num_pairs) CODE({
18   BROTLI_BOOL is_good_pair = BROTLI_FALSE;
19   HistogramPair p;
20   p.idx1 = p.idx2 = 0;
21   p.cost_diff = p.cost_combo = 0;
22   if (idx1 == idx2) {
23     return;
24   }
25   if (idx2 < idx1) {
26     uint32_t t = idx2;
27     idx2 = idx1;
28     idx1 = t;
29   }
30   p.idx1 = idx1;
31   p.idx2 = idx2;
32   p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
33   p.cost_diff -= out[idx1].bit_cost_;
34   p.cost_diff -= out[idx2].bit_cost_;
35 
36   if (out[idx1].total_count_ == 0) {
37     p.cost_combo = out[idx2].bit_cost_;
38     is_good_pair = BROTLI_TRUE;
39   } else if (out[idx2].total_count_ == 0) {
40     p.cost_combo = out[idx1].bit_cost_;
41     is_good_pair = BROTLI_TRUE;
42   } else {
43     double threshold = *num_pairs == 0 ? 1e99 :
44         BROTLI_MAX(double, 0.0, pairs[0].cost_diff);
45     double cost_combo;
46     *tmp = out[idx1];
47     FN(HistogramAddHistogram)(tmp, &out[idx2]);
48     cost_combo = FN(BrotliPopulationCost)(tmp);
49     if (cost_combo < threshold - p.cost_diff) {
50       p.cost_combo = cost_combo;
51       is_good_pair = BROTLI_TRUE;
52     }
53   }
54   if (is_good_pair) {
55     p.cost_diff += p.cost_combo;
56     if (*num_pairs > 0 && HistogramPairIsLess(&pairs[0], &p)) {
57       /* Replace the top of the queue if needed. */
58       if (*num_pairs < max_num_pairs) {
59         pairs[*num_pairs] = pairs[0];
60         ++(*num_pairs);
61       }
62       pairs[0] = p;
63     } else if (*num_pairs < max_num_pairs) {
64       pairs[*num_pairs] = p;
65       ++(*num_pairs);
66     }
67   }
68 })
69 
70 BROTLI_INTERNAL size_t FN(BrotliHistogramCombine)(HistogramType* out,
71                                                   HistogramType* tmp,
72                                                   uint32_t* cluster_size,
73                                                   uint32_t* symbols,
74                                                   uint32_t* clusters,
75                                                   HistogramPair* pairs,
76                                                   size_t num_clusters,
77                                                   size_t symbols_size,
78                                                   size_t max_clusters,
79                                                   size_t max_num_pairs) CODE({
80   double cost_diff_threshold = 0.0;
81   size_t min_cluster_size = 1;
82   size_t num_pairs = 0;
83 
84   {
85     /* We maintain a vector of histogram pairs, with the property that the pair
86        with the maximum bit cost reduction is the first. */
87     size_t idx1;
88     for (idx1 = 0; idx1 < num_clusters; ++idx1) {
89       size_t idx2;
90       for (idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
91         FN(BrotliCompareAndPushToQueue)(out, tmp, cluster_size, clusters[idx1],
92             clusters[idx2], max_num_pairs, &pairs[0], &num_pairs);
93       }
94     }
95   }
96 
97   while (num_clusters > min_cluster_size) {
98     uint32_t best_idx1;
99     uint32_t best_idx2;
100     size_t i;
101     if (pairs[0].cost_diff >= cost_diff_threshold) {
102       cost_diff_threshold = 1e99;
103       min_cluster_size = max_clusters;
104       continue;
105     }
106     /* Take the best pair from the top of heap. */
107     best_idx1 = pairs[0].idx1;
108     best_idx2 = pairs[0].idx2;
109     FN(HistogramAddHistogram)(&out[best_idx1], &out[best_idx2]);
110     out[best_idx1].bit_cost_ = pairs[0].cost_combo;
111     cluster_size[best_idx1] += cluster_size[best_idx2];
112     for (i = 0; i < symbols_size; ++i) {
113       if (symbols[i] == best_idx2) {
114         symbols[i] = best_idx1;
115       }
116     }
117     for (i = 0; i < num_clusters; ++i) {
118       if (clusters[i] == best_idx2) {
119         memmove(&clusters[i], &clusters[i + 1],
120                 (num_clusters - i - 1) * sizeof(clusters[0]));
121         break;
122       }
123     }
124     --num_clusters;
125     {
126       /* Remove pairs intersecting the just combined best pair. */
127       size_t copy_to_idx = 0;
128       for (i = 0; i < num_pairs; ++i) {
129         HistogramPair* p = &pairs[i];
130         if (p->idx1 == best_idx1 || p->idx2 == best_idx1 ||
131             p->idx1 == best_idx2 || p->idx2 == best_idx2) {
132           /* Remove invalid pair from the queue. */
133           continue;
134         }
135         if (HistogramPairIsLess(&pairs[0], p)) {
136           /* Replace the top of the queue if needed. */
137           HistogramPair front = pairs[0];
138           pairs[0] = *p;
139           pairs[copy_to_idx] = front;
140         } else {
141           pairs[copy_to_idx] = *p;
142         }
143         ++copy_to_idx;
144       }
145       num_pairs = copy_to_idx;
146     }
147 
148     /* Push new pairs formed with the combined histogram to the heap. */
149     for (i = 0; i < num_clusters; ++i) {
150       FN(BrotliCompareAndPushToQueue)(out, tmp, cluster_size, best_idx1,
151           clusters[i], max_num_pairs, &pairs[0], &num_pairs);
152     }
153   }
154   return num_clusters;
155 })
156 
157 /* What is the bit cost of moving histogram from cur_symbol to candidate. */
158 BROTLI_INTERNAL double FN(BrotliHistogramBitCostDistance)(
159     const HistogramType* histogram, const HistogramType* candidate,
160     HistogramType* tmp) CODE({
161   if (histogram->total_count_ == 0) {
162     return 0.0;
163   } else {
164     *tmp = *histogram;
165     FN(HistogramAddHistogram)(tmp, candidate);
166     return FN(BrotliPopulationCost)(tmp) - candidate->bit_cost_;
167   }
168 })
169 
170 /* Find the best 'out' histogram for each of the 'in' histograms.
171    When called, clusters[0..num_clusters) contains the unique values from
172    symbols[0..in_size), but this property is not preserved in this function.
173    Note: we assume that out[]->bit_cost_ is already up-to-date. */
174 BROTLI_INTERNAL void FN(BrotliHistogramRemap)(const HistogramType* in,
175     size_t in_size, const uint32_t* clusters, size_t num_clusters,
176     HistogramType* out, HistogramType* tmp, uint32_t* symbols) CODE({
177   size_t i;
178   for (i = 0; i < in_size; ++i) {
179     uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
180     double best_bits =
181         FN(BrotliHistogramBitCostDistance)(&in[i], &out[best_out], tmp);
182     size_t j;
183     for (j = 0; j < num_clusters; ++j) {
184       const double cur_bits =
185           FN(BrotliHistogramBitCostDistance)(&in[i], &out[clusters[j]], tmp);
186       if (cur_bits < best_bits) {
187         best_bits = cur_bits;
188         best_out = clusters[j];
189       }
190     }
191     symbols[i] = best_out;
192   }
193 
194   /* Recompute each out based on raw and symbols. */
195   for (i = 0; i < num_clusters; ++i) {
196     FN(HistogramClear)(&out[clusters[i]]);
197   }
198   for (i = 0; i < in_size; ++i) {
199     FN(HistogramAddHistogram)(&out[symbols[i]], &in[i]);
200   }
201 })
202 
203 /* Reorders elements of the out[0..length) array and changes values in
204    symbols[0..length) array in the following way:
205      * when called, symbols[] contains indexes into out[], and has N unique
206        values (possibly N < length)
207      * on return, symbols'[i] = f(symbols[i]) and
208                   out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
209        where f is a bijection between the range of symbols[] and [0..N), and
210        the first occurrences of values in symbols'[i] come in consecutive
211        increasing order.
212    Returns N, the number of unique values in symbols[]. */
213 BROTLI_INTERNAL size_t FN(BrotliHistogramReindex)(MemoryManager* m,
214     HistogramType* out, uint32_t* symbols, size_t length) CODE({
215   static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX;
216   uint32_t* new_index = BROTLI_ALLOC(m, uint32_t, length);
217   uint32_t next_index;
218   HistogramType* tmp;
219   size_t i;
220   if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return 0;
221   for (i = 0; i < length; ++i) {
222       new_index[i] = kInvalidIndex;
223   }
224   next_index = 0;
225   for (i = 0; i < length; ++i) {
226     if (new_index[symbols[i]] == kInvalidIndex) {
227       new_index[symbols[i]] = next_index;
228       ++next_index;
229     }
230   }
231   /* TODO(eustas): by using idea of "cycle-sort" we can avoid allocation of
232      tmp and reduce the number of copying by the factor of 2. */
233   tmp = BROTLI_ALLOC(m, HistogramType, next_index);
234   if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(tmp)) return 0;
235   next_index = 0;
236   for (i = 0; i < length; ++i) {
237     if (new_index[symbols[i]] == next_index) {
238       tmp[next_index] = out[symbols[i]];
239       ++next_index;
240     }
241     symbols[i] = new_index[symbols[i]];
242   }
243   BROTLI_FREE(m, new_index);
244   for (i = 0; i < next_index; ++i) {
245     out[i] = tmp[i];
246   }
247   BROTLI_FREE(m, tmp);
248   return next_index;
249 })
250 
251 BROTLI_INTERNAL void FN(BrotliClusterHistograms)(
252     MemoryManager* m, const HistogramType* in, const size_t in_size,
253     size_t max_histograms, HistogramType* out, size_t* out_size,
254     uint32_t* histogram_symbols) CODE({
255   uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, in_size);
256   uint32_t* clusters = BROTLI_ALLOC(m, uint32_t, in_size);
257   size_t num_clusters = 0;
258   const size_t max_input_histograms = 64;
259   size_t pairs_capacity = max_input_histograms * max_input_histograms / 2;
260   /* For the first pass of clustering, we allow all pairs. */
261   HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity + 1);
262   /* TODO(eustas): move to "persistent" arena? */
263   HistogramType* tmp = BROTLI_ALLOC(m, HistogramType, 1);
264   size_t i;
265 
266   if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(cluster_size) ||
267       BROTLI_IS_NULL(clusters) || BROTLI_IS_NULL(pairs)|| BROTLI_IS_NULL(tmp)) {
268     return;
269   }
270 
271   for (i = 0; i < in_size; ++i) {
272     cluster_size[i] = 1;
273   }
274 
275   for (i = 0; i < in_size; ++i) {
276     out[i] = in[i];
277     out[i].bit_cost_ = FN(BrotliPopulationCost)(&in[i]);
278     histogram_symbols[i] = (uint32_t)i;
279   }
280 
281   for (i = 0; i < in_size; i += max_input_histograms) {
282     size_t num_to_combine =
283         BROTLI_MIN(size_t, in_size - i, max_input_histograms);
284     size_t num_new_clusters;
285     size_t j;
286     for (j = 0; j < num_to_combine; ++j) {
287       clusters[num_clusters + j] = (uint32_t)(i + j);
288     }
289     num_new_clusters =
290         FN(BrotliHistogramCombine)(out, tmp, cluster_size,
291                                    &histogram_symbols[i],
292                                    &clusters[num_clusters], pairs,
293                                    num_to_combine, num_to_combine,
294                                    max_histograms, pairs_capacity);
295     num_clusters += num_new_clusters;
296   }
297 
298   {
299     /* For the second pass, we limit the total number of histogram pairs.
300        After this limit is reached, we only keep searching for the best pair. */
301     size_t max_num_pairs = BROTLI_MIN(size_t,
302         64 * num_clusters, (num_clusters / 2) * num_clusters);
303     BROTLI_ENSURE_CAPACITY(
304         m, HistogramPair, pairs, pairs_capacity, max_num_pairs + 1);
305     if (BROTLI_IS_OOM(m)) return;
306 
307     /* Collapse similar histograms. */
308     num_clusters = FN(BrotliHistogramCombine)(out, tmp, cluster_size,
309                                               histogram_symbols, clusters,
310                                               pairs, num_clusters, in_size,
311                                               max_histograms, max_num_pairs);
312   }
313   BROTLI_FREE(m, pairs);
314   BROTLI_FREE(m, cluster_size);
315   /* Find the optimal map from original histograms to the final ones. */
316   FN(BrotliHistogramRemap)(in, in_size, clusters, num_clusters,
317                            out, tmp, histogram_symbols);
318   BROTLI_FREE(m, tmp);
319   BROTLI_FREE(m, clusters);
320   /* Convert the context map to a canonical form. */
321   *out_size = FN(BrotliHistogramReindex)(m, out, histogram_symbols, in_size);
322   if (BROTLI_IS_OOM(m)) return;
323 })
324 
325 #undef HistogramType
326