1*f4ee7fbaSAndroid Build Coastguard Worker /* Copyright 2013 Google Inc. All Rights Reserved.
2*f4ee7fbaSAndroid Build Coastguard Worker
3*f4ee7fbaSAndroid Build Coastguard Worker Distributed under MIT license.
4*f4ee7fbaSAndroid Build Coastguard Worker See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*f4ee7fbaSAndroid Build Coastguard Worker */
6*f4ee7fbaSAndroid Build Coastguard Worker
7*f4ee7fbaSAndroid Build Coastguard Worker /* Literal cost model to allow backward reference replacement to be efficient.
8*f4ee7fbaSAndroid Build Coastguard Worker */
9*f4ee7fbaSAndroid Build Coastguard Worker
10*f4ee7fbaSAndroid Build Coastguard Worker #include "./literal_cost.h"
11*f4ee7fbaSAndroid Build Coastguard Worker
12*f4ee7fbaSAndroid Build Coastguard Worker #include "../common/platform.h"
13*f4ee7fbaSAndroid Build Coastguard Worker #include <brotli/types.h>
14*f4ee7fbaSAndroid Build Coastguard Worker #include "./fast_log.h"
15*f4ee7fbaSAndroid Build Coastguard Worker #include "./utf8_util.h"
16*f4ee7fbaSAndroid Build Coastguard Worker
17*f4ee7fbaSAndroid Build Coastguard Worker #if defined(__cplusplus) || defined(c_plusplus)
18*f4ee7fbaSAndroid Build Coastguard Worker extern "C" {
19*f4ee7fbaSAndroid Build Coastguard Worker #endif
20*f4ee7fbaSAndroid Build Coastguard Worker
UTF8Position(size_t last,size_t c,size_t clamp)21*f4ee7fbaSAndroid Build Coastguard Worker static size_t UTF8Position(size_t last, size_t c, size_t clamp) {
22*f4ee7fbaSAndroid Build Coastguard Worker if (c < 128) {
23*f4ee7fbaSAndroid Build Coastguard Worker return 0; /* Next one is the 'Byte 1' again. */
24*f4ee7fbaSAndroid Build Coastguard Worker } else if (c >= 192) { /* Next one is the 'Byte 2' of utf-8 encoding. */
25*f4ee7fbaSAndroid Build Coastguard Worker return BROTLI_MIN(size_t, 1, clamp);
26*f4ee7fbaSAndroid Build Coastguard Worker } else {
27*f4ee7fbaSAndroid Build Coastguard Worker /* Let's decide over the last byte if this ends the sequence. */
28*f4ee7fbaSAndroid Build Coastguard Worker if (last < 0xE0) {
29*f4ee7fbaSAndroid Build Coastguard Worker return 0; /* Completed two or three byte coding. */
30*f4ee7fbaSAndroid Build Coastguard Worker } else { /* Next one is the 'Byte 3' of utf-8 encoding. */
31*f4ee7fbaSAndroid Build Coastguard Worker return BROTLI_MIN(size_t, 2, clamp);
32*f4ee7fbaSAndroid Build Coastguard Worker }
33*f4ee7fbaSAndroid Build Coastguard Worker }
34*f4ee7fbaSAndroid Build Coastguard Worker }
35*f4ee7fbaSAndroid Build Coastguard Worker
DecideMultiByteStatsLevel(size_t pos,size_t len,size_t mask,const uint8_t * data)36*f4ee7fbaSAndroid Build Coastguard Worker static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
37*f4ee7fbaSAndroid Build Coastguard Worker const uint8_t* data) {
38*f4ee7fbaSAndroid Build Coastguard Worker size_t counts[3] = { 0 };
39*f4ee7fbaSAndroid Build Coastguard Worker size_t max_utf8 = 1; /* should be 2, but 1 compresses better. */
40*f4ee7fbaSAndroid Build Coastguard Worker size_t last_c = 0;
41*f4ee7fbaSAndroid Build Coastguard Worker size_t i;
42*f4ee7fbaSAndroid Build Coastguard Worker for (i = 0; i < len; ++i) {
43*f4ee7fbaSAndroid Build Coastguard Worker size_t c = data[(pos + i) & mask];
44*f4ee7fbaSAndroid Build Coastguard Worker ++counts[UTF8Position(last_c, c, 2)];
45*f4ee7fbaSAndroid Build Coastguard Worker last_c = c;
46*f4ee7fbaSAndroid Build Coastguard Worker }
47*f4ee7fbaSAndroid Build Coastguard Worker if (counts[2] < 500) {
48*f4ee7fbaSAndroid Build Coastguard Worker max_utf8 = 1;
49*f4ee7fbaSAndroid Build Coastguard Worker }
50*f4ee7fbaSAndroid Build Coastguard Worker if (counts[1] + counts[2] < 25) {
51*f4ee7fbaSAndroid Build Coastguard Worker max_utf8 = 0;
52*f4ee7fbaSAndroid Build Coastguard Worker }
53*f4ee7fbaSAndroid Build Coastguard Worker return max_utf8;
54*f4ee7fbaSAndroid Build Coastguard Worker }
55*f4ee7fbaSAndroid Build Coastguard Worker
EstimateBitCostsForLiteralsUTF8(size_t pos,size_t len,size_t mask,const uint8_t * data,float * cost)56*f4ee7fbaSAndroid Build Coastguard Worker static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
57*f4ee7fbaSAndroid Build Coastguard Worker const uint8_t* data, float* cost) {
58*f4ee7fbaSAndroid Build Coastguard Worker /* max_utf8 is 0 (normal ASCII single byte modeling),
59*f4ee7fbaSAndroid Build Coastguard Worker 1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
60*f4ee7fbaSAndroid Build Coastguard Worker const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
61*f4ee7fbaSAndroid Build Coastguard Worker size_t histogram[3][256] = { { 0 } };
62*f4ee7fbaSAndroid Build Coastguard Worker size_t window_half = 495;
63*f4ee7fbaSAndroid Build Coastguard Worker size_t in_window = BROTLI_MIN(size_t, window_half, len);
64*f4ee7fbaSAndroid Build Coastguard Worker size_t in_window_utf8[3] = { 0 };
65*f4ee7fbaSAndroid Build Coastguard Worker
66*f4ee7fbaSAndroid Build Coastguard Worker size_t i;
67*f4ee7fbaSAndroid Build Coastguard Worker { /* Bootstrap histograms. */
68*f4ee7fbaSAndroid Build Coastguard Worker size_t last_c = 0;
69*f4ee7fbaSAndroid Build Coastguard Worker size_t utf8_pos = 0;
70*f4ee7fbaSAndroid Build Coastguard Worker for (i = 0; i < in_window; ++i) {
71*f4ee7fbaSAndroid Build Coastguard Worker size_t c = data[(pos + i) & mask];
72*f4ee7fbaSAndroid Build Coastguard Worker ++histogram[utf8_pos][c];
73*f4ee7fbaSAndroid Build Coastguard Worker ++in_window_utf8[utf8_pos];
74*f4ee7fbaSAndroid Build Coastguard Worker utf8_pos = UTF8Position(last_c, c, max_utf8);
75*f4ee7fbaSAndroid Build Coastguard Worker last_c = c;
76*f4ee7fbaSAndroid Build Coastguard Worker }
77*f4ee7fbaSAndroid Build Coastguard Worker }
78*f4ee7fbaSAndroid Build Coastguard Worker
79*f4ee7fbaSAndroid Build Coastguard Worker /* Compute bit costs with sliding window. */
80*f4ee7fbaSAndroid Build Coastguard Worker for (i = 0; i < len; ++i) {
81*f4ee7fbaSAndroid Build Coastguard Worker if (i >= window_half) {
82*f4ee7fbaSAndroid Build Coastguard Worker /* Remove a byte in the past. */
83*f4ee7fbaSAndroid Build Coastguard Worker size_t c =
84*f4ee7fbaSAndroid Build Coastguard Worker i < window_half + 1 ? 0 : data[(pos + i - window_half - 1) & mask];
85*f4ee7fbaSAndroid Build Coastguard Worker size_t last_c =
86*f4ee7fbaSAndroid Build Coastguard Worker i < window_half + 2 ? 0 : data[(pos + i - window_half - 2) & mask];
87*f4ee7fbaSAndroid Build Coastguard Worker size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
88*f4ee7fbaSAndroid Build Coastguard Worker --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
89*f4ee7fbaSAndroid Build Coastguard Worker --in_window_utf8[utf8_pos2];
90*f4ee7fbaSAndroid Build Coastguard Worker }
91*f4ee7fbaSAndroid Build Coastguard Worker if (i + window_half < len) {
92*f4ee7fbaSAndroid Build Coastguard Worker /* Add a byte in the future. */
93*f4ee7fbaSAndroid Build Coastguard Worker size_t c = data[(pos + i + window_half - 1) & mask];
94*f4ee7fbaSAndroid Build Coastguard Worker size_t last_c = data[(pos + i + window_half - 2) & mask];
95*f4ee7fbaSAndroid Build Coastguard Worker size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
96*f4ee7fbaSAndroid Build Coastguard Worker ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
97*f4ee7fbaSAndroid Build Coastguard Worker ++in_window_utf8[utf8_pos2];
98*f4ee7fbaSAndroid Build Coastguard Worker }
99*f4ee7fbaSAndroid Build Coastguard Worker {
100*f4ee7fbaSAndroid Build Coastguard Worker size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask];
101*f4ee7fbaSAndroid Build Coastguard Worker size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
102*f4ee7fbaSAndroid Build Coastguard Worker size_t utf8_pos = UTF8Position(last_c, c, max_utf8);
103*f4ee7fbaSAndroid Build Coastguard Worker size_t masked_pos = (pos + i) & mask;
104*f4ee7fbaSAndroid Build Coastguard Worker size_t histo = histogram[utf8_pos][data[masked_pos]];
105*f4ee7fbaSAndroid Build Coastguard Worker double lit_cost;
106*f4ee7fbaSAndroid Build Coastguard Worker if (histo == 0) {
107*f4ee7fbaSAndroid Build Coastguard Worker histo = 1;
108*f4ee7fbaSAndroid Build Coastguard Worker }
109*f4ee7fbaSAndroid Build Coastguard Worker lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
110*f4ee7fbaSAndroid Build Coastguard Worker lit_cost += 0.02905;
111*f4ee7fbaSAndroid Build Coastguard Worker if (lit_cost < 1.0) {
112*f4ee7fbaSAndroid Build Coastguard Worker lit_cost *= 0.5;
113*f4ee7fbaSAndroid Build Coastguard Worker lit_cost += 0.5;
114*f4ee7fbaSAndroid Build Coastguard Worker }
115*f4ee7fbaSAndroid Build Coastguard Worker /* Make the first bytes more expensive -- seems to help, not sure why.
116*f4ee7fbaSAndroid Build Coastguard Worker Perhaps because the entropy source is changing its properties
117*f4ee7fbaSAndroid Build Coastguard Worker rapidly in the beginning of the file, perhaps because the beginning
118*f4ee7fbaSAndroid Build Coastguard Worker of the data is a statistical "anomaly". */
119*f4ee7fbaSAndroid Build Coastguard Worker if (i < 2000) {
120*f4ee7fbaSAndroid Build Coastguard Worker lit_cost += 0.7 - ((double)(2000 - i) / 2000.0 * 0.35);
121*f4ee7fbaSAndroid Build Coastguard Worker }
122*f4ee7fbaSAndroid Build Coastguard Worker cost[i] = (float)lit_cost;
123*f4ee7fbaSAndroid Build Coastguard Worker }
124*f4ee7fbaSAndroid Build Coastguard Worker }
125*f4ee7fbaSAndroid Build Coastguard Worker }
126*f4ee7fbaSAndroid Build Coastguard Worker
BrotliEstimateBitCostsForLiterals(size_t pos,size_t len,size_t mask,const uint8_t * data,float * cost)127*f4ee7fbaSAndroid Build Coastguard Worker void BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
128*f4ee7fbaSAndroid Build Coastguard Worker const uint8_t* data, float* cost) {
129*f4ee7fbaSAndroid Build Coastguard Worker if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
130*f4ee7fbaSAndroid Build Coastguard Worker EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost);
131*f4ee7fbaSAndroid Build Coastguard Worker return;
132*f4ee7fbaSAndroid Build Coastguard Worker } else {
133*f4ee7fbaSAndroid Build Coastguard Worker size_t histogram[256] = { 0 };
134*f4ee7fbaSAndroid Build Coastguard Worker size_t window_half = 2000;
135*f4ee7fbaSAndroid Build Coastguard Worker size_t in_window = BROTLI_MIN(size_t, window_half, len);
136*f4ee7fbaSAndroid Build Coastguard Worker
137*f4ee7fbaSAndroid Build Coastguard Worker /* Bootstrap histogram. */
138*f4ee7fbaSAndroid Build Coastguard Worker size_t i;
139*f4ee7fbaSAndroid Build Coastguard Worker for (i = 0; i < in_window; ++i) {
140*f4ee7fbaSAndroid Build Coastguard Worker ++histogram[data[(pos + i) & mask]];
141*f4ee7fbaSAndroid Build Coastguard Worker }
142*f4ee7fbaSAndroid Build Coastguard Worker
143*f4ee7fbaSAndroid Build Coastguard Worker /* Compute bit costs with sliding window. */
144*f4ee7fbaSAndroid Build Coastguard Worker for (i = 0; i < len; ++i) {
145*f4ee7fbaSAndroid Build Coastguard Worker size_t histo;
146*f4ee7fbaSAndroid Build Coastguard Worker if (i >= window_half) {
147*f4ee7fbaSAndroid Build Coastguard Worker /* Remove a byte in the past. */
148*f4ee7fbaSAndroid Build Coastguard Worker --histogram[data[(pos + i - window_half) & mask]];
149*f4ee7fbaSAndroid Build Coastguard Worker --in_window;
150*f4ee7fbaSAndroid Build Coastguard Worker }
151*f4ee7fbaSAndroid Build Coastguard Worker if (i + window_half < len) {
152*f4ee7fbaSAndroid Build Coastguard Worker /* Add a byte in the future. */
153*f4ee7fbaSAndroid Build Coastguard Worker ++histogram[data[(pos + i + window_half) & mask]];
154*f4ee7fbaSAndroid Build Coastguard Worker ++in_window;
155*f4ee7fbaSAndroid Build Coastguard Worker }
156*f4ee7fbaSAndroid Build Coastguard Worker histo = histogram[data[(pos + i) & mask]];
157*f4ee7fbaSAndroid Build Coastguard Worker if (histo == 0) {
158*f4ee7fbaSAndroid Build Coastguard Worker histo = 1;
159*f4ee7fbaSAndroid Build Coastguard Worker }
160*f4ee7fbaSAndroid Build Coastguard Worker {
161*f4ee7fbaSAndroid Build Coastguard Worker double lit_cost = FastLog2(in_window) - FastLog2(histo);
162*f4ee7fbaSAndroid Build Coastguard Worker lit_cost += 0.029;
163*f4ee7fbaSAndroid Build Coastguard Worker if (lit_cost < 1.0) {
164*f4ee7fbaSAndroid Build Coastguard Worker lit_cost *= 0.5;
165*f4ee7fbaSAndroid Build Coastguard Worker lit_cost += 0.5;
166*f4ee7fbaSAndroid Build Coastguard Worker }
167*f4ee7fbaSAndroid Build Coastguard Worker cost[i] = (float)lit_cost;
168*f4ee7fbaSAndroid Build Coastguard Worker }
169*f4ee7fbaSAndroid Build Coastguard Worker }
170*f4ee7fbaSAndroid Build Coastguard Worker }
171*f4ee7fbaSAndroid Build Coastguard Worker }
172*f4ee7fbaSAndroid Build Coastguard Worker
173*f4ee7fbaSAndroid Build Coastguard Worker #if defined(__cplusplus) || defined(c_plusplus)
174*f4ee7fbaSAndroid Build Coastguard Worker } /* extern "C" */
175*f4ee7fbaSAndroid Build Coastguard Worker #endif
176