1 // Copyright 2023 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 #pragma once
15
16 /// @file pw_varint/varint.h
17 ///
18 /// The `pw_varint` module provides functions for encoding and decoding variable
19 /// length integers or varints. For smaller values, varints require less memory
20 /// than a fixed-size encoding. For example, a 32-bit (4-byte) integer requires
21 /// 1–5 bytes when varint-encoded.
22 ///
23 /// `pw_varint` supports custom variable-length encodings with different
24 /// terminator bit values and positions (@cpp_enum{pw::varint::Format}).
25 /// The basic encoding for unsigned integers is Little Endian Base 128 (LEB128).
26 /// ZigZag encoding is also supported, which maps negative integers to positive
27 /// integers to improve encoding density for LEB128.
28 ///
29 /// <a
30 /// href=https://developers.google.com/protocol-buffers/docs/encoding#varints>
31 /// Protocol Buffers</a> and @rstref{HDLC <module-pw_hdlc>} use variable-length
32 /// integer encodings for integers.
33
34 #include <stdbool.h>
35 #include <stddef.h>
36 #include <stdint.h>
37
38 #include "pw_preprocessor/compiler.h"
39
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43
44 /// Maximum size of an LEB128-encoded `uint32_t`.
45 #define PW_VARINT_MAX_INT32_SIZE_BYTES 5
46
47 /// Maximum size of an LEB128-encoded `uint64_t`.
48 #define PW_VARINT_MAX_INT64_SIZE_BYTES 10
49
50 /// Encodes a 32-bit integer as LEB128.
51 ///
52 /// @returns the number of bytes written
53 size_t pw_varint_Encode32(uint32_t integer,
54 void* output,
55 size_t output_size_bytes);
56
57 /// Encodes a 64-bit integer as LEB128.
58 ///
59 /// @returns the number of bytes written
60 size_t pw_varint_Encode64(uint64_t integer,
61 void* output,
62 size_t output_size_bytes);
63
64 /// Zig-zag encodes an `int32_t`, returning it as a `uint32_t`.
pw_varint_ZigZagEncode32(int32_t n)65 static inline uint32_t pw_varint_ZigZagEncode32(int32_t n) {
66 return (uint32_t)((uint32_t)n << 1) ^ (uint32_t)(n >> (sizeof(n) * 8 - 1));
67 }
68
69 /// Zig-zag encodes an `int64_t`, returning it as a `uint64_t`.
pw_varint_ZigZagEncode64(int64_t n)70 static inline uint64_t pw_varint_ZigZagEncode64(int64_t n) {
71 return (uint64_t)((uint64_t)n << 1) ^ (uint64_t)(n >> (sizeof(n) * 8 - 1));
72 }
73
74 /// Extracts and encodes 7 bits from the integer. Sets the top bit to indicate
75 /// more data is coming, which must be cleared if this was the last byte.
pw_varint_EncodeOneByte32(uint32_t * integer)76 static inline uint8_t pw_varint_EncodeOneByte32(uint32_t* integer) {
77 const uint8_t bits = (uint8_t)((*integer & 0x7Fu) | 0x80u);
78 *integer >>= 7;
79 return bits;
80 }
81
82 /// @copydoc pw_varint_EncodeOneByte32
pw_varint_EncodeOneByte64(uint64_t * integer)83 static inline uint8_t pw_varint_EncodeOneByte64(uint64_t* integer) {
84 const uint8_t bits = (uint8_t)((*integer & 0x7Fu) | 0x80u);
85 *integer >>= 7;
86 return bits;
87 }
88
89 /// Zig-zag decodes a `uint64_t`, returning it as an `int64_t`.
pw_varint_ZigZagDecode32(uint32_t n)90 static inline int32_t pw_varint_ZigZagDecode32(uint32_t n)
91 PW_NO_SANITIZE("unsigned-integer-overflow") {
92 return (int32_t)((n >> 1) ^ (~(n & 1) + 1));
93 }
94
95 /// Zig-zag decodes a `uint64_t`, returning it as an `int64_t`.
pw_varint_ZigZagDecode64(uint64_t n)96 static inline int64_t pw_varint_ZigZagDecode64(uint64_t n)
97 PW_NO_SANITIZE("unsigned-integer-overflow") {
98 return (int64_t)((n >> 1) ^ (~(n & 1) + 1));
99 }
100
101 /// Decodes an LEB128-encoded integer to a `uint32_t`.
102 /// @returns the number of bytes read; 0 if decoding failed
103 size_t pw_varint_Decode32(const void* input,
104 size_t input_size_bytes,
105 uint32_t* output);
106
107 /// Decodes an LEB128-encoded integer to a `uint64_t`.
108 /// @returns the number of bytes read; 0 if decoding failed
109 size_t pw_varint_Decode64(const void* input,
110 size_t input_size_bytes,
111 uint64_t* output);
112
113 /// Decodes one byte of an LEB128-encoded integer to a `uint32_t`.
114 /// @returns true if there is more data to decode (top bit is set).
pw_varint_DecodeOneByte32(uint8_t byte,size_t count,uint32_t * value)115 static inline bool pw_varint_DecodeOneByte32(uint8_t byte,
116 size_t count,
117 uint32_t* value) {
118 *value |= (uint32_t)(byte & 0x7fu) << (count * 7);
119 return (byte & 0x80u) != 0u;
120 }
121
122 /// Decodes one byte of an LEB128-encoded integer to a `uint64_t`.
123 /// @returns true if there is more data to decode (top bit is set).
pw_varint_DecodeOneByte64(uint8_t byte,size_t count,uint64_t * value)124 static inline bool pw_varint_DecodeOneByte64(uint8_t byte,
125 size_t count,
126 uint64_t* value) {
127 *value |= (uint64_t)(byte & 0x7fu) << (count * 7);
128 return (byte & 0x80u) != 0u;
129 }
130
131 /// Macro that returns the encoded size of up to a 64-bit integer. This is
132 /// inefficient, but is a constant expression if the input is a constant. Use
133 /// `pw_varint_EncodedSizeBytes` for runtime encoded size calculation.
134 #define PW_VARINT_ENCODED_SIZE_BYTES(value) \
135 ((unsigned long long)value < (1u << 7) ? 1u \
136 : (unsigned long long)value < (1u << 14) ? 2u \
137 : (unsigned long long)value < (1u << 21) ? 3u \
138 : (unsigned long long)value < (1u << 28) ? 4u \
139 : (unsigned long long)value < (1llu << 35) ? 5u \
140 : (unsigned long long)value < (1llu << 42) ? 6u \
141 : (unsigned long long)value < (1llu << 49) ? 7u \
142 : (unsigned long long)value < (1llu << 56) ? 8u \
143 : (unsigned long long)value < (1llu << 63) ? 9u \
144 : 10u)
145
146 /// Returns the size of a `uint64_t` when encoded as a varint (LEB128).
147 size_t pw_varint_EncodedSizeBytes(uint64_t integer);
148
149 /// Describes a custom varint format.
150 typedef enum {
151 PW_VARINT_ZERO_TERMINATED_LEAST_SIGNIFICANT = 0,
152 PW_VARINT_ZERO_TERMINATED_MOST_SIGNIFICANT = 1,
153 PW_VARINT_ONE_TERMINATED_LEAST_SIGNIFICANT = 2,
154 PW_VARINT_ONE_TERMINATED_MOST_SIGNIFICANT = 3,
155 } pw_varint_Format;
156
157 /// Encodes a `uint64_t` using a custom varint format.
158 size_t pw_varint_EncodeCustom(uint64_t integer,
159 void* output,
160 size_t output_size,
161 pw_varint_Format format);
162
163 /// Decodes a `uint64_t` using a custom varint format.
164 size_t pw_varint_DecodeCustom(const void* input,
165 size_t input_size,
166 uint64_t* output,
167 pw_varint_Format format);
168
169 #ifdef __cplusplus
170
171 } // extern "C"
172
173 #include <limits>
174 #include <type_traits>
175
176 #include "lib/stdcompat/bit.h"
177 #include "pw_span/span.h"
178
179 namespace pw {
180 namespace varint {
181
182 /// Maximum size of a varint (LEB128) encoded `uint32_t`.
183 inline constexpr size_t kMaxVarint32SizeBytes = PW_VARINT_MAX_INT32_SIZE_BYTES;
184
185 /// Maximum size of a varint (LEB128) encoded `uint64_t`.
186 inline constexpr size_t kMaxVarint64SizeBytes = PW_VARINT_MAX_INT64_SIZE_BYTES;
187
188 /// ZigZag encodes a signed integer. This maps small negative numbers to small,
189 /// unsigned positive numbers, which improves their density for LEB128 encoding.
190 ///
191 /// ZigZag encoding works by moving the sign bit from the most-significant bit
192 /// to the least-significant bit. For the signed `k`-bit integer `n`, the
193 /// formula is:
194 ///
195 /// @code{.cpp}
196 /// (n << 1) ^ (n >> (k - 1))
197 /// @endcode
198 ///
199 /// See the following for a description of ZigZag encoding:
200 /// https://protobuf.dev/programming-guides/encoding/#signed-ints
201 template <typename T>
ZigZagEncode(T n)202 constexpr std::make_unsigned_t<T> ZigZagEncode(T n) {
203 static_assert(std::is_signed<T>(), "Zig-zag encoding is for signed integers");
204 using U = std::make_unsigned_t<T>;
205 return static_cast<U>(static_cast<U>(n) << 1) ^
206 static_cast<U>(n >> (sizeof(T) * 8 - 1));
207 }
208
209 /// ZigZag decodes a signed integer.
210 ///
211 /// The calculation is done modulo `std::numeric_limits<T>::max()+1`, so the
212 /// unsigned integer overflows are intentional.
213 template <typename T>
ZigZagDecode(T n)214 constexpr std::make_signed_t<T> ZigZagDecode(T n)
215 PW_NO_SANITIZE("unsigned-integer-overflow") {
216 static_assert(std::is_unsigned<T>(),
217 "Zig-zag decoding is for unsigned integers");
218 return static_cast<std::make_signed_t<T>>((n >> 1) ^ (~(n & 1) + 1));
219 }
220
221 /// @brief Computes the size of an integer when encoded as a varint.
222 ///
223 /// @param integer The integer whose encoded size is to be computed. `integer`
224 /// can be signed or unsigned.
225 ///
226 /// @returns The size of `integer` when encoded as a varint.
227 template <typename T,
228 typename = std::enable_if_t<std::is_integral<T>::value ||
229 std::is_convertible<T, uint64_t>::value>>
EncodedSize(T integer)230 constexpr size_t EncodedSize(T integer) {
231 if (integer == 0) {
232 return 1;
233 }
234 return static_cast<size_t>(
235 (64 - cpp20::countl_zero(static_cast<uint64_t>(integer)) + 6) / 7);
236 }
237
238 /// Encodes a `uint64_t` with Little-Endian Base 128 (LEB128) encoding.
239 /// @returns the number of bytes written; 0 if the buffer is too small
EncodeLittleEndianBase128(uint64_t integer,const span<std::byte> & output)240 inline size_t EncodeLittleEndianBase128(uint64_t integer,
241 const span<std::byte>& output) {
242 return pw_varint_Encode64(integer, output.data(), output.size());
243 }
244
245 /// Encodes the provided integer using a variable-length encoding and returns
246 /// the number of bytes written.
247 ///
248 /// The encoding is the same as used in protocol buffers. Signed integers are
249 /// ZigZag encoded to remove leading 1s from small negative numbers, then the
250 /// resulting number is encoded as Little Endian Base 128 (LEB128). Unsigned
251 /// integers are encoded directly as LEB128.
252 ///
253 /// Returns the number of bytes written or 0 if the result didn't fit in the
254 /// encoding buffer.
255 template <typename T>
Encode(T integer,const span<std::byte> & output)256 size_t Encode(T integer, const span<std::byte>& output) {
257 if (std::is_signed<T>()) {
258 using Signed =
259 std::conditional_t<std::is_signed<T>::value, T, std::make_signed_t<T>>;
260 return EncodeLittleEndianBase128(ZigZagEncode(static_cast<Signed>(integer)),
261 output);
262 } else {
263 using Unsigned = std::
264 conditional_t<std::is_signed<T>::value, std::make_unsigned_t<T>, T>;
265 return EncodeLittleEndianBase128(static_cast<Unsigned>(integer), output);
266 }
267 }
268
269 /// Decodes a varint-encoded value. If reading into a signed integer, the value
270 /// is ZigZag decoded.
271 ///
272 /// Returns the number of bytes read from the input if successful. Returns zero
273 /// if the result does not fit in a `int64_t`/ `uint64_t` or if the input is
274 /// exhausted before the number terminates. Reads a maximum of 10 bytes.
275 ///
276 /// The following example decodes multiple varints from a buffer:
277 ///
278 /// @code{.cpp}
279 ///
280 /// while (!data.empty()) {
281 /// int64_t value;
282 /// size_t bytes = Decode(data, &value);
283 ///
284 /// if (bytes == 0u) {
285 /// return Status::DataLoss();
286 /// }
287 /// results.push_back(value);
288 /// data = data.subspan(bytes)
289 /// }
290 ///
291 /// @endcode
Decode(const span<const std::byte> & input,int64_t * output)292 inline size_t Decode(const span<const std::byte>& input, int64_t* output) {
293 uint64_t value = 0;
294 size_t bytes_read = pw_varint_Decode64(input.data(), input.size(), &value);
295 *output = pw_varint_ZigZagDecode64(value);
296 return bytes_read;
297 }
298
299 /// @overload
Decode(const span<const std::byte> & input,uint64_t * value)300 inline size_t Decode(const span<const std::byte>& input, uint64_t* value) {
301 return pw_varint_Decode64(input.data(), input.size(), value);
302 }
303
304 /// Describes a custom varint format.
305 enum class Format {
306 kZeroTerminatedLeastSignificant = PW_VARINT_ZERO_TERMINATED_LEAST_SIGNIFICANT,
307 kZeroTerminatedMostSignificant = PW_VARINT_ZERO_TERMINATED_MOST_SIGNIFICANT,
308 kOneTerminatedLeastSignificant = PW_VARINT_ONE_TERMINATED_LEAST_SIGNIFICANT,
309 kOneTerminatedMostSignificant = PW_VARINT_ONE_TERMINATED_MOST_SIGNIFICANT,
310 };
311
312 /// Encodes a varint in a custom format.
Encode(uint64_t value,span<std::byte> output,Format format)313 inline size_t Encode(uint64_t value, span<std::byte> output, Format format) {
314 return pw_varint_EncodeCustom(value,
315 output.data(),
316 output.size(),
317 static_cast<pw_varint_Format>(format));
318 }
319
320 /// Decodes a varint from a custom format.
Decode(span<const std::byte> input,uint64_t * value,Format format)321 inline size_t Decode(span<const std::byte> input,
322 uint64_t* value,
323 Format format) {
324 return pw_varint_DecodeCustom(
325 input.data(), input.size(), value, static_cast<pw_varint_Format>(format));
326 }
327
328 /// @brief Returns the maximum (max) integer value that can be encoded as a
329 /// varint into the specified number of bytes.
330 ///
331 /// The following table lists the max value for each byte size:
332 ///
333 /// | Bytes | Max value |
334 /// | ----- | ------------------------- |
335 /// | 1 | 127 |
336 /// | 2 | 16,383 |
337 /// | 3 | 2,097,151 |
338 /// | 4 | 268,435,455 |
339 /// | 5 | 34,359,738,367 |
340 /// | 6 | 4,398,046,511,103 |
341 /// | 7 | 562,949,953,421,311 |
342 /// | 8 | 72,057,594,037,927,935 |
343 /// | 9 | 9,223,372,036,854,775,807 |
344 /// | 10 | (uint64 max value) |
345 ///
346 /// @param bytes The size of the varint, in bytes. 5 bytes are needed for the
347 /// max `uint32` value. 10 bytes are needed for the max `uint64` value.
348 ///
349 /// @return The max integer value for a varint of size `bytes`.
MaxValueInBytes(size_t bytes)350 constexpr uint64_t MaxValueInBytes(size_t bytes) {
351 return bytes >= kMaxVarint64SizeBytes ? std::numeric_limits<uint64_t>::max()
352 : (uint64_t(1) << (7 * bytes)) - 1;
353 }
354
355 } // namespace varint
356 } // namespace pw
357
358 #endif // __cplusplus
359