1 //===-- String utils --------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Standalone string utility functions. Utilities requiring memory allocations
10 // should be placed in allocating_string_utils.h instead.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H
15 #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
16
17 #include "src/__support/CPP/bitset.h"
18 #include "src/__support/macros/config.h"
19 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
20 #include "src/string/memory_utils/inline_bzero.h"
21 #include "src/string/memory_utils/inline_memcpy.h"
22 #include <stddef.h> // For size_t
23
24 namespace LIBC_NAMESPACE_DECL {
25 namespace internal {
26
repeat_byte(Word byte)27 template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
28 constexpr size_t BITS_IN_BYTE = 8;
29 constexpr size_t BYTE_MASK = 0xff;
30 Word result = 0;
31 byte = byte & BYTE_MASK;
32 for (size_t i = 0; i < sizeof(Word); ++i)
33 result = (result << BITS_IN_BYTE) | byte;
34 return result;
35 }
36
37 // The goal of this function is to take in a block of arbitrary size and return
38 // if it has any bytes equal to zero without branching. This is done by
39 // transforming the block such that zero bytes become non-zero and non-zero
40 // bytes become zero.
41 // The first transformation relies on the properties of carrying in arithmetic
42 // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
43 // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
44 // needs a carry as well).
45 // The next transformation is a simple mask. All zero bytes will have the high
46 // bit set after the subtraction, so each byte is masked with 0x80. This narrows
47 // the set of bytes that result in a non-zero value to only zero bytes and bytes
48 // with the high bit and any other bit set.
49 // The final transformation masks the result of the previous transformations
50 // with the inverse of the original byte. This means that any byte that had the
51 // high bit set will no longer have it set, narrowing the list of bytes which
52 // result in non-zero values to just the zero byte.
has_zeroes(Word block)53 template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
54 constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
55 constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
56 Word subtracted = block - LOW_BITS;
57 Word inverted = ~block;
58 return (subtracted & inverted & HIGH_BITS) != 0;
59 }
60
61 template <typename Word>
string_length_wide_read(const char * src)62 LIBC_INLINE size_t string_length_wide_read(const char *src) {
63 const char *char_ptr = src;
64 // Step 1: read 1 byte at a time to align to block size
65 for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
66 ++char_ptr) {
67 if (*char_ptr == '\0')
68 return char_ptr - src;
69 }
70 // Step 2: read blocks
71 for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
72 !has_zeroes<Word>(*block_ptr); ++block_ptr) {
73 char_ptr = reinterpret_cast<const char *>(block_ptr);
74 }
75 // Step 3: find the zero in the block
76 for (; *char_ptr != '\0'; ++char_ptr) {
77 ;
78 }
79 return char_ptr - src;
80 }
81
string_length_byte_read(const char * src)82 LIBC_INLINE size_t string_length_byte_read(const char *src) {
83 size_t length;
84 for (length = 0; *src; ++src, ++length)
85 ;
86 return length;
87 }
88
89 // Returns the length of a string, denoted by the first occurrence
90 // of a null terminator.
string_length(const char * src)91 LIBC_INLINE size_t string_length(const char *src) {
92 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
93 // Unsigned int is the default size for most processors, and on x86-64 it
94 // performs better than larger sizes when the src pointer can't be assumed to
95 // be aligned to a word boundary, so it's the size we use for reading the
96 // string a block at a time.
97 return string_length_wide_read<unsigned int>(src);
98 #else
99 return string_length_byte_read(src);
100 #endif
101 }
102
103 template <typename Word>
find_first_character_wide_read(const unsigned char * src,unsigned char ch,size_t n)104 LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src,
105 unsigned char ch, size_t n) {
106 const unsigned char *char_ptr = src;
107 size_t cur = 0;
108
109 // Step 1: read 1 byte at a time to align to block size
110 for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
111 ++char_ptr, ++cur) {
112 if (*char_ptr == ch)
113 return const_cast<unsigned char *>(char_ptr);
114 }
115
116 const Word ch_mask = repeat_byte<Word>(ch);
117
118 // Step 2: read blocks
119 for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
120 !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
121 ++block_ptr, cur += sizeof(Word)) {
122 char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
123 }
124
125 // Step 3: find the match in the block
126 for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
127 ;
128 }
129
130 if (*char_ptr != ch || cur >= n)
131 return static_cast<void *>(nullptr);
132
133 return const_cast<unsigned char *>(char_ptr);
134 }
135
find_first_character_byte_read(const unsigned char * src,unsigned char ch,size_t n)136 LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
137 unsigned char ch, size_t n) {
138 for (; n && *src != ch; --n, ++src)
139 ;
140 return n ? const_cast<unsigned char *>(src) : nullptr;
141 }
142
143 // Returns the first occurrence of 'ch' within the first 'n' characters of
144 // 'src'. If 'ch' is not found, returns nullptr.
find_first_character(const unsigned char * src,unsigned char ch,size_t max_strlen)145 LIBC_INLINE void *find_first_character(const unsigned char *src,
146 unsigned char ch, size_t max_strlen) {
147 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
148 // If the maximum size of the string is small, the overhead of aligning to a
149 // word boundary and generating a bitmask of the appropriate size may be
150 // greater than the gains from reading larger chunks. Based on some testing,
151 // the crossover point between when it's faster to just read bytewise and read
152 // blocks is somewhere between 16 and 32, so 4 times the size of the block
153 // should be in that range.
154 // Unsigned int is used for the same reason as in strlen.
155 using BlockType = unsigned int;
156 if (max_strlen > (sizeof(BlockType) * 4)) {
157 return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
158 }
159 #endif
160 return find_first_character_byte_read(src, ch, max_strlen);
161 }
162
163 // Returns the maximum length span that contains only characters not found in
164 // 'segment'. If no characters are found, returns the length of 'src'.
complementary_span(const char * src,const char * segment)165 LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
166 const char *initial = src;
167 cpp::bitset<256> bitset;
168
169 for (; *segment; ++segment)
170 bitset.set(*reinterpret_cast<const unsigned char *>(segment));
171 for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src));
172 ++src)
173 ;
174 return src - initial;
175 }
176
177 // Given the similarities between strtok and strtok_r, we can implement both
178 // using a utility function. On the first call, 'src' is scanned for the
179 // first character not found in 'delimiter_string'. Once found, it scans until
180 // the first character in the 'delimiter_string' or the null terminator is
181 // found. We define this span as a token. The end of the token is appended with
182 // a null terminator, and the token is returned. The point where the last token
183 // is found is then stored within 'context' for subsequent calls. Subsequent
184 // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
185 // terminating character is reached, returns a nullptr.
186 template <bool SkipDelim = true>
string_token(char * __restrict src,const char * __restrict delimiter_string,char ** __restrict saveptr)187 LIBC_INLINE char *string_token(char *__restrict src,
188 const char *__restrict delimiter_string,
189 char **__restrict saveptr) {
190 // Return nullptr immediately if both src AND saveptr are nullptr
191 if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
192 return nullptr;
193
194 cpp::bitset<256> delimiter_set;
195 for (; *delimiter_string != '\0'; ++delimiter_string)
196 delimiter_set.set(*delimiter_string);
197
198 if constexpr (SkipDelim)
199 for (; *src != '\0' && delimiter_set.test(*src); ++src)
200 ;
201 if (*src == '\0') {
202 *saveptr = src;
203 return nullptr;
204 }
205 char *token = src;
206 for (; *src != '\0'; ++src) {
207 if (delimiter_set.test(*src)) {
208 *src = '\0';
209 ++src;
210 break;
211 }
212 }
213 *saveptr = src;
214 return token;
215 }
216
strlcpy(char * __restrict dst,const char * __restrict src,size_t size)217 LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
218 size_t size) {
219 size_t len = internal::string_length(src);
220 if (!size)
221 return len;
222 size_t n = len < size - 1 ? len : size - 1;
223 inline_memcpy(dst, src, n);
224 dst[n] = '\0';
225 return len;
226 }
227
228 template <bool ReturnNull = true>
strchr_implementation(const char * src,int c)229 LIBC_INLINE constexpr static char *strchr_implementation(const char *src,
230 int c) {
231 char ch = static_cast<char>(c);
232 for (; *src && *src != ch; ++src)
233 ;
234 char *ret = ReturnNull ? nullptr : const_cast<char *>(src);
235 return *src == ch ? const_cast<char *>(src) : ret;
236 }
237
strrchr_implementation(const char * src,int c)238 LIBC_INLINE constexpr static char *strrchr_implementation(const char *src,
239 int c) {
240 char ch = static_cast<char>(c);
241 char *last_occurrence = nullptr;
242 while (true) {
243 if (*src == ch)
244 last_occurrence = const_cast<char *>(src);
245 if (!*src)
246 return last_occurrence;
247 ++src;
248 }
249 }
250
251 } // namespace internal
252 } // namespace LIBC_NAMESPACE_DECL
253
254 #endif // LLVM_LIBC_SRC_STRING_STRING_UTILS_H
255