xref: /aosp_15_r20/external/llvm-libc/src/string/string_utils.h (revision 71db0c75aadcf003ffe3238005f61d7618a3fead)
1 //===-- String utils --------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Standalone string utility functions. Utilities requiring memory allocations
10 // should be placed in allocating_string_utils.h instead.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H
15 #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
16 
17 #include "src/__support/CPP/bitset.h"
18 #include "src/__support/macros/config.h"
19 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
20 #include "src/string/memory_utils/inline_bzero.h"
21 #include "src/string/memory_utils/inline_memcpy.h"
22 #include <stddef.h> // For size_t
23 
24 namespace LIBC_NAMESPACE_DECL {
25 namespace internal {
26 
repeat_byte(Word byte)27 template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
28   constexpr size_t BITS_IN_BYTE = 8;
29   constexpr size_t BYTE_MASK = 0xff;
30   Word result = 0;
31   byte = byte & BYTE_MASK;
32   for (size_t i = 0; i < sizeof(Word); ++i)
33     result = (result << BITS_IN_BYTE) | byte;
34   return result;
35 }
36 
37 // The goal of this function is to take in a block of arbitrary size and return
38 // if it has any bytes equal to zero without branching. This is done by
39 // transforming the block such that zero bytes become non-zero and non-zero
40 // bytes become zero.
41 // The first transformation relies on the properties of carrying in arithmetic
42 // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
43 // then the result for that byte must be equal to 0xff (or 0xfe if the next byte
44 // needs a carry as well).
45 // The next transformation is a simple mask. All zero bytes will have the high
46 // bit set after the subtraction, so each byte is masked with 0x80. This narrows
47 // the set of bytes that result in a non-zero value to only zero bytes and bytes
48 // with the high bit and any other bit set.
49 // The final transformation masks the result of the previous transformations
50 // with the inverse of the original byte. This means that any byte that had the
51 // high bit set will no longer have it set, narrowing the list of bytes which
52 // result in non-zero values to just the zero byte.
has_zeroes(Word block)53 template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
54   constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
55   constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
56   Word subtracted = block - LOW_BITS;
57   Word inverted = ~block;
58   return (subtracted & inverted & HIGH_BITS) != 0;
59 }
60 
61 template <typename Word>
string_length_wide_read(const char * src)62 LIBC_INLINE size_t string_length_wide_read(const char *src) {
63   const char *char_ptr = src;
64   // Step 1: read 1 byte at a time to align to block size
65   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
66        ++char_ptr) {
67     if (*char_ptr == '\0')
68       return char_ptr - src;
69   }
70   // Step 2: read blocks
71   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
72        !has_zeroes<Word>(*block_ptr); ++block_ptr) {
73     char_ptr = reinterpret_cast<const char *>(block_ptr);
74   }
75   // Step 3: find the zero in the block
76   for (; *char_ptr != '\0'; ++char_ptr) {
77     ;
78   }
79   return char_ptr - src;
80 }
81 
string_length_byte_read(const char * src)82 LIBC_INLINE size_t string_length_byte_read(const char *src) {
83   size_t length;
84   for (length = 0; *src; ++src, ++length)
85     ;
86   return length;
87 }
88 
89 // Returns the length of a string, denoted by the first occurrence
90 // of a null terminator.
string_length(const char * src)91 LIBC_INLINE size_t string_length(const char *src) {
92 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
93   // Unsigned int is the default size for most processors, and on x86-64 it
94   // performs better than larger sizes when the src pointer can't be assumed to
95   // be aligned to a word boundary, so it's the size we use for reading the
96   // string a block at a time.
97   return string_length_wide_read<unsigned int>(src);
98 #else
99   return string_length_byte_read(src);
100 #endif
101 }
102 
103 template <typename Word>
find_first_character_wide_read(const unsigned char * src,unsigned char ch,size_t n)104 LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src,
105                                                  unsigned char ch, size_t n) {
106   const unsigned char *char_ptr = src;
107   size_t cur = 0;
108 
109   // Step 1: read 1 byte at a time to align to block size
110   for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
111        ++char_ptr, ++cur) {
112     if (*char_ptr == ch)
113       return const_cast<unsigned char *>(char_ptr);
114   }
115 
116   const Word ch_mask = repeat_byte<Word>(ch);
117 
118   // Step 2: read blocks
119   for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
120        !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
121        ++block_ptr, cur += sizeof(Word)) {
122     char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
123   }
124 
125   // Step 3: find the match in the block
126   for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
127     ;
128   }
129 
130   if (*char_ptr != ch || cur >= n)
131     return static_cast<void *>(nullptr);
132 
133   return const_cast<unsigned char *>(char_ptr);
134 }
135 
find_first_character_byte_read(const unsigned char * src,unsigned char ch,size_t n)136 LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
137                                                  unsigned char ch, size_t n) {
138   for (; n && *src != ch; --n, ++src)
139     ;
140   return n ? const_cast<unsigned char *>(src) : nullptr;
141 }
142 
143 // Returns the first occurrence of 'ch' within the first 'n' characters of
144 // 'src'. If 'ch' is not found, returns nullptr.
find_first_character(const unsigned char * src,unsigned char ch,size_t max_strlen)145 LIBC_INLINE void *find_first_character(const unsigned char *src,
146                                        unsigned char ch, size_t max_strlen) {
147 #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
148   // If the maximum size of the string is small, the overhead of aligning to a
149   // word boundary and generating a bitmask of the appropriate size may be
150   // greater than the gains from reading larger chunks. Based on some testing,
151   // the crossover point between when it's faster to just read bytewise and read
152   // blocks is somewhere between 16 and 32, so 4 times the size of the block
153   // should be in that range.
154   // Unsigned int is used for the same reason as in strlen.
155   using BlockType = unsigned int;
156   if (max_strlen > (sizeof(BlockType) * 4)) {
157     return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
158   }
159 #endif
160   return find_first_character_byte_read(src, ch, max_strlen);
161 }
162 
163 // Returns the maximum length span that contains only characters not found in
164 // 'segment'. If no characters are found, returns the length of 'src'.
complementary_span(const char * src,const char * segment)165 LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
166   const char *initial = src;
167   cpp::bitset<256> bitset;
168 
169   for (; *segment; ++segment)
170     bitset.set(*reinterpret_cast<const unsigned char *>(segment));
171   for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src));
172        ++src)
173     ;
174   return src - initial;
175 }
176 
177 // Given the similarities between strtok and strtok_r, we can implement both
178 // using a utility function. On the first call, 'src' is scanned for the
179 // first character not found in 'delimiter_string'. Once found, it scans until
180 // the first character in the 'delimiter_string' or the null terminator is
181 // found. We define this span as a token. The end of the token is appended with
182 // a null terminator, and the token is returned. The point where the last token
183 // is found is then stored within 'context' for subsequent calls. Subsequent
184 // calls will use 'context' when a nullptr is passed in for 'src'. Once the null
185 // terminating character is reached, returns a nullptr.
186 template <bool SkipDelim = true>
string_token(char * __restrict src,const char * __restrict delimiter_string,char ** __restrict saveptr)187 LIBC_INLINE char *string_token(char *__restrict src,
188                                const char *__restrict delimiter_string,
189                                char **__restrict saveptr) {
190   // Return nullptr immediately if both src AND saveptr are nullptr
191   if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
192     return nullptr;
193 
194   cpp::bitset<256> delimiter_set;
195   for (; *delimiter_string != '\0'; ++delimiter_string)
196     delimiter_set.set(*delimiter_string);
197 
198   if constexpr (SkipDelim)
199     for (; *src != '\0' && delimiter_set.test(*src); ++src)
200       ;
201   if (*src == '\0') {
202     *saveptr = src;
203     return nullptr;
204   }
205   char *token = src;
206   for (; *src != '\0'; ++src) {
207     if (delimiter_set.test(*src)) {
208       *src = '\0';
209       ++src;
210       break;
211     }
212   }
213   *saveptr = src;
214   return token;
215 }
216 
strlcpy(char * __restrict dst,const char * __restrict src,size_t size)217 LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
218                            size_t size) {
219   size_t len = internal::string_length(src);
220   if (!size)
221     return len;
222   size_t n = len < size - 1 ? len : size - 1;
223   inline_memcpy(dst, src, n);
224   dst[n] = '\0';
225   return len;
226 }
227 
228 template <bool ReturnNull = true>
strchr_implementation(const char * src,int c)229 LIBC_INLINE constexpr static char *strchr_implementation(const char *src,
230                                                          int c) {
231   char ch = static_cast<char>(c);
232   for (; *src && *src != ch; ++src)
233     ;
234   char *ret = ReturnNull ? nullptr : const_cast<char *>(src);
235   return *src == ch ? const_cast<char *>(src) : ret;
236 }
237 
strrchr_implementation(const char * src,int c)238 LIBC_INLINE constexpr static char *strrchr_implementation(const char *src,
239                                                           int c) {
240   char ch = static_cast<char>(c);
241   char *last_occurrence = nullptr;
242   while (true) {
243     if (*src == ch)
244       last_occurrence = const_cast<char *>(src);
245     if (!*src)
246       return last_occurrence;
247     ++src;
248   }
249 }
250 
251 } // namespace internal
252 } // namespace LIBC_NAMESPACE_DECL
253 
254 #endif //  LLVM_LIBC_SRC_STRING_STRING_UTILS_H
255