1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/ascii.h"
16
17 #include <climits>
18 #include <cstddef>
19 #include <cstring>
20 #include <string>
21
22 #include "absl/base/attributes.h"
23 #include "absl/base/config.h"
24 #include "absl/base/nullability.h"
25 #include "absl/base/optimization.h"
26
27 namespace absl {
28 ABSL_NAMESPACE_BEGIN
29 namespace ascii_internal {
30
31 // # Table generated by this Python code (bit 0x02 is currently unused):
32 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
33
34 // NOTE: The kAsciiPropertyBits table used within this code was generated by
35 // Python code of the following form. (Bit 0x02 is currently unused and
36 // available.)
37 //
38 // def Hex2(n):
39 // return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
40 // def IsPunct(ch):
41 // return (ord(ch) >= 32 and ord(ch) < 127 and
42 // not ch.isspace() and not ch.isalnum())
43 // def IsBlank(ch):
44 // return ch in ' \t'
45 // def IsCntrl(ch):
46 // return ord(ch) < 32 or ord(ch) == 127
47 // def IsXDigit(ch):
48 // return ch.isdigit() or ch.lower() in 'abcdef'
49 // for i in range(128):
50 // ch = chr(i)
51 // mask = ((ch.isalpha() and 0x01 or 0) |
52 // (ch.isalnum() and 0x04 or 0) |
53 // (ch.isspace() and 0x08 or 0) |
54 // (IsPunct(ch) and 0x10 or 0) |
55 // (IsBlank(ch) and 0x20 or 0) |
56 // (IsCntrl(ch) and 0x40 or 0) |
57 // (IsXDigit(ch) and 0x80 or 0))
58 // print Hex2(mask) + ',',
59 // if i % 16 == 7:
60 // print ' //', Hex2(i & 0x78)
61 // elif i % 16 == 15:
62 // print
63
64 // clang-format off
65 // Array of bitfields holding character information. Each bit value corresponds
66 // to a particular character feature. For readability, and because the value
67 // of these bits is tightly coupled to this implementation, the individual bits
68 // are not named. Note that bitfields for all characters above ASCII 127 are
69 // zero-initialized.
70 ABSL_DLL const unsigned char kPropertyBits[256] = {
71 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00
72 0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
73 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10
74 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
75 0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20
76 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
77 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30
78 0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
79 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40
80 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
81 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50
82 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
83 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60
84 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
85 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70
86 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
87 };
88
89 // Array of characters for the ascii_tolower() function. For values 'A'
90 // through 'Z', return the lower-case character; otherwise, return the
91 // identity of the passed character.
92 ABSL_DLL const char kToLower[256] = {
93 '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
94 '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
95 '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
96 '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
97 '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
98 '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
99 '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
100 '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
101 '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
102 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
103 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
104 'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
105 '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
106 '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
107 '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
108 '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
109 '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
110 '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
111 '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
112 '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
113 '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
114 '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
115 '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
116 '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
117 '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
118 '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
119 '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
120 '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
121 '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
122 '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
123 '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
124 '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
125 };
126
127 // Array of characters for the ascii_toupper() function. For values 'a'
128 // through 'z', return the upper-case character; otherwise, return the
129 // identity of the passed character.
130 ABSL_DLL const char kToUpper[256] = {
131 '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
132 '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
133 '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
134 '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
135 '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
136 '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
137 '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
138 '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
139 '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
140 '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
141 '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
142 '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
143 '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
144 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
145 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
146 'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
147 '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
148 '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
149 '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
150 '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
151 '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
152 '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
153 '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
154 '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
155 '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
156 '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
157 '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
158 '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
159 '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
160 '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
161 '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
162 '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
163 };
164 // clang-format on
165
166 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
167 // Implemented by:
168 // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
169 // 2. Comparing to SCHAR_MIN + 26.
170 template <bool ToUpper>
AsciiInAZRange(unsigned char c)171 constexpr bool AsciiInAZRange(unsigned char c) {
172 constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
173 constexpr signed char threshold = SCHAR_MIN + 26; // 26 = alphabet size.
174 // Using unsigned arithmetic as overflows/underflows are well defined.
175 unsigned char u = c - sub;
176 // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
177 return static_cast<signed char>(u) < threshold;
178 }
179
180 // Force-inline so the compiler won't merge the short and long implementations.
181 template <bool ToUpper>
AsciiStrCaseFoldImpl(absl::Nonnull<char * > p,size_t size)182 ABSL_ATTRIBUTE_ALWAYS_INLINE inline constexpr void AsciiStrCaseFoldImpl(
183 absl::Nonnull<char*> p, size_t size) {
184 // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
185 // When we need to flip the case, we can xor with this bit to achieve the
186 // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
187 // could have chosen 'z' and 'Z', or any other pair of characters as they all
188 // have the same single bit difference.
189 constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
190
191 for (size_t i = 0; i < size; ++i) {
192 unsigned char v = static_cast<unsigned char>(p[i]);
193 v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
194 p[i] = static_cast<char>(v);
195 }
196 }
197
198 // The string size threshold for starting using the long string version.
199 constexpr size_t kCaseFoldThreshold = 16;
200
201 // No-inline so the compiler won't merge the short and long implementations.
202 template <bool ToUpper>
AsciiStrCaseFoldLong(absl::Nonnull<char * > p,size_t size)203 ABSL_ATTRIBUTE_NOINLINE constexpr void AsciiStrCaseFoldLong(
204 absl::Nonnull<char*> p, size_t size) {
205 ABSL_ASSUME(size >= kCaseFoldThreshold);
206 AsciiStrCaseFoldImpl<ToUpper>(p, size);
207 }
208
209 // Splitting to short and long strings to allow vectorization decisions
210 // to be made separately in the long and short cases.
211 template <bool ToUpper>
AsciiStrCaseFold(absl::Nonnull<char * > p,size_t size)212 constexpr void AsciiStrCaseFold(absl::Nonnull<char*> p, size_t size) {
213 size < kCaseFoldThreshold ? AsciiStrCaseFoldImpl<ToUpper>(p, size)
214 : AsciiStrCaseFoldLong<ToUpper>(p, size);
215 }
216
ValidateAsciiCasefold()217 static constexpr size_t ValidateAsciiCasefold() {
218 constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
219 size_t incorrect_index = 0;
220 char lowered[num_chars] = {};
221 char uppered[num_chars] = {};
222 for (unsigned int i = 0; i < num_chars; ++i) {
223 uppered[i] = lowered[i] = static_cast<char>(i);
224 }
225 AsciiStrCaseFold<false>(&lowered[0], num_chars);
226 AsciiStrCaseFold<true>(&uppered[0], num_chars);
227 for (size_t i = 0; i < num_chars; ++i) {
228 const char ch = static_cast<char>(i),
229 ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
230 ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
231 if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
232 incorrect_index = i > 0 ? i : num_chars;
233 break;
234 }
235 }
236 return incorrect_index;
237 }
238
239 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
240
241 } // namespace ascii_internal
242
AsciiStrToLower(absl::Nonnull<std::string * > s)243 void AsciiStrToLower(absl::Nonnull<std::string*> s) {
244 return ascii_internal::AsciiStrCaseFold<false>(&(*s)[0], s->size());
245 }
246
AsciiStrToUpper(absl::Nonnull<std::string * > s)247 void AsciiStrToUpper(absl::Nonnull<std::string*> s) {
248 return ascii_internal::AsciiStrCaseFold<true>(&(*s)[0], s->size());
249 }
250
RemoveExtraAsciiWhitespace(absl::Nonnull<std::string * > str)251 void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) {
252 auto stripped = StripAsciiWhitespace(*str);
253
254 if (stripped.empty()) {
255 str->clear();
256 return;
257 }
258
259 auto input_it = stripped.begin();
260 auto input_end = stripped.end();
261 auto output_it = &(*str)[0];
262 bool is_ws = false;
263
264 for (; input_it < input_end; ++input_it) {
265 if (is_ws) {
266 // Consecutive whitespace? Keep only the last.
267 is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
268 if (is_ws) --output_it;
269 } else {
270 is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
271 }
272
273 *output_it = *input_it;
274 ++output_it;
275 }
276
277 str->erase(static_cast<size_t>(output_it - &(*str)[0]));
278 }
279
280 ABSL_NAMESPACE_END
281 } // namespace absl
282