1 // 2 // Copyright 2017 The Abseil Authors. 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // https://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 // 16 // ----------------------------------------------------------------------------- 17 // File: str_split.h 18 // ----------------------------------------------------------------------------- 19 // 20 // This file contains functions for splitting strings. It defines the main 21 // `StrSplit()` function, several delimiters for determining the boundaries on 22 // which to split the string, and predicates for filtering delimited results. 23 // `StrSplit()` adapts the returned collection to the type specified by the 24 // caller. 25 // 26 // Example: 27 // 28 // // Splits the given string on commas. Returns the results in a 29 // // vector of strings. 30 // std::vector<std::string> v = absl::StrSplit("a,b,c", ','); 31 // // Can also use "," 32 // // v[0] == "a", v[1] == "b", v[2] == "c" 33 // 34 // See StrSplit() below for more information. 
35 #ifndef ABSL_STRINGS_STR_SPLIT_H_ 36 #define ABSL_STRINGS_STR_SPLIT_H_ 37 38 #include <algorithm> 39 #include <cstddef> 40 #include <map> 41 #include <set> 42 #include <string> 43 #include <utility> 44 #include <vector> 45 46 #include "absl/base/internal/raw_logging.h" 47 #include "absl/base/macros.h" 48 #include "absl/strings/internal/str_split_internal.h" 49 #include "absl/strings/string_view.h" 50 #include "absl/strings/strip.h" 51 52 namespace absl { 53 ABSL_NAMESPACE_BEGIN 54 55 //------------------------------------------------------------------------------ 56 // Delimiters 57 //------------------------------------------------------------------------------ 58 // 59 // `StrSplit()` uses delimiters to define the boundaries between elements in the 60 // provided input. Several `Delimiter` types are defined below. If a string 61 // (`const char*`, `std::string`, or `absl::string_view`) is passed in place of 62 // an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it 63 // were passed a `ByString` delimiter. 64 // 65 // A `Delimiter` is an object with a `Find()` function that knows how to find 66 // the first occurrence of itself in a given `absl::string_view`. 67 // 68 // The following `Delimiter` types are available for use within `StrSplit()`: 69 // 70 // - `ByString` (default for string arguments) 71 // - `ByChar` (default for a char argument) 72 // - `ByAnyChar` 73 // - `ByLength` 74 // - `MaxSplits` 75 // 76 // A Delimiter's `Find()` member function will be passed an input `text` that is 77 // to be split and a position (`pos`) to begin searching for the next delimiter 78 // in `text`. The returned absl::string_view should refer to the next occurrence 79 // (after `pos`) of the represented delimiter; this returned absl::string_view 80 // represents the next location where the input `text` should be broken. 
81 // 82 // The returned absl::string_view may be zero-length if the Delimiter does not 83 // represent a part of the string (e.g., a fixed-length delimiter). If no 84 // delimiter is found in the input `text`, a zero-length absl::string_view 85 // referring to `text.end()` should be returned (e.g., 86 // `text.substr(text.size())`). It is important that the returned 87 // absl::string_view always be within the bounds of the input `text` given as an 88 // argument--it must not refer to a string that is physically located outside of 89 // the given string. 90 // 91 // The following example is a simple Delimiter object that is created with a 92 // single char and will look for that char in the text passed to the `Find()` 93 // function: 94 // 95 // struct SimpleDelimiter { 96 // const char c_; 97 // explicit SimpleDelimiter(char c) : c_(c) {} 98 // absl::string_view Find(absl::string_view text, size_t pos) { 99 // auto found = text.find(c_, pos); 100 // if (found == absl::string_view::npos) 101 // return text.substr(text.size()); 102 // 103 // return text.substr(found, 1); 104 // } 105 // }; 106 107 // ByString 108 // 109 // A sub-string delimiter. If `StrSplit()` is passed a string in place of a 110 // `Delimiter` object, the string will be implicitly converted into a 111 // `ByString` delimiter. 112 // 113 // Example: 114 // 115 // // Because a string literal is converted to an `absl::ByString`, 116 // // the following two splits are equivalent. 
117 // 118 // std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", "); 119 // 120 // using absl::ByString; 121 // std::vector<std::string> v2 = absl::StrSplit("a, b, c", 122 // ByString(", ")); 123 // // v[0] == "a", v[1] == "b", v[2] == "c" 124 class ByString { 125 public: 126 explicit ByString(absl::string_view sp); 127 absl::string_view Find(absl::string_view text, size_t pos) const; 128 129 private: 130 const std::string delimiter_; 131 }; 132 133 // ByAsciiWhitespace 134 // 135 // A sub-string delimiter that splits by ASCII whitespace 136 // (space, tab, vertical tab, formfeed, linefeed, or carriage return). 137 // Note: you probably want to use absl::SkipEmpty() as well! 138 // 139 // This class is equivalent to ByAnyChar with ASCII whitespace chars. 140 // 141 // Example: 142 // 143 // std::vector<std::string> v = absl::StrSplit( 144 // "a b\tc\n d \n", absl::ByAsciiWhitespace(), absl::SkipEmpty()); 145 // // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d" 146 class ByAsciiWhitespace { 147 public: 148 absl::string_view Find(absl::string_view text, size_t pos) const; 149 }; 150 151 // ByChar 152 // 153 // A single character delimiter. `ByChar` is functionally equivalent to a 154 // 1-char string within a `ByString` delimiter, but slightly more efficient. 155 // 156 // Example: 157 // 158 // // Because a char literal is converted to a absl::ByChar, 159 // // the following two splits are equivalent. 160 // std::vector<std::string> v1 = absl::StrSplit("a,b,c", ','); 161 // using absl::ByChar; 162 // std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(',')); 163 // // v[0] == "a", v[1] == "b", v[2] == "c" 164 // 165 // `ByChar` is also the default delimiter if a single character is given 166 // as the delimiter to `StrSplit()`. 
For example, the following calls are 167 // equivalent: 168 // 169 // std::vector<std::string> v = absl::StrSplit("a-b", '-'); 170 // 171 // using absl::ByChar; 172 // std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-')); 173 // 174 class ByChar { 175 public: ByChar(char c)176 explicit ByChar(char c) : c_(c) {} 177 absl::string_view Find(absl::string_view text, size_t pos) const; 178 179 private: 180 char c_; 181 }; 182 183 // ByAnyChar 184 // 185 // A delimiter that will match any of the given byte-sized characters within 186 // its provided string. 187 // 188 // Note: this delimiter works with single-byte string data, but does not work 189 // with variable-width encodings, such as UTF-8. 190 // 191 // Example: 192 // 193 // using absl::ByAnyChar; 194 // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); 195 // // v[0] == "a", v[1] == "b", v[2] == "c" 196 // 197 // If `ByAnyChar` is given the empty string, it behaves exactly like 198 // `ByString` and matches each individual character in the input string. 199 // 200 class ByAnyChar { 201 public: 202 explicit ByAnyChar(absl::string_view sp); 203 absl::string_view Find(absl::string_view text, size_t pos) const; 204 205 private: 206 const std::string delimiters_; 207 }; 208 209 // ByLength 210 // 211 // A delimiter for splitting into equal-length strings. The length argument to 212 // the constructor must be greater than 0. 213 // 214 // Note: this delimiter works with single-byte string data, but does not work 215 // with variable-width encodings, such as UTF-8. 216 // 217 // Example: 218 // 219 // using absl::ByLength; 220 // std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3)); 221 222 // // v[0] == "123", v[1] == "456", v[2] == "789" 223 // 224 // Note that the string does not have to be a multiple of the fixed split 225 // length. In such a case, the last substring will be shorter. 
226 // 227 // using absl::ByLength; 228 // std::vector<std::string> v = absl::StrSplit("12345", ByLength(2)); 229 // 230 // // v[0] == "12", v[1] == "34", v[2] == "5" 231 class ByLength { 232 public: 233 explicit ByLength(ptrdiff_t length); 234 absl::string_view Find(absl::string_view text, size_t pos) const; 235 236 private: 237 const ptrdiff_t length_; 238 }; 239 240 namespace strings_internal { 241 242 // A traits-like metafunction for selecting the default Delimiter object type 243 // for a particular Delimiter type. The base case simply exposes type Delimiter 244 // itself as the delimiter's Type. However, there are specializations for 245 // string-like objects that map them to the ByString delimiter object. 246 // This allows functions like absl::StrSplit() and absl::MaxSplits() to accept 247 // string-like objects (e.g., ',') as delimiter arguments but they will be 248 // treated as if a ByString delimiter was given. 249 template <typename Delimiter> 250 struct SelectDelimiter { 251 using type = Delimiter; 252 }; 253 254 template <> 255 struct SelectDelimiter<char> { 256 using type = ByChar; 257 }; 258 template <> 259 struct SelectDelimiter<char*> { 260 using type = ByString; 261 }; 262 template <> 263 struct SelectDelimiter<const char*> { 264 using type = ByString; 265 }; 266 template <> 267 struct SelectDelimiter<absl::string_view> { 268 using type = ByString; 269 }; 270 template <> 271 struct SelectDelimiter<std::string> { 272 using type = ByString; 273 }; 274 275 // Wraps another delimiter and sets a max number of matches for that delimiter. 276 template <typename Delimiter> 277 class MaxSplitsImpl { 278 public: 279 MaxSplitsImpl(Delimiter delimiter, int limit) 280 : delimiter_(delimiter), limit_(limit), count_(0) {} 281 absl::string_view Find(absl::string_view text, size_t pos) { 282 if (count_++ == limit_) { 283 return absl::string_view(text.data() + text.size(), 284 0); // No more matches. 
285 } 286 return delimiter_.Find(text, pos); 287 } 288 289 private: 290 Delimiter delimiter_; 291 const int limit_; 292 int count_; 293 }; 294 295 } // namespace strings_internal 296 297 // MaxSplits() 298 // 299 // A delimiter that limits the number of matches which can occur to the passed 300 // `limit`. The last element in the returned collection will contain all 301 // remaining unsplit pieces, which may contain instances of the delimiter. 302 // The collection will contain at most `limit` + 1 elements. 303 // Example: 304 // 305 // using absl::MaxSplits; 306 // std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1)); 307 // 308 // // v[0] == "a", v[1] == "b,c" 309 template <typename Delimiter> 310 inline strings_internal::MaxSplitsImpl< 311 typename strings_internal::SelectDelimiter<Delimiter>::type> 312 MaxSplits(Delimiter delimiter, int limit) { 313 typedef 314 typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType; 315 return strings_internal::MaxSplitsImpl<DelimiterType>( 316 DelimiterType(delimiter), limit); 317 } 318 319 //------------------------------------------------------------------------------ 320 // Predicates 321 //------------------------------------------------------------------------------ 322 // 323 // Predicates filter the results of a `StrSplit()` by determining whether or not 324 // a resultant element is included in the result set. A predicate may be passed 325 // as an optional third argument to the `StrSplit()` function. 326 // 327 // Predicates are unary functions (or functors) that take a single 328 // `absl::string_view` argument and return a bool indicating whether the 329 // argument should be included (`true`) or excluded (`false`). 330 // 331 // Predicates are useful when filtering out empty substrings. By default, empty 332 // substrings may be returned by `StrSplit()`, which is similar to the way split 333 // functions work in other programming languages. 
334 335 // AllowEmpty() 336 // 337 // Always returns `true`, indicating that all strings--including empty 338 // strings--should be included in the split output. This predicate is not 339 // strictly needed because this is the default behavior of `StrSplit()`; 340 // however, it might be useful at some call sites to make the intent explicit. 341 // 342 // Example: 343 // 344 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty()); 345 // 346 // // v[0] == " a ", v[1] == " ", v[2] == "", v[3] = "b", v[4] == "" 347 struct AllowEmpty { 348 bool operator()(absl::string_view) const { return true; } 349 }; 350 351 // SkipEmpty() 352 // 353 // Returns `false` if the given `absl::string_view` is empty, indicating that 354 // `StrSplit()` should omit the empty string. 355 // 356 // Example: 357 // 358 // std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty()); 359 // 360 // // v[0] == "a", v[1] == "b" 361 // 362 // Note: `SkipEmpty()` does not consider a string containing only whitespace 363 // to be empty. To skip such whitespace as well, use the `SkipWhitespace()` 364 // predicate. 365 struct SkipEmpty { 366 bool operator()(absl::string_view sp) const { return !sp.empty(); } 367 }; 368 369 // SkipWhitespace() 370 // 371 // Returns `false` if the given `absl::string_view` is empty *or* contains only 372 // whitespace, indicating that `StrSplit()` should omit the string. 
373 // 374 // Example: 375 // 376 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", 377 // ',', SkipWhitespace()); 378 // // v[0] == " a ", v[1] == "b" 379 // 380 // // SkipEmpty() would return whitespace elements 381 // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty()); 382 // // v[0] == " a ", v[1] == " ", v[2] == "b" 383 struct SkipWhitespace { 384 bool operator()(absl::string_view sp) const { 385 sp = absl::StripAsciiWhitespace(sp); 386 return !sp.empty(); 387 } 388 }; 389 390 template <typename T> 391 using EnableSplitIfString = 392 typename std::enable_if<std::is_same<T, std::string>::value || 393 std::is_same<T, const std::string>::value, 394 int>::type; 395 396 //------------------------------------------------------------------------------ 397 // StrSplit() 398 //------------------------------------------------------------------------------ 399 400 // StrSplit() 401 // 402 // Splits a given string based on the provided `Delimiter` object, returning the 403 // elements within the type specified by the caller. Optionally, you may pass a 404 // `Predicate` to `StrSplit()` indicating whether to include or exclude the 405 // resulting element within the final result set. (See the overviews for 406 // Delimiters and Predicates above.) 407 // 408 // Example: 409 // 410 // std::vector<std::string> v = absl::StrSplit("a,b,c,d", ','); 411 // // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d" 412 // 413 // You can also provide an explicit `Delimiter` object: 414 // 415 // Example: 416 // 417 // using absl::ByAnyChar; 418 // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); 419 // // v[0] == "a", v[1] == "b", v[2] == "c" 420 // 421 // See above for more information on delimiters. 422 // 423 // By default, empty strings are included in the result set. 
// You can optionally
// include a third `Predicate` argument to apply a test for whether the
// resultant element should be included in the result set:
//
// Example:
//
//   std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
//                                               ',', SkipWhitespace());
//   // v[0] == " a ", v[1] == "b"
//
// See above for more information on predicates.
//
//------------------------------------------------------------------------------
// StrSplit() Return Types
//------------------------------------------------------------------------------
//
// The `StrSplit()` function adapts the returned collection to the collection
// specified by the caller (e.g. `std::vector` above). The returned collections
// may contain `std::string`, `absl::string_view` (in which case the original
// string being split must ensure that it outlives the collection), or any
// object that can be explicitly created from an `absl::string_view`. This
// behavior works for:
//
// 1) All standard STL containers including `std::vector`, `std::list`,
//    `std::deque`, `std::set`, `std::multiset`, `std::map`, and
//    `std::multimap`.
// 2) `std::pair` (which is not actually a container). See below.
// 3) `std::array`, which is a container but has different behavior due to its
//    fixed size. See below.
//
// Example:
//
//   // The results are returned as `absl::string_view` objects. Note that we
//   // have to ensure that the input string outlives any results.
//   std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ',');
//
//   // Stores results in a std::set<std::string>, which also performs
//   // de-duplication and orders the elements in ascending order.
//   std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ',');
//   // a contains, in iteration order: "a", "b", "c"
//
//   // `StrSplit()` can be used within a range-based for loop, in which case
//   // each element will be of type `absl::string_view`.
//   std::vector<std::string> v;
//   for (const auto sv : absl::StrSplit("a,b,c", ',')) {
//     if (sv != "b") v.emplace_back(sv);
//   }
//   // v[0] == "a", v[1] == "c"
//
//   // Stores results in a map. The map implementation assumes that the input
//   // is provided as a series of key/value pairs. For example, the 0th element
//   // resulting from the split will be stored as a key to the 1st element. If
//   // an odd number of elements are resolved, the last element is paired with
//   // a default-constructed value (e.g., empty string).
//   std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ',');
//   // m["a"] == "b", m["c"] == ""     // last component value equals ""
//
// Splitting to `std::pair` is an interesting case because it can hold only two
// elements and is not a collection type. When splitting to a `std::pair` the
// first two split strings become the `std::pair` `.first` and `.second`
// members, respectively. The remaining split substrings are discarded. If there
// are fewer than two split substrings, the empty string is used for the
// corresponding `std::pair` member.
//
// Example:
//
//   // Stores first two split strings as the members in a std::pair.
//   std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ',');
//   // p.first == "a", p.second == "b"       // "c" is omitted.
//
//
// Splitting to `std::array` is similar to splitting to `std::pair`, but for
// N elements instead of two; missing elements are filled with the empty string
// and extra elements are discarded.
496 // 497 // Examples: 498 // 499 // // Stores first two split strings as the elements in a std::array. 500 // std::array<std::string, 2> a = absl::StrSplit("a,b,c", ','); 501 // // a[0] == "a", a[1] == "b" // "c" is omitted. 502 // 503 // // The second element is empty. 504 // std::array<std::string, 2> a = absl::StrSplit("a,", ','); 505 // // a[0] == "a", a[1] == "" 506 // 507 // The `StrSplit()` function can be used multiple times to perform more 508 // complicated splitting logic, such as intelligently parsing key-value pairs. 509 // 510 // Example: 511 // 512 // // The input string "a=b=c,d=e,f=,g" becomes 513 // // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } 514 // std::map<std::string, std::string> m; 515 // for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) { 516 // m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1))); 517 // } 518 // EXPECT_EQ("b=c", m.find("a")->second); 519 // EXPECT_EQ("e", m.find("d")->second); 520 // EXPECT_EQ("", m.find("f")->second); 521 // EXPECT_EQ("", m.find("g")->second); 522 // 523 // WARNING: Due to a legacy bug that is maintained for backward compatibility, 524 // splitting the following empty string_views produces different results: 525 // 526 // absl::StrSplit(absl::string_view(""), '-'); // {""} 527 // absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""} 528 // 529 // Try not to depend on this distinction because the bug may one day be fixed. 
530 template <typename Delimiter> 531 strings_internal::Splitter< 532 typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty, 533 absl::string_view> 534 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) { 535 using DelimiterType = 536 typename strings_internal::SelectDelimiter<Delimiter>::type; 537 return strings_internal::Splitter<DelimiterType, AllowEmpty, 538 absl::string_view>( 539 text.value(), DelimiterType(d), AllowEmpty()); 540 } 541 542 template <typename Delimiter, typename StringType, 543 EnableSplitIfString<StringType> = 0> 544 strings_internal::Splitter< 545 typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty, 546 std::string> 547 StrSplit(StringType&& text, Delimiter d) { 548 using DelimiterType = 549 typename strings_internal::SelectDelimiter<Delimiter>::type; 550 return strings_internal::Splitter<DelimiterType, AllowEmpty, std::string>( 551 std::move(text), DelimiterType(d), AllowEmpty()); 552 } 553 554 template <typename Delimiter, typename Predicate> 555 strings_internal::Splitter< 556 typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate, 557 absl::string_view> 558 StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d, 559 Predicate p) { 560 using DelimiterType = 561 typename strings_internal::SelectDelimiter<Delimiter>::type; 562 return strings_internal::Splitter<DelimiterType, Predicate, 563 absl::string_view>( 564 text.value(), DelimiterType(std::move(d)), std::move(p)); 565 } 566 567 template <typename Delimiter, typename Predicate, typename StringType, 568 EnableSplitIfString<StringType> = 0> 569 strings_internal::Splitter< 570 typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate, 571 std::string> 572 StrSplit(StringType&& text, Delimiter d, Predicate p) { 573 using DelimiterType = 574 typename strings_internal::SelectDelimiter<Delimiter>::type; 575 return strings_internal::Splitter<DelimiterType, Predicate, std::string>( 576 
std::move(text), DelimiterType(d), std::move(p)); 577 } 578 579 ABSL_NAMESPACE_END 580 } // namespace absl 581 582 #endif // ABSL_STRINGS_STR_SPLIT_H_ 583