1 // Boost token_functions.hpp ------------------------------------------------// 2 3 // Copyright John R. Bandela 2001. 4 5 // Distributed under the Boost Software License, Version 1.0. (See 6 // accompanying file LICENSE_1_0.txt or copy at 7 // http://www.boost.org/LICENSE_1_0.txt) 8 9 // See http://www.boost.org/libs/tokenizer/ for documentation. 10 11 // Revision History: 12 // 01 Oct 2004 Joaquin M Lopez Munoz 13 // Workaround for a problem with string::assign in msvc-stlport 14 // 06 Apr 2004 John Bandela 15 // Fixed a bug involving using char_delimiter with a true input iterator 16 // 28 Nov 2003 Robert Zeh and John Bandela 17 // Converted into "fast" functions that avoid using += when 18 // the supplied iterator isn't an input_iterator; based on 19 // some work done at Archelon and a version that was checked into 20 // the boost CVS for a short period of time. 21 // 20 Feb 2002 John Maddock 22 // Removed using namespace std declarations and added 23 // workaround for BOOST_NO_STDC_NAMESPACE (the library 24 // can be safely mixed with regex). 25 // 06 Feb 2002 Jeremy Siek 26 // Added char_separator. 27 // 02 Feb 2002 Jeremy Siek 28 // Removed tabs and a little cleanup. 29 30 31 #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ 32 #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ 33 34 #include <vector> 35 #include <stdexcept> 36 #include <string> 37 #include <cctype> 38 #include <algorithm> // for find_if 39 #include <boost/config.hpp> 40 #include <boost/assert.hpp> 41 #include <boost/type_traits/is_pointer.hpp> 42 #include <boost/detail/workaround.hpp> 43 #include <boost/mpl/if.hpp> 44 #include <boost/throw_exception.hpp> 45 #if !defined(BOOST_NO_CWCTYPE) 46 #include <cwctype> 47 #endif 48 49 // 50 // the following must not be macros if we are to prefix them 51 // with std:: (they shouldn't be macros anyway...) 
52 // 53 #ifdef ispunct 54 # undef ispunct 55 #endif 56 #ifdef iswpunct 57 # undef iswpunct 58 #endif 59 #ifdef isspace 60 # undef isspace 61 #endif 62 #ifdef iswspace 63 # undef iswspace 64 #endif 65 // 66 // fix namespace problems: 67 // 68 #ifdef BOOST_NO_STDC_NAMESPACE 69 namespace std{ 70 using ::ispunct; 71 using ::isspace; 72 #if !defined(BOOST_NO_CWCTYPE) 73 using ::iswpunct; 74 using ::iswspace; 75 #endif 76 } 77 #endif 78 79 namespace boost{ 80 //=========================================================================== 81 // The escaped_list_separator class. Which is a model of TokenizerFunction 82 // An escaped list is a super-set of what is commonly known as a comma 83 // separated value (csv) list.It is separated into fields by a comma or 84 // other character. If the delimiting character is inside quotes, then it is 85 // counted as a regular character.To allow for embedded quotes in a field, 86 // there can be escape sequences using the \ much like C. 87 // The role of the comma, the quotation mark, and the escape 88 // character (backslash \), can be assigned to other characters. 89 90 struct escaped_list_error : public std::runtime_error{ escaped_list_errorboost::escaped_list_error91 escaped_list_error(const std::string& what_arg):std::runtime_error(what_arg) { } 92 }; 93 94 95 // The out of the box GCC 2.95 on cygwin does not have a char_traits class. 
96 // MSVC does not like the following typename 97 template <class Char, 98 class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > 99 class escaped_list_separator { 100 101 private: 102 typedef std::basic_string<Char,Traits> string_type; 103 struct char_eq { 104 Char e_; char_eqboost::escaped_list_separator::char_eq105 char_eq(Char e):e_(e) { } operator ()boost::escaped_list_separator::char_eq106 bool operator()(Char c) { 107 return Traits::eq(e_,c); 108 } 109 }; 110 string_type escape_; 111 string_type c_; 112 string_type quote_; 113 bool last_; 114 is_escape(Char e)115 bool is_escape(Char e) { 116 char_eq f(e); 117 return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end(); 118 } is_c(Char e)119 bool is_c(Char e) { 120 char_eq f(e); 121 return std::find_if(c_.begin(),c_.end(),f)!=c_.end(); 122 } is_quote(Char e)123 bool is_quote(Char e) { 124 char_eq f(e); 125 return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end(); 126 } 127 template <typename iterator, typename Token> do_escape(iterator & next,iterator end,Token & tok)128 void do_escape(iterator& next,iterator end,Token& tok) { 129 if (++next == end) 130 BOOST_THROW_EXCEPTION(escaped_list_error(std::string("cannot end with escape"))); 131 if (Traits::eq(*next,'n')) { 132 tok+='\n'; 133 return; 134 } 135 else if (is_quote(*next)) { 136 tok+=*next; 137 return; 138 } 139 else if (is_c(*next)) { 140 tok+=*next; 141 return; 142 } 143 else if (is_escape(*next)) { 144 tok+=*next; 145 return; 146 } 147 else 148 BOOST_THROW_EXCEPTION(escaped_list_error(std::string("unknown escape sequence"))); 149 } 150 151 public: 152 escaped_list_separator(Char e='\\\\',Char c=',',Char q='\\"')153 explicit escaped_list_separator(Char e = '\\', 154 Char c = ',',Char q = '\"') 155 : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { } 156 escaped_list_separator(string_type e,string_type c,string_type q)157 escaped_list_separator(string_type e, string_type c, string_type q) 158 : escape_(e), c_(c), 
quote_(q), last_(false) { } 159 reset()160 void reset() {last_=false;} 161 162 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)163 bool operator()(InputIterator& next,InputIterator end,Token& tok) { 164 bool bInQuote = false; 165 tok = Token(); 166 167 if (next == end) { 168 if (last_) { 169 last_ = false; 170 return true; 171 } 172 else 173 return false; 174 } 175 last_ = false; 176 for (;next != end;++next) { 177 if (is_escape(*next)) { 178 do_escape(next,end,tok); 179 } 180 else if (is_c(*next)) { 181 if (!bInQuote) { 182 // If we are not in quote, then we are done 183 ++next; 184 // The last character was a c, that means there is 185 // 1 more blank field 186 last_ = true; 187 return true; 188 } 189 else tok+=*next; 190 } 191 else if (is_quote(*next)) { 192 bInQuote=!bInQuote; 193 } 194 else { 195 tok += *next; 196 } 197 } 198 return true; 199 } 200 }; 201 202 //=========================================================================== 203 // The classes here are used by offset_separator and char_separator to implement 204 // faster assigning of tokens using assign instead of += 205 206 namespace tokenizer_detail { 207 //=========================================================================== 208 // Tokenizer was broken for wide character separators, at least on Windows, since 209 // CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts 210 // if higher values are passed in. The traits extension class should take care of this. 211 // Assuming that the conditional will always get optimized out in the function 212 // implementations, argument types are not a problem since both forms of character classifiers 213 // expect an int. 
#if !defined(BOOST_NO_CWCTYPE)
// Character classification for character types whose size is not one
// byte: forwards to the wide-character classifiers from <cwctype>.
// N is sizeof(char_type), supplied by traits_extension below.
template<typename traits, int N>
struct traits_extension_details : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::iswspace(c) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::iswpunct(c) != 0;
  }
};

// Specialization for one-byte character types: forwards to the narrow
// classifiers from <cctype>.
template<typename traits>
struct traits_extension_details<traits, 1> : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    // The <cctype> functions require an argument representable as
    // unsigned char (or EOF); a plain char with the high bit set would
    // otherwise be passed as a negative int, which is undefined behavior.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  }
  static bool ispunct(char_type c)
  {
    // Same unsigned char conversion as in isspace above.
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
  }
};
#endif


// In case there is no cwctype header, we implement the checks manually.
// We make use of the fact that the tested categories should fit in ASCII.
//
// traits_extension adds isspace()/ispunct() to a character traits class.
// With <cwctype> available the call is dispatched on sizeof(char_type);
// otherwise only values up to 255 are handed to the narrow classifiers
// and everything else is reported as neither space nor punctuation.
template<typename traits>
struct traits_extension : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
#if !defined(BOOST_NO_CWCTYPE)
    return traits_extension_details<traits, sizeof(char_type)>::isspace(c);
#else
    return static_cast< unsigned >(c) <= 255 && std::isspace(c) != 0;
#endif
  }

  static bool ispunct(char_type c)
  {
#if !defined(BOOST_NO_CWCTYPE)
    return traits_extension_details<traits, sizeof(char_type)>::ispunct(c);
#else
    return static_cast< unsigned >(c) <= 255 && std::ispunct(c) != 0;
#endif
  }
};

// The assign_or_plus_equal struct contains functions that implement
// assign, +=, and clearing based on the iterator type.
The 270 // generic case does nothing for plus_equal and clearing, while 271 // passing through the call for assign. 272 // 273 // When an input iterator is being used, the situation is reversed. 274 // The assign method does nothing, plus_equal invokes operator +=, 275 // and the clearing method sets the supplied token to the default 276 // token constructor's result. 277 // 278 279 template<class IteratorTag> 280 struct assign_or_plus_equal { 281 template<class Iterator, class Token> assignboost::tokenizer_detail::assign_or_plus_equal282 static void assign(Iterator b, Iterator e, Token &t) { 283 t.assign(b, e); 284 } 285 286 template<class Token, class Value> plus_equalboost::tokenizer_detail::assign_or_plus_equal287 static void plus_equal(Token &, const Value &) { } 288 289 // If we are doing an assign, there is no need for the 290 // the clear. 291 // 292 template<class Token> clearboost::tokenizer_detail::assign_or_plus_equal293 static void clear(Token &) { } 294 }; 295 296 template <> 297 struct assign_or_plus_equal<std::input_iterator_tag> { 298 template<class Iterator, class Token> assignboost::tokenizer_detail::assign_or_plus_equal299 static void assign(Iterator , Iterator , Token &) { } 300 template<class Token, class Value> plus_equalboost::tokenizer_detail::assign_or_plus_equal301 static void plus_equal(Token &t, const Value &v) { 302 t += v; 303 } 304 template<class Token> clearboost::tokenizer_detail::assign_or_plus_equal305 static void clear(Token &t) { 306 t = Token(); 307 } 308 }; 309 310 311 template<class Iterator> 312 struct pointer_iterator_category{ 313 typedef std::random_access_iterator_tag type; 314 }; 315 316 317 template<class Iterator> 318 struct class_iterator_category{ 319 typedef typename Iterator::iterator_category type; 320 }; 321 322 323 324 // This portably gets the iterator_tag without partial template specialization 325 template<class Iterator> 326 struct get_iterator_category{ 327 typedef typename mpl::if_<is_pointer<Iterator>, 
328 pointer_iterator_category<Iterator>, 329 class_iterator_category<Iterator> 330 >::type cat; 331 332 typedef typename cat::type iterator_category; 333 }; 334 335 336 } // namespace tokenizer_detail 337 338 339 //=========================================================================== 340 // The offset_separator class, which is a model of TokenizerFunction. 341 // Offset breaks a string into tokens based on a range of offsets 342 343 class offset_separator { 344 private: 345 346 std::vector<int> offsets_; 347 unsigned int current_offset_; 348 bool wrap_offsets_; 349 bool return_partial_last_; 350 351 public: 352 template <typename Iter> offset_separator(Iter begin,Iter end,bool wrap_offsets=true,bool return_partial_last=true)353 offset_separator(Iter begin, Iter end, bool wrap_offsets = true, 354 bool return_partial_last = true) 355 : offsets_(begin,end), current_offset_(0), 356 wrap_offsets_(wrap_offsets), 357 return_partial_last_(return_partial_last) { } 358 offset_separator()359 offset_separator() 360 : offsets_(1,1), current_offset_(), 361 wrap_offsets_(true), return_partial_last_(true) { } 362 reset()363 void reset() { 364 current_offset_ = 0; 365 } 366 367 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)368 bool operator()(InputIterator& next, InputIterator end, Token& tok) 369 { 370 typedef tokenizer_detail::assign_or_plus_equal< 371 BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< 372 InputIterator 373 >::iterator_category 374 > assigner; 375 376 BOOST_ASSERT(!offsets_.empty()); 377 378 assigner::clear(tok); 379 InputIterator start(next); 380 381 if (next == end) 382 return false; 383 384 if (current_offset_ == offsets_.size()) 385 { 386 if (wrap_offsets_) 387 current_offset_=0; 388 else 389 return false; 390 } 391 392 int c = offsets_[current_offset_]; 393 int i = 0; 394 for (; i < c; ++i) { 395 if (next == end)break; 396 assigner::plus_equal(tok,*next++); 397 } 398 
assigner::assign(start,next,tok); 399 400 if (!return_partial_last_) 401 if (i < (c-1) ) 402 return false; 403 404 ++current_offset_; 405 return true; 406 } 407 }; 408 409 410 //=========================================================================== 411 // The char_separator class breaks a sequence of characters into 412 // tokens based on the character delimiters (very much like bad old 413 // strtok). A delimiter character can either be kept or dropped. A 414 // kept delimiter shows up as an output token, whereas a dropped 415 // delimiter does not. 416 417 // This class replaces the char_delimiters_separator class. The 418 // constructor for the char_delimiters_separator class was too 419 // confusing and needed to be deprecated. However, because of the 420 // default arguments to the constructor, adding the new constructor 421 // would cause ambiguity, so instead I deprecated the whole class. 422 // The implementation of the class was also simplified considerably. 423 424 enum empty_token_policy { drop_empty_tokens, keep_empty_tokens }; 425 426 // The out of the box GCC 2.95 on cygwin does not have a char_traits class. 427 template <typename Char, 428 typename Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > 429 class char_separator 430 { 431 typedef tokenizer_detail::traits_extension<Tr> Traits; 432 typedef std::basic_string<Char,Tr> string_type; 433 public: 434 explicit char_separator(const Char * dropped_delims,const Char * kept_delims=0,empty_token_policy empty_tokens=drop_empty_tokens)435 char_separator(const Char* dropped_delims, 436 const Char* kept_delims = 0, 437 empty_token_policy empty_tokens = drop_empty_tokens) 438 : m_dropped_delims(dropped_delims), 439 m_use_ispunct(false), 440 m_use_isspace(false), 441 m_empty_tokens(empty_tokens), 442 m_output_done(false) 443 { 444 // Borland workaround 445 if (kept_delims) 446 m_kept_delims = kept_delims; 447 } 448 449 // use ispunct() for kept delimiters and isspace for dropped. 
450 explicit char_separator()451 char_separator() 452 : m_use_ispunct(true), 453 m_use_isspace(true), 454 m_empty_tokens(drop_empty_tokens), 455 m_output_done(false) { } 456 reset()457 void reset() { } 458 459 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)460 bool operator()(InputIterator& next, InputIterator end, Token& tok) 461 { 462 typedef tokenizer_detail::assign_or_plus_equal< 463 BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< 464 InputIterator 465 >::iterator_category 466 > assigner; 467 468 assigner::clear(tok); 469 470 // skip past all dropped_delims 471 if (m_empty_tokens == drop_empty_tokens) 472 for (; next != end && is_dropped(*next); ++next) 473 { } 474 475 InputIterator start(next); 476 477 if (m_empty_tokens == drop_empty_tokens) { 478 479 if (next == end) 480 return false; 481 482 483 // if we are on a kept_delims move past it and stop 484 if (is_kept(*next)) { 485 assigner::plus_equal(tok,*next); 486 ++next; 487 } else 488 // append all the non delim characters 489 for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) 490 assigner::plus_equal(tok,*next); 491 } 492 else { // m_empty_tokens == keep_empty_tokens 493 494 // Handle empty token at the end 495 if (next == end) 496 { 497 if (m_output_done == false) 498 { 499 m_output_done = true; 500 assigner::assign(start,next,tok); 501 return true; 502 } 503 else 504 return false; 505 } 506 507 if (is_kept(*next)) { 508 if (m_output_done == false) 509 m_output_done = true; 510 else { 511 assigner::plus_equal(tok,*next); 512 ++next; 513 m_output_done = false; 514 } 515 } 516 else if (m_output_done == false && is_dropped(*next)) { 517 m_output_done = true; 518 } 519 else { 520 if (is_dropped(*next)) 521 start=++next; 522 for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) 523 assigner::plus_equal(tok,*next); 524 m_output_done = true; 525 } 526 } 527 assigner::assign(start,next,tok); 528 
return true; 529 } 530 531 private: 532 string_type m_kept_delims; 533 string_type m_dropped_delims; 534 bool m_use_ispunct; 535 bool m_use_isspace; 536 empty_token_policy m_empty_tokens; 537 bool m_output_done; 538 is_kept(Char E) const539 bool is_kept(Char E) const 540 { 541 if (m_kept_delims.length()) 542 return m_kept_delims.find(E) != string_type::npos; 543 else if (m_use_ispunct) { 544 return Traits::ispunct(E) != 0; 545 } else 546 return false; 547 } is_dropped(Char E) const548 bool is_dropped(Char E) const 549 { 550 if (m_dropped_delims.length()) 551 return m_dropped_delims.find(E) != string_type::npos; 552 else if (m_use_isspace) { 553 return Traits::isspace(E) != 0; 554 } else 555 return false; 556 } 557 }; 558 559 //=========================================================================== 560 // The following class is DEPRECATED, use class char_separators instead. 561 // 562 // The char_delimiters_separator class, which is a model of 563 // TokenizerFunction. char_delimiters_separator breaks a string 564 // into tokens based on character delimiters. There are 2 types of 565 // delimiters. returnable delimiters can be returned as 566 // tokens. These are often punctuation. nonreturnable delimiters 567 // cannot be returned as tokens. These are often whitespace 568 569 // The out of the box GCC 2.95 on cygwin does not have a char_traits class. 
570 template <class Char, 571 class Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > 572 class char_delimiters_separator { 573 private: 574 575 typedef tokenizer_detail::traits_extension<Tr> Traits; 576 typedef std::basic_string<Char,Tr> string_type; 577 string_type returnable_; 578 string_type nonreturnable_; 579 bool return_delims_; 580 bool no_ispunct_; 581 bool no_isspace_; 582 is_ret(Char E) const583 bool is_ret(Char E)const 584 { 585 if (returnable_.length()) 586 return returnable_.find(E) != string_type::npos; 587 else{ 588 if (no_ispunct_) {return false;} 589 else{ 590 int r = Traits::ispunct(E); 591 return r != 0; 592 } 593 } 594 } is_nonret(Char E) const595 bool is_nonret(Char E)const 596 { 597 if (nonreturnable_.length()) 598 return nonreturnable_.find(E) != string_type::npos; 599 else{ 600 if (no_isspace_) {return false;} 601 else{ 602 int r = Traits::isspace(E); 603 return r != 0; 604 } 605 } 606 } 607 608 public: char_delimiters_separator(bool return_delims=false,const Char * returnable=0,const Char * nonreturnable=0)609 explicit char_delimiters_separator(bool return_delims = false, 610 const Char* returnable = 0, 611 const Char* nonreturnable = 0) 612 : returnable_(returnable ? returnable : string_type().c_str()), 613 nonreturnable_(nonreturnable ? 
nonreturnable:string_type().c_str()), 614 return_delims_(return_delims), no_ispunct_(returnable!=0), 615 no_isspace_(nonreturnable!=0) { } 616 reset()617 void reset() { } 618 619 public: 620 621 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)622 bool operator()(InputIterator& next, InputIterator end,Token& tok) { 623 tok = Token(); 624 625 // skip past all nonreturnable delims 626 // skip past the returnable only if we are not returning delims 627 for (;next!=end && ( is_nonret(*next) || (is_ret(*next) 628 && !return_delims_ ) );++next) { } 629 630 if (next == end) { 631 return false; 632 } 633 634 // if we are to return delims and we are one a returnable one 635 // move past it and stop 636 if (is_ret(*next) && return_delims_) { 637 tok+=*next; 638 ++next; 639 } 640 else 641 // append all the non delim characters 642 for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next) 643 tok+=*next; 644 645 646 return true; 647 } 648 }; 649 650 651 } //namespace boost 652 653 #endif 654