1 // pest. The Elegant Parser 2 // Copyright (c) 2018 Dragoș Tiselice 3 // 4 // Licensed under the Apache License, Version 2.0 5 // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT 6 // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 // option. All files in the project carrying such notice may not be copied, 8 // modified, or distributed except according to those terms. 9 #![no_std] 10 #![doc( 11 html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg", 12 html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg" 13 )] 14 #![warn(missing_docs, rust_2018_idioms, unused_qualifications)] 15 //! # pest. The Elegant Parser 16 //! 17 //! pest is a general purpose parser written in Rust with a focus on accessibility, correctness, 18 //! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in 19 //! spirit to regular expressions, but which offer the enhanced expressivity needed to parse 20 //! complex languages. 21 //! 22 //! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar 23 //! 24 //! ## Getting started 25 //! 26 //! The recommended way to start parsing with pest is to read the official [book]. 27 //! 28 //! Other helpful resources: 29 //! 30 //! * API reference on [docs.rs] 31 //! * play with grammars and share them on our [fiddle] 32 //! * find previous common questions answered or ask questions on [GitHub Discussions] 33 //! * leave feedback, ask questions, or greet us on [Gitter] or [Discord] 34 //! 35 //! [book]: https://pest.rs/book 36 //! [docs.rs]: https://docs.rs/pest 37 //! [fiddle]: https://pest.rs/#editor 38 //! [Gitter]: https://gitter.im/pest-parser/pest 39 //! [Discord]: https://discord.gg/XEGACtWpT2 40 //! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions 41 //! 42 //! ## Usage 43 //! 44 //! 
The core of pest is the trait [`Parser`], which provides an interface to the parsing 45 //! functionality. 46 //! 47 //! The accompanying crate `pest_derive` can automatically generate a [`Parser`] from a PEG 48 //! grammar. Using `pest_derive` is highly encouraged, but it is also possible to implement 49 //! [`Parser`] manually if required. 50 //! 51 //! ## `.pest` files 52 //! 53 //! Grammar definitions reside in custom `.pest` files located in the crate `src` directory. 54 //! Parsers are automatically generated from these files using `#[derive(Parser)]` and a special 55 //! `#[grammar = "..."]` attribute on a dummy struct. 56 //! 57 //! ```ignore 58 //! #[derive(Parser)] 59 //! #[grammar = "path/to/my_grammar.pest"] // relative to src 60 //! struct MyParser; 61 //! ``` 62 //! 63 //! The syntax of `.pest` files is documented in the [`pest_derive` crate]. 64 //! 65 //! ## Inline grammars 66 //! 67 //! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute. 68 //! 69 //! [`Parser`]: trait.Parser.html 70 //! [`pest_derive` crate]: https://docs.rs/pest_derive/ 71 //! 72 //! ## Grammar 73 //! 74 //! A grammar is a series of rules separated by whitespace, possibly containing comments. 75 //! 76 //! ### Comments 77 //! 78 //! Comments start with `//` and end at the end of the line. 79 //! 80 //! ```text 81 //! // a comment 82 //! ``` 83 //! 84 //! ### Rules 85 //! 86 //! Rules have the following form: 87 //! 88 //! ```ignore 89 //! name = optional_modifier { expression } 90 //! ``` 91 //! 92 //! The name of the rule is formed from alphanumeric characters or `_` with the condition that the 93 //! first character is not a digit and is used to create token pairs. When the rule starts being 94 //! parsed, the starting part of the token is being produced, with the ending part being produced 95 //! when the rule finishes parsing. 96 //! 97 //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end 98 //! 
`b`, start `c`, end `c`, end `a`. 99 //! 100 //! #### Modifiers 101 //! 102 //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the 103 //! behavior of the rules. 104 //! 105 //! 1. Silent (`_`) 106 //! 107 //! Silent rules do not create token pairs during parsing, nor are they error-reported. 108 //! 109 //! ```ignore 110 //! a = _{ "a" } 111 //! b = { a ~ "b" } 112 //! ``` 113 //! 114 //! Parsing `"ab"` produces the token pair `b()`. 115 //! 116 //! 2. Atomic (`@`) 117 //! 118 //! Atomic rules do not accept whitespace or comments within their expressions and have a 119 //! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic 120 //! rules behave atomically. 121 //! 122 //! Any rules called by atomic rules do not generate token pairs. 123 //! 124 //! ```ignore 125 //! a = { "a" } 126 //! b = @{ a ~ "b" } 127 //! 128 //! WHITESPACE = _{ " " } 129 //! ``` 130 //! 131 //! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error. 132 //! 133 //! 3. Compound-atomic (`$`) 134 //! 135 //! Compound-atomic are identical to atomic rules with the exception that rules called by them are 136 //! not forbidden from generating token pairs. 137 //! 138 //! ```ignore 139 //! a = { "a" } 140 //! b = ${ a ~ "b" } 141 //! 142 //! WHITESPACE = _{ " " } 143 //! ``` 144 //! 145 //! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error. 146 //! 147 //! 4. Non-atomic (`!`) 148 //! 149 //! Non-atomic are identical to normal rules with the exception that they stop the cascading effect 150 //! of atomic and compound-atomic rules. 151 //! 152 //! ```ignore 153 //! a = { "a" } 154 //! b = !{ a ~ "b" } 155 //! c = @{ b } 156 //! 157 //! WHITESPACE = _{ " " } 158 //! ``` 159 //! 160 //! Parsing both `"ab"` and `"a b"` produce the token pairs `c(a())`. 161 //! 162 //! #### Expressions 163 //! 164 //! Expressions can be either terminals or non-terminals. 165 //! 
166 //! 1. Terminals 167 //! 168 //! | Terminal | Usage | 169 //! |------------|----------------------------------------------------------------| 170 //! | `"a"` | matches the exact string `"a"` | 171 //! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) | 172 //! | `'a'..'z'` | matches one character between `'a'` and `'z'` | 173 //! | `a` | matches rule `a` | 174 //! 175 //! Strings and characters follow 176 //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while 177 //! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not 178 //! start with a digit. 179 //! 180 //! 2. Non-terminals 181 //! 182 //! | Non-terminal | Usage | 183 //! |-----------------------|------------------------------------------------------------| 184 //! | `(e)` | matches `e` | 185 //! | `e1 ~ e2` | matches the sequence `e1` `e2` | 186 //! | <code>e1 \| e2</code> | matches either `e1` or `e2` | 187 //! | `e*` | matches `e` zero or more times | 188 //! | `e+` | matches `e` one or more times | 189 //! | `e{n}` | matches `e` exactly `n` times | 190 //! | `e{, n}` | matches `e` at most `n` times | 191 //! | `e{n,}` | matches `e` at least `n` times | 192 //! | `e{m, n}` | matches `e` between `m` and `n` times inclusively | 193 //! | `e?` | optionally matches `e` | 194 //! | `&e` | matches `e` without making progress | 195 //! | `!e` | matches if `e` doesn't match without making progress | 196 //! | `PUSH(e)` | matches `e` and pushes its captured string down the stack | 197 //! 198 //! where `e`, `e1`, and `e2` are expressions. 199 //! 200 //! Matching is greedy, without backtracking. Note the difference in behavior for 201 //! these two rules in matching identifiers that don't end in an underscore: 202 //! 203 //! ```ignore 204 //! // input: ab_bb_b 205 //! 206 //! identifier = @{ "a" ~ ("b"|"_")* ~ "b" } 207 //! // matches: a b_bb_b nothing -> error! 208 //! 209 //! 
identifier = @{ "a" ~ ("_"* ~ "b")* } 210 //! // matches: a b, _bb, _b in three repetitions 211 //! ``` 212 //! 213 //! Expressions can modify the stack only if they match the input. For example, 214 //! if `e1` in the compound expression `e1 | e2` does not match the input, then 215 //! it does not modify the stack, so `e2` sees the stack in the same state as 216 //! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`, 217 //! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e` 218 //! expressions are a special case; they never modify the stack. 219 //! Many languages have "keyword" tokens (e.g. if, for, while) as well as general 220 //! tokens (e.g. identifier) that match any word. In order to match a keyword, 221 //! generally, you may need to ensure that it is not immediately followed by another 222 //! letter or digit (otherwise it would be matched as an identifier). 223 //! 224 //! ## Special rules 225 //! 226 //! Special rules can be called within the grammar. They are: 227 //! 228 //! * `WHITESPACE` - runs between rules and sub-rules 229 //! * `COMMENT` - runs between rules and sub-rules 230 //! * `ANY` - matches exactly one `char` 231 //! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position 232 //! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end 233 //! * `POP` - pops a string from the stack and matches it 234 //! * `POP_ALL` - pops the entire state of the stack and matches it 235 //! * `PEEK` - peeks a string from the stack and matches it 236 //! * `PEEK[a..b]` - peeks part of the stack and matches it 237 //! * `PEEK_ALL` - peeks the entire state of the stack and matches it 238 //! * `DROP` - drops the top of the stack (fails to match if the stack is empty) 239 //! 240 //! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be 241 //! overridden. 242 //! 243 //! ## `WHITESPACE` and `COMMENT` 244 //! 245 //! 
When defined, these rules get matched automatically in sequences (`~`) and repetitions 246 //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt 247 //! from this behavior. 248 //! 249 //! These rules should be defined so as to match one whitespace character and one comment only since 250 //! they are run in repetitions. 251 //! 252 //! If both `WHITESPACE` and `COMMENT` are defined, this grammar: 253 //! 254 //! ```ignore 255 //! a = { b ~ c } 256 //! ``` 257 //! 258 //! is effectively transformed into this one behind the scenes: 259 //! 260 //! ```ignore 261 //! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c } 262 //! ``` 263 //! 264 //! ## `PUSH`, `POP`, `DROP`, and `PEEK` 265 //! 266 //! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can 267 //! then later be used to match grammar based on its content with `POP` and `PEEK`. 268 //! 269 //! `PEEK` always matches the string at the top of the stack. So, if the stack contains `["b", "a"]` 270 //! (`"a"` being on top), this grammar: 271 //! 272 //! ```ignore 273 //! a = { PEEK } 274 //! ``` 275 //! 276 //! is effectively transformed into this one at parse time: 277 //! 278 //! ```ignore 279 //! a = { "a" } 280 //! ``` 281 //! 282 //! `POP` works the same way with the exception that it pops the string off of the stack if the 283 //! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated 284 //! to `["b"]`. 285 //! 286 //! `DROP` makes it possible to remove the string at the top of the stack 287 //! without matching it. If the stack is nonempty, `DROP` drops the top of the 288 //! stack. If the stack is empty, then `DROP` fails to match. 289 //! 290 //! ### Advanced peeking 291 //! 292 //! `PEEK[start..end]` and `PEEK_ALL` allow peeking deeper into the stack. The syntax works exactly 293 //! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an 294 //! 
offset from the top. If the end lies before or at the start, the expression matches (as does 295 //! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top): 296 //! 297 //! ```ignore 298 //! fill = PUSH("c") ~ PUSH("b") ~ PUSH("a") 299 //! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" } // top to bottom 300 //! w = { PEEK[..] } = { "c" ~ "b" ~ "a" } // bottom to top 301 //! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" } 302 //! y = { PEEK[..-2] } = { PEEK[0..1] } = { "a" } 303 //! z = { PEEK[1..] } = { PEEK[-2..3] } = { "c" ~ "b" } 304 //! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" } 305 //! ``` 306 //! 307 //! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches 308 //! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom. 309 //! 310 //! ## `Rule` 311 //! 312 //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This 313 //! implements `pest`'s `RuleType` and can be used throughout the API. 314 //! 315 //! ## `Built-in rules` 316 //! 317 //! Pest also comes with a number of built-in rules for convenience. They are: 318 //! 319 //! * `ASCII_DIGIT` - matches a numeric character from 0..9 320 //! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9 321 //! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1 322 //! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7 323 //! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F 324 //! * `ASCII_ALPHA_LOWER` - matches a character from a..z 325 //! * `ASCII_ALPHA_UPPER` - matches a character from A..Z 326 //! * `ASCII_ALPHA` - matches a character from a..z or A..Z 327 //! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9 328 //! * `ASCII` - matches a character from \x00..\x7f 329 //! 
* `NEWLINE` - matches either "\n" or "\r\n" or "\r"

#![doc(html_root_url = "https://docs.rs/pest")]

// This is a `#![no_std]` crate (see the attribute at the top of the file):
// `alloc` provides heap allocation, and `std` is linked only when the
// optional "std" feature is enabled.
extern crate alloc;
#[cfg(feature = "std")]
extern crate std;

// Re-exports that form the crate's public API surface.
pub use crate::parser::Parser;
pub use crate::parser_state::{
    set_call_limit, state, Atomicity, Lookahead, MatchDir, ParseResult, ParserState,
};
pub use crate::position::Position;
pub use crate::span::{merge_spans, Lines, LinesSpan, Span};
pub use crate::stack::Stack;
pub use crate::token::Token;
use core::fmt::Debug;
use core::hash::Hash;

// Public modules.
pub mod error;
pub mod iterators;
// Internal modules; their public items are surfaced via the re-exports above.
mod macros;
mod parser;
mod parser_state;
mod position;
pub mod pratt_parser;
#[deprecated(
    since = "2.4.0",
    note = "Use `pest::pratt_parser` instead (it is an equivalent which also supports unary prefix/suffix operators).
While prec_climber is going to be kept in 2.x minor and patch releases, it may be removed in a future major release."
)]
pub mod prec_climber;
mod span;
mod stack;
mod token;

// Hidden from rustdoc: not part of the stable public API (presumably used by
// generated parser code — confirm against pest_derive's output).
#[doc(hidden)]
pub mod unicode;

/// A trait which parser rules must implement.
///
/// This trait is set up so that any struct that implements all of its required traits will
/// automatically implement this trait as well.
///
/// This is essentially a [trait alias](https://github.com/rust-lang/rfcs/pull/1733). When trait
/// aliases are implemented, this may be replaced by one.
pub trait RuleType: Copy + Debug + Eq + Hash + Ord {}

// Blanket impl: every `Copy + Debug + Eq + Hash + Ord` type is a `RuleType`,
// so downstream crates never implement the trait by hand.
impl<T: Copy + Debug + Eq + Hash + Ord> RuleType for T {}