1 /*!
2 Utilities for working with I/O using byte strings.
3 
4 This module currently only exports a single trait, `BufReadExt`, which provides
5 facilities for conveniently and efficiently working with lines as byte strings.
6 
7 More APIs may be added in the future.
8 */
9 
10 use alloc::{vec, vec::Vec};
11 
12 use std::io;
13 
14 use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
15 
16 /// An extension trait for
17 /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
18 /// which provides convenience APIs for dealing with byte strings.
19 pub trait BufReadExt: io::BufRead {
20     /// Returns an iterator over the lines of this reader, where each line
21     /// is represented as a byte string.
22     ///
23     /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
24     /// an error is yielded if there was a problem reading from the underlying
25     /// reader.
26     ///
27     /// On success, the next line in the iterator is returned. The line does
28     /// *not* contain a trailing `\n` or `\r\n`.
29     ///
30     /// # Examples
31     ///
32     /// Basic usage:
33     ///
34     /// ```
35     /// use std::io;
36     ///
37     /// use bstr::io::BufReadExt;
38     ///
39     /// # fn example() -> Result<(), io::Error> {
40     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
41     ///
42     /// let mut lines = vec![];
43     /// for result in cursor.byte_lines() {
44     ///     let line = result?;
45     ///     lines.push(line);
46     /// }
47     /// assert_eq!(lines.len(), 3);
48     /// assert_eq!(lines[0], "lorem".as_bytes());
49     /// assert_eq!(lines[1], "ipsum".as_bytes());
50     /// assert_eq!(lines[2], "dolor".as_bytes());
51     /// # Ok(()) }; example().unwrap()
52     /// ```
byte_lines(self) -> ByteLines<Self> where Self: Sized,53     fn byte_lines(self) -> ByteLines<Self>
54     where
55         Self: Sized,
56     {
57         ByteLines { buf: self }
58     }
59 
60     /// Returns an iterator over byte-terminated records of this reader, where
61     /// each record is represented as a byte string.
62     ///
63     /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
64     /// an error is yielded if there was a problem reading from the underlying
65     /// reader.
66     ///
67     /// On success, the next record in the iterator is returned. The record
68     /// does *not* contain its trailing terminator.
69     ///
70     /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
71     /// that it has no special handling for `\r`.
72     ///
73     /// # Examples
74     ///
75     /// Basic usage:
76     ///
77     /// ```
78     /// use std::io;
79     ///
80     /// use bstr::io::BufReadExt;
81     ///
82     /// # fn example() -> Result<(), io::Error> {
83     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
84     ///
85     /// let mut records = vec![];
86     /// for result in cursor.byte_records(b'\x00') {
87     ///     let record = result?;
88     ///     records.push(record);
89     /// }
90     /// assert_eq!(records.len(), 3);
91     /// assert_eq!(records[0], "lorem".as_bytes());
92     /// assert_eq!(records[1], "ipsum".as_bytes());
93     /// assert_eq!(records[2], "dolor".as_bytes());
94     /// # Ok(()) }; example().unwrap()
95     /// ```
byte_records(self, terminator: u8) -> ByteRecords<Self> where Self: Sized,96     fn byte_records(self, terminator: u8) -> ByteRecords<Self>
97     where
98         Self: Sized,
99     {
100         ByteRecords { terminator, buf: self }
101     }
102 
103     /// Executes the given closure on each line in the underlying reader.
104     ///
105     /// If the closure returns an error (or if the underlying reader returns an
106     /// error), then iteration is stopped and the error is returned. If false
107     /// is returned, then iteration is stopped and no error is returned.
108     ///
109     /// The closure given is called on exactly the same values as yielded by
110     /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
111     /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
112     ///
113     /// This routine is useful for iterating over lines as quickly as
114     /// possible. Namely, a single allocation is reused for each line.
115     ///
116     /// # Examples
117     ///
118     /// Basic usage:
119     ///
120     /// ```
121     /// use std::io;
122     ///
123     /// use bstr::io::BufReadExt;
124     ///
125     /// # fn example() -> Result<(), io::Error> {
126     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
127     ///
128     /// let mut lines = vec![];
129     /// cursor.for_byte_line(|line| {
130     ///     lines.push(line.to_vec());
131     ///     Ok(true)
132     /// })?;
133     /// assert_eq!(lines.len(), 3);
134     /// assert_eq!(lines[0], "lorem".as_bytes());
135     /// assert_eq!(lines[1], "ipsum".as_bytes());
136     /// assert_eq!(lines[2], "dolor".as_bytes());
137     /// # Ok(()) }; example().unwrap()
138     /// ```
for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,139     fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
140     where
141         Self: Sized,
142         F: FnMut(&[u8]) -> io::Result<bool>,
143     {
144         self.for_byte_line_with_terminator(|line| {
145             for_each_line(&trim_line_slice(&line))
146         })
147     }
148 
149     /// Executes the given closure on each byte-terminated record in the
150     /// underlying reader.
151     ///
152     /// If the closure returns an error (or if the underlying reader returns an
153     /// error), then iteration is stopped and the error is returned. If false
154     /// is returned, then iteration is stopped and no error is returned.
155     ///
156     /// The closure given is called on exactly the same values as yielded by
157     /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
158     /// iterator. Namely, records do _not_ contain a trailing terminator byte.
159     ///
160     /// This routine is useful for iterating over records as quickly as
161     /// possible. Namely, a single allocation is reused for each record.
162     ///
163     /// # Examples
164     ///
165     /// Basic usage:
166     ///
167     /// ```
168     /// use std::io;
169     ///
170     /// use bstr::io::BufReadExt;
171     ///
172     /// # fn example() -> Result<(), io::Error> {
173     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
174     ///
175     /// let mut records = vec![];
176     /// cursor.for_byte_record(b'\x00', |record| {
177     ///     records.push(record.to_vec());
178     ///     Ok(true)
179     /// })?;
180     /// assert_eq!(records.len(), 3);
181     /// assert_eq!(records[0], "lorem".as_bytes());
182     /// assert_eq!(records[1], "ipsum".as_bytes());
183     /// assert_eq!(records[2], "dolor".as_bytes());
184     /// # Ok(()) }; example().unwrap()
185     /// ```
for_byte_record<F>( &mut self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,186     fn for_byte_record<F>(
187         &mut self,
188         terminator: u8,
189         mut for_each_record: F,
190     ) -> io::Result<()>
191     where
192         Self: Sized,
193         F: FnMut(&[u8]) -> io::Result<bool>,
194     {
195         self.for_byte_record_with_terminator(terminator, |chunk| {
196             for_each_record(&trim_record_slice(&chunk, terminator))
197         })
198     }
199 
200     /// Executes the given closure on each line in the underlying reader.
201     ///
202     /// If the closure returns an error (or if the underlying reader returns an
203     /// error), then iteration is stopped and the error is returned. If false
204     /// is returned, then iteration is stopped and no error is returned.
205     ///
206     /// Unlike
207     /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
208     /// the lines given to the closure *do* include the line terminator, if one
209     /// exists.
210     ///
211     /// This routine is useful for iterating over lines as quickly as
212     /// possible. Namely, a single allocation is reused for each line.
213     ///
214     /// This is identical to `for_byte_record_with_terminator` with a
215     /// terminator of `\n`.
216     ///
217     /// # Examples
218     ///
219     /// Basic usage:
220     ///
221     /// ```
222     /// use std::io;
223     ///
224     /// use bstr::io::BufReadExt;
225     ///
226     /// # fn example() -> Result<(), io::Error> {
227     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
228     ///
229     /// let mut lines = vec![];
230     /// cursor.for_byte_line_with_terminator(|line| {
231     ///     lines.push(line.to_vec());
232     ///     Ok(true)
233     /// })?;
234     /// assert_eq!(lines.len(), 3);
235     /// assert_eq!(lines[0], "lorem\n".as_bytes());
236     /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
237     /// assert_eq!(lines[2], "dolor".as_bytes());
238     /// # Ok(()) }; example().unwrap()
239     /// ```
for_byte_line_with_terminator<F>( &mut self, for_each_line: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,240     fn for_byte_line_with_terminator<F>(
241         &mut self,
242         for_each_line: F,
243     ) -> io::Result<()>
244     where
245         Self: Sized,
246         F: FnMut(&[u8]) -> io::Result<bool>,
247     {
248         self.for_byte_record_with_terminator(b'\n', for_each_line)
249     }
250 
251     /// Executes the given closure on each byte-terminated record in the
252     /// underlying reader.
253     ///
254     /// If the closure returns an error (or if the underlying reader returns an
255     /// error), then iteration is stopped and the error is returned. If false
256     /// is returned, then iteration is stopped and no error is returned.
257     ///
258     /// Unlike
259     /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
260     /// the lines given to the closure *do* include the record terminator, if
261     /// one exists.
262     ///
263     /// This routine is useful for iterating over records as quickly as
264     /// possible. Namely, a single allocation is reused for each record.
265     ///
266     /// # Examples
267     ///
268     /// Basic usage:
269     ///
270     /// ```
271     /// use std::io;
272     ///
273     /// use bstr::{io::BufReadExt, B};
274     ///
275     /// # fn example() -> Result<(), io::Error> {
276     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
277     ///
278     /// let mut records = vec![];
279     /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
280     ///     records.push(record.to_vec());
281     ///     Ok(true)
282     /// })?;
283     /// assert_eq!(records.len(), 3);
284     /// assert_eq!(records[0], B(b"lorem\x00"));
285     /// assert_eq!(records[1], B("ipsum\x00"));
286     /// assert_eq!(records[2], B("dolor"));
287     /// # Ok(()) }; example().unwrap()
288     /// ```
for_byte_record_with_terminator<F>( &mut self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,289     fn for_byte_record_with_terminator<F>(
290         &mut self,
291         terminator: u8,
292         mut for_each_record: F,
293     ) -> io::Result<()>
294     where
295         Self: Sized,
296         F: FnMut(&[u8]) -> io::Result<bool>,
297     {
298         let mut bytes = vec![];
299         let mut res = Ok(());
300         let mut consumed = 0;
301         'outer: loop {
302             // Lend out complete record slices from our buffer
303             {
304                 let mut buf = self.fill_buf()?;
305                 while let Some(index) = buf.find_byte(terminator) {
306                     let (record, rest) = buf.split_at(index + 1);
307                     buf = rest;
308                     consumed += record.len();
309                     match for_each_record(&record) {
310                         Ok(false) => break 'outer,
311                         Err(err) => {
312                             res = Err(err);
313                             break 'outer;
314                         }
315                         _ => (),
316                     }
317                 }
318 
319                 // Copy the final record fragment to our local buffer. This
320                 // saves read_until() from re-scanning a buffer we know
321                 // contains no remaining terminators.
322                 bytes.extend_from_slice(&buf);
323                 consumed += buf.len();
324             }
325 
326             self.consume(consumed);
327             consumed = 0;
328 
329             // N.B. read_until uses a different version of memchr that may
330             // be slower than the memchr crate that bstr uses. However, this
331             // should only run for a fairly small number of records, assuming a
332             // decent buffer size.
333             self.read_until(terminator, &mut bytes)?;
334             if bytes.is_empty() || !for_each_record(&bytes)? {
335                 break;
336             }
337             bytes.clear();
338         }
339         self.consume(consumed);
340         res
341     }
342 }
343 
344 impl<B: io::BufRead> BufReadExt for B {}
345 
/// An iterator over lines from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// Each item is an `io::Result<Vec<u8>>`. On success the item holds one line
/// with its trailing `\n` or `\r\n` removed.
///
/// This iterator is generally created by calling the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    // The underlying buffered reader that lines are read from.
    buf: B,
}
358 
/// An iterator over records from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// A byte record is any sequence of bytes terminated by a particular byte
/// chosen by the caller. For example, NUL separated byte strings are said to
/// be NUL-terminated byte records.
///
/// Each item is an `io::Result<Vec<u8>>`. On success the item holds one
/// record with its trailing terminator byte, if any, removed.
///
/// This iterator is generally created by calling the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    // The underlying buffered reader that records are read from.
    buf: B,
    // The byte that marks the end of each record.
    terminator: u8,
}
376 
377 impl<B: io::BufRead> Iterator for ByteLines<B> {
378     type Item = io::Result<Vec<u8>>;
379 
next(&mut self) -> Option<io::Result<Vec<u8>>>380     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
381         let mut bytes = vec![];
382         match self.buf.read_until(b'\n', &mut bytes) {
383             Err(e) => Some(Err(e)),
384             Ok(0) => None,
385             Ok(_) => {
386                 trim_line(&mut bytes);
387                 Some(Ok(bytes))
388             }
389         }
390     }
391 }
392 
393 impl<B: io::BufRead> Iterator for ByteRecords<B> {
394     type Item = io::Result<Vec<u8>>;
395 
next(&mut self) -> Option<io::Result<Vec<u8>>>396     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
397         let mut bytes = vec![];
398         match self.buf.read_until(self.terminator, &mut bytes) {
399             Err(e) => Some(Err(e)),
400             Ok(0) => None,
401             Ok(_) => {
402                 trim_record(&mut bytes, self.terminator);
403                 Some(Ok(bytes))
404             }
405         }
406     }
407 }
408 
trim_line(line: &mut Vec<u8>)409 fn trim_line(line: &mut Vec<u8>) {
410     if line.last_byte() == Some(b'\n') {
411         line.pop_byte();
412         if line.last_byte() == Some(b'\r') {
413             line.pop_byte();
414         }
415     }
416 }
417 
/// Returns `line` without its trailing `\n` or `\r\n`, if it has one.
///
/// A `\r` that is not followed by `\n` is kept.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.split_last() {
        Some((&b'\n', body)) => match body.split_last() {
            // `\r\n` ending: drop both bytes.
            Some((&b'\r', text)) => text,
            _ => body,
        },
        _ => line,
    }
}
427 
trim_record(record: &mut Vec<u8>, terminator: u8)428 fn trim_record(record: &mut Vec<u8>, terminator: u8) {
429     if record.last_byte() == Some(terminator) {
430         record.pop_byte();
431     }
432 }
433 
/// Returns `record` without a single trailing `terminator` byte, if the
/// record ends with one.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    match record.split_last() {
        Some((&last, body)) if last == terminator => body,
        _ => record,
    }
}
440 
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::bstring::BString;

    use super::BufReadExt;

    /// Gathers every slice that `for_byte_line` passes to its closure.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut reader = slice.as_ref();
        let mut seen = Vec::new();
        reader
            .for_byte_line(|line| {
                seen.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        seen
    }

    /// Gathers every slice that `for_byte_line_with_terminator` passes to
    /// its closure.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut reader = slice.as_ref();
        let mut seen = Vec::new();
        reader
            .for_byte_line_with_terminator(|line| {
                seen.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        seen
    }

    #[test]
    fn lines_without_terminator() {
        // (input, lines expected with their terminators stripped)
        let cases: &[(&str, &[&str])] = &[
            ("", &[]),
            // `\n` line endings
            ("\n", &[""]),
            ("\n\n", &["", ""]),
            ("a\nb\n", &["a", "b"]),
            ("a\nb", &["a", "b"]),
            ("abc\nxyz\n", &["abc", "xyz"]),
            ("abc\nxyz", &["abc", "xyz"]),
            // `\r\n` line endings
            ("\r\n", &[""]),
            ("\r\n\r\n", &["", ""]),
            ("a\r\nb\r\n", &["a", "b"]),
            ("a\r\nb", &["a", "b"]),
            ("abc\r\nxyz\r\n", &["abc", "xyz"]),
            ("abc\r\nxyz", &["abc", "xyz"]),
            // a lone `\r` is not a line ending
            ("abc\rxyz", &["abc\rxyz"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(collect_lines(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn lines_with_terminator() {
        // (input, lines expected with their terminators kept)
        let cases: &[(&str, &[&str])] = &[
            ("", &[]),
            // `\n` line endings
            ("\n", &["\n"]),
            ("\n\n", &["\n", "\n"]),
            ("a\nb\n", &["a\n", "b\n"]),
            ("a\nb", &["a\n", "b"]),
            ("abc\nxyz\n", &["abc\n", "xyz\n"]),
            ("abc\nxyz", &["abc\n", "xyz"]),
            // `\r\n` line endings
            ("\r\n", &["\r\n"]),
            ("\r\n\r\n", &["\r\n", "\r\n"]),
            ("a\r\nb\r\n", &["a\r\n", "b\r\n"]),
            ("a\r\nb", &["a\r\n", "b"]),
            ("abc\r\nxyz\r\n", &["abc\r\n", "xyz\r\n"]),
            ("abc\r\nxyz", &["abc\r\n", "xyz"]),
            // a lone `\r` is not a line ending
            ("abc\rxyz", &["abc\rxyz"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(
                collect_lines_term(input),
                expected,
                "input: {:?}",
                input
            );
        }
    }
}
516