// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
8*103e46e4SHarish Mahendrakar
9*103e46e4SHarish Mahendrakar #include "webvttparser.h"
10*103e46e4SHarish Mahendrakar
11*103e46e4SHarish Mahendrakar #include <ctype.h>
12*103e46e4SHarish Mahendrakar
13*103e46e4SHarish Mahendrakar #include <climits>
14*103e46e4SHarish Mahendrakar #include <cstddef>
15*103e46e4SHarish Mahendrakar
16*103e46e4SHarish Mahendrakar namespace libwebvtt {
17*103e46e4SHarish Mahendrakar
// Character codes that the line reader and the parsers compare against.
// The enum is marked NOLINT because clang-format would otherwise collapse
// the enumerators onto a single, hard-to-read line.
enum {
  kNUL = 0x00,   // string terminator / end-of-token sentinel
  kSPACE = ' ',  // SPACE
  kTAB = 0x09,   // CHARACTER TABULATION
  kLF = 0x0A,    // LINE FEED
  kCR = 0x0D     // CARRIAGE RETURN
};  // NOLINT
27*103e46e4SHarish Mahendrakar
~Reader()28*103e46e4SHarish Mahendrakar Reader::~Reader() {}
29*103e46e4SHarish Mahendrakar
~LineReader()30*103e46e4SHarish Mahendrakar LineReader::~LineReader() {}
31*103e46e4SHarish Mahendrakar
GetLine(std::string * line_ptr)32*103e46e4SHarish Mahendrakar int LineReader::GetLine(std::string* line_ptr) {
33*103e46e4SHarish Mahendrakar if (line_ptr == NULL)
34*103e46e4SHarish Mahendrakar return -1;
35*103e46e4SHarish Mahendrakar
36*103e46e4SHarish Mahendrakar std::string& ln = *line_ptr;
37*103e46e4SHarish Mahendrakar ln.clear();
38*103e46e4SHarish Mahendrakar
39*103e46e4SHarish Mahendrakar // Consume characters from the stream, until we
40*103e46e4SHarish Mahendrakar // reach end-of-line (or end-of-stream).
41*103e46e4SHarish Mahendrakar
42*103e46e4SHarish Mahendrakar // The WebVTT spec states that lines may be
43*103e46e4SHarish Mahendrakar // terminated in any of these three ways:
44*103e46e4SHarish Mahendrakar // LF
45*103e46e4SHarish Mahendrakar // CR
46*103e46e4SHarish Mahendrakar // CR LF
47*103e46e4SHarish Mahendrakar
48*103e46e4SHarish Mahendrakar // We interrogate each character as we read it from the stream.
49*103e46e4SHarish Mahendrakar // If we detect an end-of-line character, we consume the full
50*103e46e4SHarish Mahendrakar // end-of-line indication, and we're done; otherwise, accumulate
51*103e46e4SHarish Mahendrakar // the character and repeat.
52*103e46e4SHarish Mahendrakar
53*103e46e4SHarish Mahendrakar for (;;) {
54*103e46e4SHarish Mahendrakar char c;
55*103e46e4SHarish Mahendrakar const int e = GetChar(&c);
56*103e46e4SHarish Mahendrakar
57*103e46e4SHarish Mahendrakar if (e < 0) // error
58*103e46e4SHarish Mahendrakar return e;
59*103e46e4SHarish Mahendrakar
60*103e46e4SHarish Mahendrakar if (e > 0) // EOF
61*103e46e4SHarish Mahendrakar return (ln.empty()) ? 1 : 0;
62*103e46e4SHarish Mahendrakar
63*103e46e4SHarish Mahendrakar // We have a character, so we must first determine
64*103e46e4SHarish Mahendrakar // whether we have reached end-of-line.
65*103e46e4SHarish Mahendrakar
66*103e46e4SHarish Mahendrakar if (c == kLF)
67*103e46e4SHarish Mahendrakar return 0; // handle the easy end-of-line case immediately
68*103e46e4SHarish Mahendrakar
69*103e46e4SHarish Mahendrakar if (c == kCR)
70*103e46e4SHarish Mahendrakar break; // handle the hard end-of-line case outside of loop
71*103e46e4SHarish Mahendrakar
72*103e46e4SHarish Mahendrakar if (c == '\xFE' || c == '\xFF') // not UTF-8
73*103e46e4SHarish Mahendrakar return -1;
74*103e46e4SHarish Mahendrakar
75*103e46e4SHarish Mahendrakar // To defend against pathological or malicious streams, we
76*103e46e4SHarish Mahendrakar // cap the line length at some arbitrarily-large value:
77*103e46e4SHarish Mahendrakar enum { kMaxLineLength = 10000 }; // arbitrary
78*103e46e4SHarish Mahendrakar
79*103e46e4SHarish Mahendrakar if (ln.length() >= kMaxLineLength)
80*103e46e4SHarish Mahendrakar return -1;
81*103e46e4SHarish Mahendrakar
82*103e46e4SHarish Mahendrakar // We don't have an end-of-line character, so accumulate
83*103e46e4SHarish Mahendrakar // the character in our line buffer.
84*103e46e4SHarish Mahendrakar ln.push_back(c);
85*103e46e4SHarish Mahendrakar }
86*103e46e4SHarish Mahendrakar
87*103e46e4SHarish Mahendrakar // We detected a CR. We must interrogate the next character
88*103e46e4SHarish Mahendrakar // in the stream, to determine whether we have a LF (which
89*103e46e4SHarish Mahendrakar // would make it part of this same line).
90*103e46e4SHarish Mahendrakar
91*103e46e4SHarish Mahendrakar char c;
92*103e46e4SHarish Mahendrakar const int e = GetChar(&c);
93*103e46e4SHarish Mahendrakar
94*103e46e4SHarish Mahendrakar if (e < 0) // error
95*103e46e4SHarish Mahendrakar return e;
96*103e46e4SHarish Mahendrakar
97*103e46e4SHarish Mahendrakar if (e > 0) // EOF
98*103e46e4SHarish Mahendrakar return 0;
99*103e46e4SHarish Mahendrakar
100*103e46e4SHarish Mahendrakar // If next character in the stream is not a LF, return it
101*103e46e4SHarish Mahendrakar // to the stream (because it's part of the next line).
102*103e46e4SHarish Mahendrakar if (c != kLF)
103*103e46e4SHarish Mahendrakar UngetChar(c);
104*103e46e4SHarish Mahendrakar
105*103e46e4SHarish Mahendrakar return 0;
106*103e46e4SHarish Mahendrakar }
107*103e46e4SHarish Mahendrakar
Parser(Reader * r)108*103e46e4SHarish Mahendrakar Parser::Parser(Reader* r) : reader_(r), unget_(-1) {}
109*103e46e4SHarish Mahendrakar
~Parser()110*103e46e4SHarish Mahendrakar Parser::~Parser() {}
111*103e46e4SHarish Mahendrakar
Init()112*103e46e4SHarish Mahendrakar int Parser::Init() {
113*103e46e4SHarish Mahendrakar int e = ParseBOM();
114*103e46e4SHarish Mahendrakar
115*103e46e4SHarish Mahendrakar if (e < 0) // error
116*103e46e4SHarish Mahendrakar return e;
117*103e46e4SHarish Mahendrakar
118*103e46e4SHarish Mahendrakar if (e > 0) // EOF
119*103e46e4SHarish Mahendrakar return -1;
120*103e46e4SHarish Mahendrakar
121*103e46e4SHarish Mahendrakar // Parse "WEBVTT". We read from the stream one character at-a-time, in
122*103e46e4SHarish Mahendrakar // order to defend against non-WebVTT streams (e.g. binary files) that don't
123*103e46e4SHarish Mahendrakar // happen to comprise lines of text demarcated with line terminators.
124*103e46e4SHarish Mahendrakar
125*103e46e4SHarish Mahendrakar const char kId[] = "WEBVTT";
126*103e46e4SHarish Mahendrakar
127*103e46e4SHarish Mahendrakar for (const char* p = kId; *p; ++p) {
128*103e46e4SHarish Mahendrakar char c;
129*103e46e4SHarish Mahendrakar e = GetChar(&c);
130*103e46e4SHarish Mahendrakar
131*103e46e4SHarish Mahendrakar if (e < 0) // error
132*103e46e4SHarish Mahendrakar return e;
133*103e46e4SHarish Mahendrakar
134*103e46e4SHarish Mahendrakar if (e > 0) // EOF
135*103e46e4SHarish Mahendrakar return -1;
136*103e46e4SHarish Mahendrakar
137*103e46e4SHarish Mahendrakar if (c != *p)
138*103e46e4SHarish Mahendrakar return -1;
139*103e46e4SHarish Mahendrakar }
140*103e46e4SHarish Mahendrakar
141*103e46e4SHarish Mahendrakar std::string line;
142*103e46e4SHarish Mahendrakar
143*103e46e4SHarish Mahendrakar e = GetLine(&line);
144*103e46e4SHarish Mahendrakar
145*103e46e4SHarish Mahendrakar if (e < 0) // error
146*103e46e4SHarish Mahendrakar return e;
147*103e46e4SHarish Mahendrakar
148*103e46e4SHarish Mahendrakar if (e > 0) // EOF
149*103e46e4SHarish Mahendrakar return 0; // weird but valid
150*103e46e4SHarish Mahendrakar
151*103e46e4SHarish Mahendrakar if (!line.empty()) {
152*103e46e4SHarish Mahendrakar // Parse optional characters that follow "WEBVTT"
153*103e46e4SHarish Mahendrakar
154*103e46e4SHarish Mahendrakar const char c = line[0];
155*103e46e4SHarish Mahendrakar
156*103e46e4SHarish Mahendrakar if (c != kSPACE && c != kTAB)
157*103e46e4SHarish Mahendrakar return -1;
158*103e46e4SHarish Mahendrakar }
159*103e46e4SHarish Mahendrakar
160*103e46e4SHarish Mahendrakar // The WebVTT spec requires that the "WEBVTT" line
161*103e46e4SHarish Mahendrakar // be followed by an empty line (to separate it from
162*103e46e4SHarish Mahendrakar // first cue).
163*103e46e4SHarish Mahendrakar
164*103e46e4SHarish Mahendrakar e = GetLine(&line);
165*103e46e4SHarish Mahendrakar
166*103e46e4SHarish Mahendrakar if (e < 0) // error
167*103e46e4SHarish Mahendrakar return e;
168*103e46e4SHarish Mahendrakar
169*103e46e4SHarish Mahendrakar if (e > 0) // EOF
170*103e46e4SHarish Mahendrakar return 0; // weird but we allow it
171*103e46e4SHarish Mahendrakar
172*103e46e4SHarish Mahendrakar if (!line.empty())
173*103e46e4SHarish Mahendrakar return -1;
174*103e46e4SHarish Mahendrakar
175*103e46e4SHarish Mahendrakar return 0; // success
176*103e46e4SHarish Mahendrakar }
177*103e46e4SHarish Mahendrakar
Parse(Cue * cue)178*103e46e4SHarish Mahendrakar int Parser::Parse(Cue* cue) {
179*103e46e4SHarish Mahendrakar if (cue == NULL)
180*103e46e4SHarish Mahendrakar return -1;
181*103e46e4SHarish Mahendrakar
182*103e46e4SHarish Mahendrakar // Parse first non-blank line
183*103e46e4SHarish Mahendrakar
184*103e46e4SHarish Mahendrakar std::string line;
185*103e46e4SHarish Mahendrakar int e;
186*103e46e4SHarish Mahendrakar
187*103e46e4SHarish Mahendrakar for (;;) {
188*103e46e4SHarish Mahendrakar e = GetLine(&line);
189*103e46e4SHarish Mahendrakar
190*103e46e4SHarish Mahendrakar if (e) // EOF is OK here
191*103e46e4SHarish Mahendrakar return e;
192*103e46e4SHarish Mahendrakar
193*103e46e4SHarish Mahendrakar if (!line.empty())
194*103e46e4SHarish Mahendrakar break;
195*103e46e4SHarish Mahendrakar }
196*103e46e4SHarish Mahendrakar
197*103e46e4SHarish Mahendrakar // A WebVTT cue comprises an optional cue identifier line followed
198*103e46e4SHarish Mahendrakar // by a (non-optional) timings line. You determine whether you have
199*103e46e4SHarish Mahendrakar // a timings line by scanning for the arrow token, the lexeme of which
200*103e46e4SHarish Mahendrakar // may not appear in the cue identifier line.
201*103e46e4SHarish Mahendrakar
202*103e46e4SHarish Mahendrakar const char kArrow[] = "-->";
203*103e46e4SHarish Mahendrakar std::string::size_type arrow_pos = line.find(kArrow);
204*103e46e4SHarish Mahendrakar
205*103e46e4SHarish Mahendrakar if (arrow_pos != std::string::npos) {
206*103e46e4SHarish Mahendrakar // We found a timings line, which implies that we don't have a cue
207*103e46e4SHarish Mahendrakar // identifier.
208*103e46e4SHarish Mahendrakar
209*103e46e4SHarish Mahendrakar cue->identifier.clear();
210*103e46e4SHarish Mahendrakar } else {
211*103e46e4SHarish Mahendrakar // We did not find a timings line, so we assume that we have a cue
212*103e46e4SHarish Mahendrakar // identifier line, and then try again to find the cue timings on
213*103e46e4SHarish Mahendrakar // the next line.
214*103e46e4SHarish Mahendrakar
215*103e46e4SHarish Mahendrakar cue->identifier.swap(line);
216*103e46e4SHarish Mahendrakar
217*103e46e4SHarish Mahendrakar e = GetLine(&line);
218*103e46e4SHarish Mahendrakar
219*103e46e4SHarish Mahendrakar if (e < 0) // error
220*103e46e4SHarish Mahendrakar return e;
221*103e46e4SHarish Mahendrakar
222*103e46e4SHarish Mahendrakar if (e > 0) // EOF
223*103e46e4SHarish Mahendrakar return -1;
224*103e46e4SHarish Mahendrakar
225*103e46e4SHarish Mahendrakar arrow_pos = line.find(kArrow);
226*103e46e4SHarish Mahendrakar
227*103e46e4SHarish Mahendrakar if (arrow_pos == std::string::npos) // not a timings line
228*103e46e4SHarish Mahendrakar return -1;
229*103e46e4SHarish Mahendrakar }
230*103e46e4SHarish Mahendrakar
231*103e46e4SHarish Mahendrakar e = ParseTimingsLine(&line, arrow_pos, &cue->start_time, &cue->stop_time,
232*103e46e4SHarish Mahendrakar &cue->settings);
233*103e46e4SHarish Mahendrakar
234*103e46e4SHarish Mahendrakar if (e) // error
235*103e46e4SHarish Mahendrakar return e;
236*103e46e4SHarish Mahendrakar
237*103e46e4SHarish Mahendrakar // The cue payload comprises all the non-empty
238*103e46e4SHarish Mahendrakar // lines that follow the timings line.
239*103e46e4SHarish Mahendrakar
240*103e46e4SHarish Mahendrakar Cue::payload_t& p = cue->payload;
241*103e46e4SHarish Mahendrakar p.clear();
242*103e46e4SHarish Mahendrakar
243*103e46e4SHarish Mahendrakar for (;;) {
244*103e46e4SHarish Mahendrakar e = GetLine(&line);
245*103e46e4SHarish Mahendrakar
246*103e46e4SHarish Mahendrakar if (e < 0) // error
247*103e46e4SHarish Mahendrakar return e;
248*103e46e4SHarish Mahendrakar
249*103e46e4SHarish Mahendrakar if (line.empty())
250*103e46e4SHarish Mahendrakar break;
251*103e46e4SHarish Mahendrakar
252*103e46e4SHarish Mahendrakar p.push_back(line);
253*103e46e4SHarish Mahendrakar }
254*103e46e4SHarish Mahendrakar
255*103e46e4SHarish Mahendrakar if (p.empty())
256*103e46e4SHarish Mahendrakar return -1;
257*103e46e4SHarish Mahendrakar
258*103e46e4SHarish Mahendrakar return 0; // success
259*103e46e4SHarish Mahendrakar }
260*103e46e4SHarish Mahendrakar
GetChar(char * c)261*103e46e4SHarish Mahendrakar int Parser::GetChar(char* c) {
262*103e46e4SHarish Mahendrakar if (unget_ >= 0) {
263*103e46e4SHarish Mahendrakar *c = static_cast<char>(unget_);
264*103e46e4SHarish Mahendrakar unget_ = -1;
265*103e46e4SHarish Mahendrakar return 0;
266*103e46e4SHarish Mahendrakar }
267*103e46e4SHarish Mahendrakar
268*103e46e4SHarish Mahendrakar return reader_->GetChar(c);
269*103e46e4SHarish Mahendrakar }
270*103e46e4SHarish Mahendrakar
UngetChar(char c)271*103e46e4SHarish Mahendrakar void Parser::UngetChar(char c) { unget_ = static_cast<unsigned char>(c); }
272*103e46e4SHarish Mahendrakar
ParseBOM()273*103e46e4SHarish Mahendrakar int Parser::ParseBOM() {
274*103e46e4SHarish Mahendrakar // Explanation of UTF-8 BOM:
275*103e46e4SHarish Mahendrakar // http://en.wikipedia.org/wiki/Byte_order_mark
276*103e46e4SHarish Mahendrakar
277*103e46e4SHarish Mahendrakar static const char BOM[] = "\xEF\xBB\xBF"; // UTF-8 BOM
278*103e46e4SHarish Mahendrakar
279*103e46e4SHarish Mahendrakar for (int i = 0; i < 3; ++i) {
280*103e46e4SHarish Mahendrakar char c;
281*103e46e4SHarish Mahendrakar int e = GetChar(&c);
282*103e46e4SHarish Mahendrakar
283*103e46e4SHarish Mahendrakar if (e < 0) // error
284*103e46e4SHarish Mahendrakar return e;
285*103e46e4SHarish Mahendrakar
286*103e46e4SHarish Mahendrakar if (e > 0) // EOF
287*103e46e4SHarish Mahendrakar return 1;
288*103e46e4SHarish Mahendrakar
289*103e46e4SHarish Mahendrakar if (c != BOM[i]) {
290*103e46e4SHarish Mahendrakar if (i == 0) { // we don't have a BOM
291*103e46e4SHarish Mahendrakar UngetChar(c);
292*103e46e4SHarish Mahendrakar return 0; // success
293*103e46e4SHarish Mahendrakar }
294*103e46e4SHarish Mahendrakar
295*103e46e4SHarish Mahendrakar // We started a BOM, so we must finish the BOM.
296*103e46e4SHarish Mahendrakar return -1; // error
297*103e46e4SHarish Mahendrakar }
298*103e46e4SHarish Mahendrakar }
299*103e46e4SHarish Mahendrakar
300*103e46e4SHarish Mahendrakar return 0; // success
301*103e46e4SHarish Mahendrakar }
302*103e46e4SHarish Mahendrakar
ParseTimingsLine(std::string * line_ptr,std::string::size_type arrow_pos,Time * start_time,Time * stop_time,Cue::settings_t * settings)303*103e46e4SHarish Mahendrakar int Parser::ParseTimingsLine(std::string* line_ptr,
304*103e46e4SHarish Mahendrakar std::string::size_type arrow_pos, Time* start_time,
305*103e46e4SHarish Mahendrakar Time* stop_time, Cue::settings_t* settings) {
306*103e46e4SHarish Mahendrakar if (line_ptr == NULL)
307*103e46e4SHarish Mahendrakar return -1;
308*103e46e4SHarish Mahendrakar
309*103e46e4SHarish Mahendrakar std::string& line = *line_ptr;
310*103e46e4SHarish Mahendrakar
311*103e46e4SHarish Mahendrakar if (arrow_pos == std::string::npos || arrow_pos >= line.length())
312*103e46e4SHarish Mahendrakar return -1;
313*103e46e4SHarish Mahendrakar
314*103e46e4SHarish Mahendrakar // Place a NUL character at the start of the arrow token, in
315*103e46e4SHarish Mahendrakar // order to demarcate the start time from remainder of line.
316*103e46e4SHarish Mahendrakar line[arrow_pos] = kNUL;
317*103e46e4SHarish Mahendrakar std::string::size_type idx = 0;
318*103e46e4SHarish Mahendrakar
319*103e46e4SHarish Mahendrakar int e = ParseTime(line, &idx, start_time);
320*103e46e4SHarish Mahendrakar if (e) // error
321*103e46e4SHarish Mahendrakar return e;
322*103e46e4SHarish Mahendrakar
323*103e46e4SHarish Mahendrakar // Detect any junk that follows the start time,
324*103e46e4SHarish Mahendrakar // but precedes the arrow symbol.
325*103e46e4SHarish Mahendrakar
326*103e46e4SHarish Mahendrakar while (char c = line[idx]) {
327*103e46e4SHarish Mahendrakar if (c != kSPACE && c != kTAB)
328*103e46e4SHarish Mahendrakar return -1;
329*103e46e4SHarish Mahendrakar ++idx;
330*103e46e4SHarish Mahendrakar }
331*103e46e4SHarish Mahendrakar
332*103e46e4SHarish Mahendrakar // Place a NUL character at the end of the line,
333*103e46e4SHarish Mahendrakar // so the scanner has a place to stop, and begin
334*103e46e4SHarish Mahendrakar // the scan just beyond the arrow token.
335*103e46e4SHarish Mahendrakar
336*103e46e4SHarish Mahendrakar line.push_back(kNUL);
337*103e46e4SHarish Mahendrakar idx = arrow_pos + 3;
338*103e46e4SHarish Mahendrakar
339*103e46e4SHarish Mahendrakar e = ParseTime(line, &idx, stop_time);
340*103e46e4SHarish Mahendrakar if (e) // error
341*103e46e4SHarish Mahendrakar return e;
342*103e46e4SHarish Mahendrakar
343*103e46e4SHarish Mahendrakar e = ParseSettings(line, idx, settings);
344*103e46e4SHarish Mahendrakar if (e) // error
345*103e46e4SHarish Mahendrakar return e;
346*103e46e4SHarish Mahendrakar
347*103e46e4SHarish Mahendrakar return 0; // success
348*103e46e4SHarish Mahendrakar }
349*103e46e4SHarish Mahendrakar
ParseTime(const std::string & line,std::string::size_type * idx_ptr,Time * time)350*103e46e4SHarish Mahendrakar int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr,
351*103e46e4SHarish Mahendrakar Time* time) {
352*103e46e4SHarish Mahendrakar if (idx_ptr == NULL)
353*103e46e4SHarish Mahendrakar return -1;
354*103e46e4SHarish Mahendrakar
355*103e46e4SHarish Mahendrakar std::string::size_type& idx = *idx_ptr;
356*103e46e4SHarish Mahendrakar
357*103e46e4SHarish Mahendrakar if (idx == std::string::npos || idx >= line.length())
358*103e46e4SHarish Mahendrakar return -1;
359*103e46e4SHarish Mahendrakar
360*103e46e4SHarish Mahendrakar if (time == NULL)
361*103e46e4SHarish Mahendrakar return -1;
362*103e46e4SHarish Mahendrakar
363*103e46e4SHarish Mahendrakar // Consume any whitespace that precedes the timestamp.
364*103e46e4SHarish Mahendrakar
365*103e46e4SHarish Mahendrakar while (char c = line[idx]) {
366*103e46e4SHarish Mahendrakar if (c != kSPACE && c != kTAB)
367*103e46e4SHarish Mahendrakar break;
368*103e46e4SHarish Mahendrakar ++idx;
369*103e46e4SHarish Mahendrakar }
370*103e46e4SHarish Mahendrakar
371*103e46e4SHarish Mahendrakar // WebVTT timestamp syntax comes in three flavors:
372*103e46e4SHarish Mahendrakar // SS[.sss]
373*103e46e4SHarish Mahendrakar // MM:SS[.sss]
374*103e46e4SHarish Mahendrakar // HH:MM:SS[.sss]
375*103e46e4SHarish Mahendrakar
376*103e46e4SHarish Mahendrakar // Parse a generic number value. We don't know which component
377*103e46e4SHarish Mahendrakar // of the time we have yet, until we do more parsing.
378*103e46e4SHarish Mahendrakar
379*103e46e4SHarish Mahendrakar int val = ParseNumber(line, &idx);
380*103e46e4SHarish Mahendrakar
381*103e46e4SHarish Mahendrakar if (val < 0) // error
382*103e46e4SHarish Mahendrakar return val;
383*103e46e4SHarish Mahendrakar
384*103e46e4SHarish Mahendrakar Time& t = *time;
385*103e46e4SHarish Mahendrakar
386*103e46e4SHarish Mahendrakar // The presence of a colon character indicates that we have
387*103e46e4SHarish Mahendrakar // an [HH:]MM:SS style syntax.
388*103e46e4SHarish Mahendrakar
389*103e46e4SHarish Mahendrakar if (line[idx] == ':') {
390*103e46e4SHarish Mahendrakar // We have either HH:MM:SS or MM:SS
391*103e46e4SHarish Mahendrakar
392*103e46e4SHarish Mahendrakar // The value we just parsed is either the hours or minutes.
393*103e46e4SHarish Mahendrakar // It must be followed by another number value (that is
394*103e46e4SHarish Mahendrakar // either minutes or seconds).
395*103e46e4SHarish Mahendrakar
396*103e46e4SHarish Mahendrakar const int first_val = val;
397*103e46e4SHarish Mahendrakar
398*103e46e4SHarish Mahendrakar ++idx; // consume colon
399*103e46e4SHarish Mahendrakar
400*103e46e4SHarish Mahendrakar // Parse second value
401*103e46e4SHarish Mahendrakar
402*103e46e4SHarish Mahendrakar val = ParseNumber(line, &idx);
403*103e46e4SHarish Mahendrakar
404*103e46e4SHarish Mahendrakar if (val < 0)
405*103e46e4SHarish Mahendrakar return val;
406*103e46e4SHarish Mahendrakar
407*103e46e4SHarish Mahendrakar if (val >= 60) // either MM or SS
408*103e46e4SHarish Mahendrakar return -1;
409*103e46e4SHarish Mahendrakar
410*103e46e4SHarish Mahendrakar if (line[idx] == ':') {
411*103e46e4SHarish Mahendrakar // We have HH:MM:SS
412*103e46e4SHarish Mahendrakar
413*103e46e4SHarish Mahendrakar t.hours = first_val;
414*103e46e4SHarish Mahendrakar t.minutes = val; // vetted above
415*103e46e4SHarish Mahendrakar
416*103e46e4SHarish Mahendrakar ++idx; // consume MM:SS colon
417*103e46e4SHarish Mahendrakar
418*103e46e4SHarish Mahendrakar // We have parsed the hours and minutes.
419*103e46e4SHarish Mahendrakar // We must now parse the seconds.
420*103e46e4SHarish Mahendrakar
421*103e46e4SHarish Mahendrakar val = ParseNumber(line, &idx);
422*103e46e4SHarish Mahendrakar
423*103e46e4SHarish Mahendrakar if (val < 0)
424*103e46e4SHarish Mahendrakar return val;
425*103e46e4SHarish Mahendrakar
426*103e46e4SHarish Mahendrakar if (val >= 60) // SS part of HH:MM:SS
427*103e46e4SHarish Mahendrakar return -1;
428*103e46e4SHarish Mahendrakar
429*103e46e4SHarish Mahendrakar t.seconds = val;
430*103e46e4SHarish Mahendrakar } else {
431*103e46e4SHarish Mahendrakar // We have MM:SS
432*103e46e4SHarish Mahendrakar
433*103e46e4SHarish Mahendrakar // The implication here is that the hour value was omitted
434*103e46e4SHarish Mahendrakar // from the timestamp (because it was 0).
435*103e46e4SHarish Mahendrakar
436*103e46e4SHarish Mahendrakar if (first_val >= 60) // minutes
437*103e46e4SHarish Mahendrakar return -1;
438*103e46e4SHarish Mahendrakar
439*103e46e4SHarish Mahendrakar t.hours = 0;
440*103e46e4SHarish Mahendrakar t.minutes = first_val;
441*103e46e4SHarish Mahendrakar t.seconds = val; // vetted above
442*103e46e4SHarish Mahendrakar }
443*103e46e4SHarish Mahendrakar } else {
444*103e46e4SHarish Mahendrakar // We have SS (only)
445*103e46e4SHarish Mahendrakar
446*103e46e4SHarish Mahendrakar // The time is expressed as total number of seconds,
447*103e46e4SHarish Mahendrakar // so the seconds value has no upper bound.
448*103e46e4SHarish Mahendrakar
449*103e46e4SHarish Mahendrakar t.seconds = val;
450*103e46e4SHarish Mahendrakar
451*103e46e4SHarish Mahendrakar // Convert SS to HH:MM:SS
452*103e46e4SHarish Mahendrakar
453*103e46e4SHarish Mahendrakar t.minutes = t.seconds / 60;
454*103e46e4SHarish Mahendrakar t.seconds -= t.minutes * 60;
455*103e46e4SHarish Mahendrakar
456*103e46e4SHarish Mahendrakar t.hours = t.minutes / 60;
457*103e46e4SHarish Mahendrakar t.minutes -= t.hours * 60;
458*103e46e4SHarish Mahendrakar }
459*103e46e4SHarish Mahendrakar
460*103e46e4SHarish Mahendrakar // We have parsed the hours, minutes, and seconds.
461*103e46e4SHarish Mahendrakar // We must now parse the milliseconds.
462*103e46e4SHarish Mahendrakar
463*103e46e4SHarish Mahendrakar char c = line[idx];
464*103e46e4SHarish Mahendrakar
465*103e46e4SHarish Mahendrakar // TODO(matthewjheaney): one option here is to slightly relax the
466*103e46e4SHarish Mahendrakar // syntax rules for WebVTT timestamps, to permit the comma character
467*103e46e4SHarish Mahendrakar // to also be used as the seconds/milliseconds separator. This
468*103e46e4SHarish Mahendrakar // would handle streams that use localization conventions for
469*103e46e4SHarish Mahendrakar // countries in Western Europe. For now we obey the rules specified
470*103e46e4SHarish Mahendrakar // in the WebVTT spec (allow "full stop" only).
471*103e46e4SHarish Mahendrakar
472*103e46e4SHarish Mahendrakar const bool have_milliseconds = (c == '.');
473*103e46e4SHarish Mahendrakar
474*103e46e4SHarish Mahendrakar if (!have_milliseconds) {
475*103e46e4SHarish Mahendrakar t.milliseconds = 0;
476*103e46e4SHarish Mahendrakar } else {
477*103e46e4SHarish Mahendrakar ++idx; // consume FULL STOP
478*103e46e4SHarish Mahendrakar
479*103e46e4SHarish Mahendrakar val = ParseNumber(line, &idx);
480*103e46e4SHarish Mahendrakar
481*103e46e4SHarish Mahendrakar if (val < 0)
482*103e46e4SHarish Mahendrakar return val;
483*103e46e4SHarish Mahendrakar
484*103e46e4SHarish Mahendrakar if (val >= 1000)
485*103e46e4SHarish Mahendrakar return -1;
486*103e46e4SHarish Mahendrakar
487*103e46e4SHarish Mahendrakar if (val < 10)
488*103e46e4SHarish Mahendrakar t.milliseconds = val * 100;
489*103e46e4SHarish Mahendrakar else if (val < 100)
490*103e46e4SHarish Mahendrakar t.milliseconds = val * 10;
491*103e46e4SHarish Mahendrakar else
492*103e46e4SHarish Mahendrakar t.milliseconds = val;
493*103e46e4SHarish Mahendrakar }
494*103e46e4SHarish Mahendrakar
495*103e46e4SHarish Mahendrakar // We have parsed the time proper. We must check for any
496*103e46e4SHarish Mahendrakar // junk that immediately follows the time specifier.
497*103e46e4SHarish Mahendrakar
498*103e46e4SHarish Mahendrakar c = line[idx];
499*103e46e4SHarish Mahendrakar
500*103e46e4SHarish Mahendrakar if (c != kNUL && c != kSPACE && c != kTAB)
501*103e46e4SHarish Mahendrakar return -1;
502*103e46e4SHarish Mahendrakar
503*103e46e4SHarish Mahendrakar return 0; // success
504*103e46e4SHarish Mahendrakar }
505*103e46e4SHarish Mahendrakar
ParseSettings(const std::string & line,std::string::size_type idx,Cue::settings_t * settings)506*103e46e4SHarish Mahendrakar int Parser::ParseSettings(const std::string& line, std::string::size_type idx,
507*103e46e4SHarish Mahendrakar Cue::settings_t* settings) {
508*103e46e4SHarish Mahendrakar settings->clear();
509*103e46e4SHarish Mahendrakar
510*103e46e4SHarish Mahendrakar if (idx == std::string::npos || idx >= line.length())
511*103e46e4SHarish Mahendrakar return -1;
512*103e46e4SHarish Mahendrakar
513*103e46e4SHarish Mahendrakar for (;;) {
514*103e46e4SHarish Mahendrakar // We must parse a line comprising a sequence of 0 or more
515*103e46e4SHarish Mahendrakar // NAME:VALUE pairs, separated by whitespace. The line iself is
516*103e46e4SHarish Mahendrakar // terminated with a NUL char (indicating end-of-line).
517*103e46e4SHarish Mahendrakar
518*103e46e4SHarish Mahendrakar for (;;) {
519*103e46e4SHarish Mahendrakar const char c = line[idx];
520*103e46e4SHarish Mahendrakar
521*103e46e4SHarish Mahendrakar if (c == kNUL) // end-of-line
522*103e46e4SHarish Mahendrakar return 0; // success
523*103e46e4SHarish Mahendrakar
524*103e46e4SHarish Mahendrakar if (c != kSPACE && c != kTAB)
525*103e46e4SHarish Mahendrakar break;
526*103e46e4SHarish Mahendrakar
527*103e46e4SHarish Mahendrakar ++idx; // consume whitespace
528*103e46e4SHarish Mahendrakar }
529*103e46e4SHarish Mahendrakar
530*103e46e4SHarish Mahendrakar // We have consumed the whitespace, and have not yet reached
531*103e46e4SHarish Mahendrakar // end-of-line, so there is something on the line for us to parse.
532*103e46e4SHarish Mahendrakar
533*103e46e4SHarish Mahendrakar settings->push_back(Setting());
534*103e46e4SHarish Mahendrakar Setting& s = settings->back();
535*103e46e4SHarish Mahendrakar
536*103e46e4SHarish Mahendrakar // Parse the NAME part of the settings pair.
537*103e46e4SHarish Mahendrakar
538*103e46e4SHarish Mahendrakar for (;;) {
539*103e46e4SHarish Mahendrakar const char c = line[idx];
540*103e46e4SHarish Mahendrakar
541*103e46e4SHarish Mahendrakar if (c == ':') // we have reached end of NAME part
542*103e46e4SHarish Mahendrakar break;
543*103e46e4SHarish Mahendrakar
544*103e46e4SHarish Mahendrakar if (c == kNUL || c == kSPACE || c == kTAB)
545*103e46e4SHarish Mahendrakar return -1;
546*103e46e4SHarish Mahendrakar
547*103e46e4SHarish Mahendrakar s.name.push_back(c);
548*103e46e4SHarish Mahendrakar
549*103e46e4SHarish Mahendrakar ++idx;
550*103e46e4SHarish Mahendrakar }
551*103e46e4SHarish Mahendrakar
552*103e46e4SHarish Mahendrakar if (s.name.empty())
553*103e46e4SHarish Mahendrakar return -1;
554*103e46e4SHarish Mahendrakar
555*103e46e4SHarish Mahendrakar ++idx; // consume colon
556*103e46e4SHarish Mahendrakar
557*103e46e4SHarish Mahendrakar // Parse the VALUE part of the settings pair.
558*103e46e4SHarish Mahendrakar
559*103e46e4SHarish Mahendrakar for (;;) {
560*103e46e4SHarish Mahendrakar const char c = line[idx];
561*103e46e4SHarish Mahendrakar
562*103e46e4SHarish Mahendrakar if (c == kNUL || c == kSPACE || c == kTAB)
563*103e46e4SHarish Mahendrakar break;
564*103e46e4SHarish Mahendrakar
565*103e46e4SHarish Mahendrakar if (c == ':') // suspicious when part of VALUE
566*103e46e4SHarish Mahendrakar return -1; // TODO(matthewjheaney): verify this behavior
567*103e46e4SHarish Mahendrakar
568*103e46e4SHarish Mahendrakar s.value.push_back(c);
569*103e46e4SHarish Mahendrakar
570*103e46e4SHarish Mahendrakar ++idx;
571*103e46e4SHarish Mahendrakar }
572*103e46e4SHarish Mahendrakar
573*103e46e4SHarish Mahendrakar if (s.value.empty())
574*103e46e4SHarish Mahendrakar return -1;
575*103e46e4SHarish Mahendrakar }
576*103e46e4SHarish Mahendrakar }
577*103e46e4SHarish Mahendrakar
ParseNumber(const std::string & line,std::string::size_type * idx_ptr)578*103e46e4SHarish Mahendrakar int Parser::ParseNumber(const std::string& line,
579*103e46e4SHarish Mahendrakar std::string::size_type* idx_ptr) {
580*103e46e4SHarish Mahendrakar if (idx_ptr == NULL)
581*103e46e4SHarish Mahendrakar return -1;
582*103e46e4SHarish Mahendrakar
583*103e46e4SHarish Mahendrakar std::string::size_type& idx = *idx_ptr;
584*103e46e4SHarish Mahendrakar
585*103e46e4SHarish Mahendrakar if (idx == std::string::npos || idx >= line.length())
586*103e46e4SHarish Mahendrakar return -1;
587*103e46e4SHarish Mahendrakar
588*103e46e4SHarish Mahendrakar if (!isdigit(line[idx]))
589*103e46e4SHarish Mahendrakar return -1;
590*103e46e4SHarish Mahendrakar
591*103e46e4SHarish Mahendrakar int result = 0;
592*103e46e4SHarish Mahendrakar
593*103e46e4SHarish Mahendrakar while (isdigit(line[idx])) {
594*103e46e4SHarish Mahendrakar const char c = line[idx];
595*103e46e4SHarish Mahendrakar const int i = c - '0';
596*103e46e4SHarish Mahendrakar
597*103e46e4SHarish Mahendrakar if (result > INT_MAX / 10)
598*103e46e4SHarish Mahendrakar return -1;
599*103e46e4SHarish Mahendrakar
600*103e46e4SHarish Mahendrakar result *= 10;
601*103e46e4SHarish Mahendrakar
602*103e46e4SHarish Mahendrakar if (result > INT_MAX - i)
603*103e46e4SHarish Mahendrakar return -1;
604*103e46e4SHarish Mahendrakar
605*103e46e4SHarish Mahendrakar result += i;
606*103e46e4SHarish Mahendrakar
607*103e46e4SHarish Mahendrakar ++idx;
608*103e46e4SHarish Mahendrakar }
609*103e46e4SHarish Mahendrakar
610*103e46e4SHarish Mahendrakar return result;
611*103e46e4SHarish Mahendrakar }
612*103e46e4SHarish Mahendrakar
operator ==(const Time & rhs) const613*103e46e4SHarish Mahendrakar bool Time::operator==(const Time& rhs) const {
614*103e46e4SHarish Mahendrakar if (hours != rhs.hours)
615*103e46e4SHarish Mahendrakar return false;
616*103e46e4SHarish Mahendrakar
617*103e46e4SHarish Mahendrakar if (minutes != rhs.minutes)
618*103e46e4SHarish Mahendrakar return false;
619*103e46e4SHarish Mahendrakar
620*103e46e4SHarish Mahendrakar if (seconds != rhs.seconds)
621*103e46e4SHarish Mahendrakar return false;
622*103e46e4SHarish Mahendrakar
623*103e46e4SHarish Mahendrakar return (milliseconds == rhs.milliseconds);
624*103e46e4SHarish Mahendrakar }
625*103e46e4SHarish Mahendrakar
operator <(const Time & rhs) const626*103e46e4SHarish Mahendrakar bool Time::operator<(const Time& rhs) const {
627*103e46e4SHarish Mahendrakar if (hours < rhs.hours)
628*103e46e4SHarish Mahendrakar return true;
629*103e46e4SHarish Mahendrakar
630*103e46e4SHarish Mahendrakar if (hours > rhs.hours)
631*103e46e4SHarish Mahendrakar return false;
632*103e46e4SHarish Mahendrakar
633*103e46e4SHarish Mahendrakar if (minutes < rhs.minutes)
634*103e46e4SHarish Mahendrakar return true;
635*103e46e4SHarish Mahendrakar
636*103e46e4SHarish Mahendrakar if (minutes > rhs.minutes)
637*103e46e4SHarish Mahendrakar return false;
638*103e46e4SHarish Mahendrakar
639*103e46e4SHarish Mahendrakar if (seconds < rhs.seconds)
640*103e46e4SHarish Mahendrakar return true;
641*103e46e4SHarish Mahendrakar
642*103e46e4SHarish Mahendrakar if (seconds > rhs.seconds)
643*103e46e4SHarish Mahendrakar return false;
644*103e46e4SHarish Mahendrakar
645*103e46e4SHarish Mahendrakar return (milliseconds < rhs.milliseconds);
646*103e46e4SHarish Mahendrakar }
647*103e46e4SHarish Mahendrakar
operator >(const Time & rhs) const648*103e46e4SHarish Mahendrakar bool Time::operator>(const Time& rhs) const { return rhs.operator<(*this); }
649*103e46e4SHarish Mahendrakar
operator <=(const Time & rhs) const650*103e46e4SHarish Mahendrakar bool Time::operator<=(const Time& rhs) const { return !this->operator>(rhs); }
651*103e46e4SHarish Mahendrakar
operator >=(const Time & rhs) const652*103e46e4SHarish Mahendrakar bool Time::operator>=(const Time& rhs) const { return !this->operator<(rhs); }
653*103e46e4SHarish Mahendrakar
presentation() const654*103e46e4SHarish Mahendrakar presentation_t Time::presentation() const {
655*103e46e4SHarish Mahendrakar const presentation_t h = 1000LL * 3600LL * presentation_t(hours);
656*103e46e4SHarish Mahendrakar const presentation_t m = 1000LL * 60LL * presentation_t(minutes);
657*103e46e4SHarish Mahendrakar const presentation_t s = 1000LL * presentation_t(seconds);
658*103e46e4SHarish Mahendrakar const presentation_t result = h + m + s + milliseconds;
659*103e46e4SHarish Mahendrakar return result;
660*103e46e4SHarish Mahendrakar }
661*103e46e4SHarish Mahendrakar
presentation(presentation_t d)662*103e46e4SHarish Mahendrakar Time& Time::presentation(presentation_t d) {
663*103e46e4SHarish Mahendrakar if (d < 0) { // error
664*103e46e4SHarish Mahendrakar hours = 0;
665*103e46e4SHarish Mahendrakar minutes = 0;
666*103e46e4SHarish Mahendrakar seconds = 0;
667*103e46e4SHarish Mahendrakar milliseconds = 0;
668*103e46e4SHarish Mahendrakar
669*103e46e4SHarish Mahendrakar return *this;
670*103e46e4SHarish Mahendrakar }
671*103e46e4SHarish Mahendrakar
672*103e46e4SHarish Mahendrakar seconds = static_cast<int>(d / 1000);
673*103e46e4SHarish Mahendrakar milliseconds = static_cast<int>(d - 1000 * seconds);
674*103e46e4SHarish Mahendrakar
675*103e46e4SHarish Mahendrakar minutes = seconds / 60;
676*103e46e4SHarish Mahendrakar seconds -= 60 * minutes;
677*103e46e4SHarish Mahendrakar
678*103e46e4SHarish Mahendrakar hours = minutes / 60;
679*103e46e4SHarish Mahendrakar minutes -= 60 * hours;
680*103e46e4SHarish Mahendrakar
681*103e46e4SHarish Mahendrakar return *this;
682*103e46e4SHarish Mahendrakar }
683*103e46e4SHarish Mahendrakar
operator +=(presentation_t rhs)684*103e46e4SHarish Mahendrakar Time& Time::operator+=(presentation_t rhs) {
685*103e46e4SHarish Mahendrakar const presentation_t d = this->presentation();
686*103e46e4SHarish Mahendrakar const presentation_t dd = d + rhs;
687*103e46e4SHarish Mahendrakar this->presentation(dd);
688*103e46e4SHarish Mahendrakar return *this;
689*103e46e4SHarish Mahendrakar }
690*103e46e4SHarish Mahendrakar
operator +(presentation_t d) const691*103e46e4SHarish Mahendrakar Time Time::operator+(presentation_t d) const {
692*103e46e4SHarish Mahendrakar Time t(*this);
693*103e46e4SHarish Mahendrakar t += d;
694*103e46e4SHarish Mahendrakar return t;
695*103e46e4SHarish Mahendrakar }
696*103e46e4SHarish Mahendrakar
operator -=(presentation_t d)697*103e46e4SHarish Mahendrakar Time& Time::operator-=(presentation_t d) { return this->operator+=(-d); }
698*103e46e4SHarish Mahendrakar
operator -(const Time & t) const699*103e46e4SHarish Mahendrakar presentation_t Time::operator-(const Time& t) const {
700*103e46e4SHarish Mahendrakar const presentation_t rhs = t.presentation();
701*103e46e4SHarish Mahendrakar const presentation_t lhs = this->presentation();
702*103e46e4SHarish Mahendrakar const presentation_t result = lhs - rhs;
703*103e46e4SHarish Mahendrakar return result;
704*103e46e4SHarish Mahendrakar }
705*103e46e4SHarish Mahendrakar
706*103e46e4SHarish Mahendrakar } // namespace libwebvtt
707