xref: /aosp_15_r20/external/libwebm/webvtt/webvttparser.cc (revision 103e46e4cd4b6efcf6001f23fa8665fb110abf8d)
1*103e46e4SHarish Mahendrakar // Copyright (c) 2012 The WebM project authors. All Rights Reserved.
2*103e46e4SHarish Mahendrakar //
3*103e46e4SHarish Mahendrakar // Use of this source code is governed by a BSD-style license
4*103e46e4SHarish Mahendrakar // that can be found in the LICENSE file in the root of the source
5*103e46e4SHarish Mahendrakar // tree. An additional intellectual property rights grant can be found
6*103e46e4SHarish Mahendrakar // in the file PATENTS.  All contributing project authors may
7*103e46e4SHarish Mahendrakar // be found in the AUTHORS file in the root of the source tree.
8*103e46e4SHarish Mahendrakar 
9*103e46e4SHarish Mahendrakar #include "webvttparser.h"
10*103e46e4SHarish Mahendrakar 
11*103e46e4SHarish Mahendrakar #include <ctype.h>
12*103e46e4SHarish Mahendrakar 
13*103e46e4SHarish Mahendrakar #include <climits>
14*103e46e4SHarish Mahendrakar #include <cstddef>
15*103e46e4SHarish Mahendrakar 
16*103e46e4SHarish Mahendrakar namespace libwebvtt {
17*103e46e4SHarish Mahendrakar 
// Code points of the control and whitespace characters that the parser
// treats specially.  The closing brace is tagged NOLINT because clang-format
// would otherwise collapse the enum onto one hard-to-read line.
enum {
  kNUL = 0x00,    // NUL; used by the timings-line scanner as a stop marker
  kSPACE = 0x20,  // SPACE
  kTAB = 0x09,    // CHARACTER TABULATION
  kLF = 0x0A,     // LINE FEED
  kCR = 0x0D      // CARRIAGE RETURN
};  // NOLINT
27*103e46e4SHarish Mahendrakar 
~Reader()28*103e46e4SHarish Mahendrakar Reader::~Reader() {}
29*103e46e4SHarish Mahendrakar 
~LineReader()30*103e46e4SHarish Mahendrakar LineReader::~LineReader() {}
31*103e46e4SHarish Mahendrakar 
GetLine(std::string * line_ptr)32*103e46e4SHarish Mahendrakar int LineReader::GetLine(std::string* line_ptr) {
33*103e46e4SHarish Mahendrakar   if (line_ptr == NULL)
34*103e46e4SHarish Mahendrakar     return -1;
35*103e46e4SHarish Mahendrakar 
36*103e46e4SHarish Mahendrakar   std::string& ln = *line_ptr;
37*103e46e4SHarish Mahendrakar   ln.clear();
38*103e46e4SHarish Mahendrakar 
39*103e46e4SHarish Mahendrakar   // Consume characters from the stream, until we
40*103e46e4SHarish Mahendrakar   // reach end-of-line (or end-of-stream).
41*103e46e4SHarish Mahendrakar 
42*103e46e4SHarish Mahendrakar   // The WebVTT spec states that lines may be
43*103e46e4SHarish Mahendrakar   // terminated in any of these three ways:
44*103e46e4SHarish Mahendrakar   //  LF
45*103e46e4SHarish Mahendrakar   //  CR
46*103e46e4SHarish Mahendrakar   //  CR LF
47*103e46e4SHarish Mahendrakar 
48*103e46e4SHarish Mahendrakar   // We interrogate each character as we read it from the stream.
49*103e46e4SHarish Mahendrakar   // If we detect an end-of-line character, we consume the full
50*103e46e4SHarish Mahendrakar   // end-of-line indication, and we're done; otherwise, accumulate
51*103e46e4SHarish Mahendrakar   // the character and repeat.
52*103e46e4SHarish Mahendrakar 
53*103e46e4SHarish Mahendrakar   for (;;) {
54*103e46e4SHarish Mahendrakar     char c;
55*103e46e4SHarish Mahendrakar     const int e = GetChar(&c);
56*103e46e4SHarish Mahendrakar 
57*103e46e4SHarish Mahendrakar     if (e < 0)  // error
58*103e46e4SHarish Mahendrakar       return e;
59*103e46e4SHarish Mahendrakar 
60*103e46e4SHarish Mahendrakar     if (e > 0)  // EOF
61*103e46e4SHarish Mahendrakar       return (ln.empty()) ? 1 : 0;
62*103e46e4SHarish Mahendrakar 
63*103e46e4SHarish Mahendrakar     // We have a character, so we must first determine
64*103e46e4SHarish Mahendrakar     // whether we have reached end-of-line.
65*103e46e4SHarish Mahendrakar 
66*103e46e4SHarish Mahendrakar     if (c == kLF)
67*103e46e4SHarish Mahendrakar       return 0;  // handle the easy end-of-line case immediately
68*103e46e4SHarish Mahendrakar 
69*103e46e4SHarish Mahendrakar     if (c == kCR)
70*103e46e4SHarish Mahendrakar       break;  // handle the hard end-of-line case outside of loop
71*103e46e4SHarish Mahendrakar 
72*103e46e4SHarish Mahendrakar     if (c == '\xFE' || c == '\xFF')  // not UTF-8
73*103e46e4SHarish Mahendrakar       return -1;
74*103e46e4SHarish Mahendrakar 
75*103e46e4SHarish Mahendrakar     // To defend against pathological or malicious streams, we
76*103e46e4SHarish Mahendrakar     // cap the line length at some arbitrarily-large value:
77*103e46e4SHarish Mahendrakar     enum { kMaxLineLength = 10000 };  // arbitrary
78*103e46e4SHarish Mahendrakar 
79*103e46e4SHarish Mahendrakar     if (ln.length() >= kMaxLineLength)
80*103e46e4SHarish Mahendrakar       return -1;
81*103e46e4SHarish Mahendrakar 
82*103e46e4SHarish Mahendrakar     // We don't have an end-of-line character, so accumulate
83*103e46e4SHarish Mahendrakar     // the character in our line buffer.
84*103e46e4SHarish Mahendrakar     ln.push_back(c);
85*103e46e4SHarish Mahendrakar   }
86*103e46e4SHarish Mahendrakar 
87*103e46e4SHarish Mahendrakar   // We detected a CR.  We must interrogate the next character
88*103e46e4SHarish Mahendrakar   // in the stream, to determine whether we have a LF (which
89*103e46e4SHarish Mahendrakar   // would make it part of this same line).
90*103e46e4SHarish Mahendrakar 
91*103e46e4SHarish Mahendrakar   char c;
92*103e46e4SHarish Mahendrakar   const int e = GetChar(&c);
93*103e46e4SHarish Mahendrakar 
94*103e46e4SHarish Mahendrakar   if (e < 0)  // error
95*103e46e4SHarish Mahendrakar     return e;
96*103e46e4SHarish Mahendrakar 
97*103e46e4SHarish Mahendrakar   if (e > 0)  // EOF
98*103e46e4SHarish Mahendrakar     return 0;
99*103e46e4SHarish Mahendrakar 
100*103e46e4SHarish Mahendrakar   // If next character in the stream is not a LF, return it
101*103e46e4SHarish Mahendrakar   // to the stream (because it's part of the next line).
102*103e46e4SHarish Mahendrakar   if (c != kLF)
103*103e46e4SHarish Mahendrakar     UngetChar(c);
104*103e46e4SHarish Mahendrakar 
105*103e46e4SHarish Mahendrakar   return 0;
106*103e46e4SHarish Mahendrakar }
107*103e46e4SHarish Mahendrakar 
Parser(Reader * r)108*103e46e4SHarish Mahendrakar Parser::Parser(Reader* r) : reader_(r), unget_(-1) {}
109*103e46e4SHarish Mahendrakar 
~Parser()110*103e46e4SHarish Mahendrakar Parser::~Parser() {}
111*103e46e4SHarish Mahendrakar 
Init()112*103e46e4SHarish Mahendrakar int Parser::Init() {
113*103e46e4SHarish Mahendrakar   int e = ParseBOM();
114*103e46e4SHarish Mahendrakar 
115*103e46e4SHarish Mahendrakar   if (e < 0)  // error
116*103e46e4SHarish Mahendrakar     return e;
117*103e46e4SHarish Mahendrakar 
118*103e46e4SHarish Mahendrakar   if (e > 0)  // EOF
119*103e46e4SHarish Mahendrakar     return -1;
120*103e46e4SHarish Mahendrakar 
121*103e46e4SHarish Mahendrakar   // Parse "WEBVTT".  We read from the stream one character at-a-time, in
122*103e46e4SHarish Mahendrakar   // order to defend against non-WebVTT streams (e.g. binary files) that don't
123*103e46e4SHarish Mahendrakar   // happen to comprise lines of text demarcated with line terminators.
124*103e46e4SHarish Mahendrakar 
125*103e46e4SHarish Mahendrakar   const char kId[] = "WEBVTT";
126*103e46e4SHarish Mahendrakar 
127*103e46e4SHarish Mahendrakar   for (const char* p = kId; *p; ++p) {
128*103e46e4SHarish Mahendrakar     char c;
129*103e46e4SHarish Mahendrakar     e = GetChar(&c);
130*103e46e4SHarish Mahendrakar 
131*103e46e4SHarish Mahendrakar     if (e < 0)  // error
132*103e46e4SHarish Mahendrakar       return e;
133*103e46e4SHarish Mahendrakar 
134*103e46e4SHarish Mahendrakar     if (e > 0)  // EOF
135*103e46e4SHarish Mahendrakar       return -1;
136*103e46e4SHarish Mahendrakar 
137*103e46e4SHarish Mahendrakar     if (c != *p)
138*103e46e4SHarish Mahendrakar       return -1;
139*103e46e4SHarish Mahendrakar   }
140*103e46e4SHarish Mahendrakar 
141*103e46e4SHarish Mahendrakar   std::string line;
142*103e46e4SHarish Mahendrakar 
143*103e46e4SHarish Mahendrakar   e = GetLine(&line);
144*103e46e4SHarish Mahendrakar 
145*103e46e4SHarish Mahendrakar   if (e < 0)  // error
146*103e46e4SHarish Mahendrakar     return e;
147*103e46e4SHarish Mahendrakar 
148*103e46e4SHarish Mahendrakar   if (e > 0)  // EOF
149*103e46e4SHarish Mahendrakar     return 0;  // weird but valid
150*103e46e4SHarish Mahendrakar 
151*103e46e4SHarish Mahendrakar   if (!line.empty()) {
152*103e46e4SHarish Mahendrakar     // Parse optional characters that follow "WEBVTT"
153*103e46e4SHarish Mahendrakar 
154*103e46e4SHarish Mahendrakar     const char c = line[0];
155*103e46e4SHarish Mahendrakar 
156*103e46e4SHarish Mahendrakar     if (c != kSPACE && c != kTAB)
157*103e46e4SHarish Mahendrakar       return -1;
158*103e46e4SHarish Mahendrakar   }
159*103e46e4SHarish Mahendrakar 
160*103e46e4SHarish Mahendrakar   // The WebVTT spec requires that the "WEBVTT" line
161*103e46e4SHarish Mahendrakar   // be followed by an empty line (to separate it from
162*103e46e4SHarish Mahendrakar   // first cue).
163*103e46e4SHarish Mahendrakar 
164*103e46e4SHarish Mahendrakar   e = GetLine(&line);
165*103e46e4SHarish Mahendrakar 
166*103e46e4SHarish Mahendrakar   if (e < 0)  // error
167*103e46e4SHarish Mahendrakar     return e;
168*103e46e4SHarish Mahendrakar 
169*103e46e4SHarish Mahendrakar   if (e > 0)  // EOF
170*103e46e4SHarish Mahendrakar     return 0;  // weird but we allow it
171*103e46e4SHarish Mahendrakar 
172*103e46e4SHarish Mahendrakar   if (!line.empty())
173*103e46e4SHarish Mahendrakar     return -1;
174*103e46e4SHarish Mahendrakar 
175*103e46e4SHarish Mahendrakar   return 0;  // success
176*103e46e4SHarish Mahendrakar }
177*103e46e4SHarish Mahendrakar 
Parse(Cue * cue)178*103e46e4SHarish Mahendrakar int Parser::Parse(Cue* cue) {
179*103e46e4SHarish Mahendrakar   if (cue == NULL)
180*103e46e4SHarish Mahendrakar     return -1;
181*103e46e4SHarish Mahendrakar 
182*103e46e4SHarish Mahendrakar   // Parse first non-blank line
183*103e46e4SHarish Mahendrakar 
184*103e46e4SHarish Mahendrakar   std::string line;
185*103e46e4SHarish Mahendrakar   int e;
186*103e46e4SHarish Mahendrakar 
187*103e46e4SHarish Mahendrakar   for (;;) {
188*103e46e4SHarish Mahendrakar     e = GetLine(&line);
189*103e46e4SHarish Mahendrakar 
190*103e46e4SHarish Mahendrakar     if (e)  // EOF is OK here
191*103e46e4SHarish Mahendrakar       return e;
192*103e46e4SHarish Mahendrakar 
193*103e46e4SHarish Mahendrakar     if (!line.empty())
194*103e46e4SHarish Mahendrakar       break;
195*103e46e4SHarish Mahendrakar   }
196*103e46e4SHarish Mahendrakar 
197*103e46e4SHarish Mahendrakar   // A WebVTT cue comprises an optional cue identifier line followed
198*103e46e4SHarish Mahendrakar   // by a (non-optional) timings line.  You determine whether you have
199*103e46e4SHarish Mahendrakar   // a timings line by scanning for the arrow token, the lexeme of which
200*103e46e4SHarish Mahendrakar   // may not appear in the cue identifier line.
201*103e46e4SHarish Mahendrakar 
202*103e46e4SHarish Mahendrakar   const char kArrow[] = "-->";
203*103e46e4SHarish Mahendrakar   std::string::size_type arrow_pos = line.find(kArrow);
204*103e46e4SHarish Mahendrakar 
205*103e46e4SHarish Mahendrakar   if (arrow_pos != std::string::npos) {
206*103e46e4SHarish Mahendrakar     // We found a timings line, which implies that we don't have a cue
207*103e46e4SHarish Mahendrakar     // identifier.
208*103e46e4SHarish Mahendrakar 
209*103e46e4SHarish Mahendrakar     cue->identifier.clear();
210*103e46e4SHarish Mahendrakar   } else {
211*103e46e4SHarish Mahendrakar     // We did not find a timings line, so we assume that we have a cue
212*103e46e4SHarish Mahendrakar     // identifier line, and then try again to find the cue timings on
213*103e46e4SHarish Mahendrakar     // the next line.
214*103e46e4SHarish Mahendrakar 
215*103e46e4SHarish Mahendrakar     cue->identifier.swap(line);
216*103e46e4SHarish Mahendrakar 
217*103e46e4SHarish Mahendrakar     e = GetLine(&line);
218*103e46e4SHarish Mahendrakar 
219*103e46e4SHarish Mahendrakar     if (e < 0)  // error
220*103e46e4SHarish Mahendrakar       return e;
221*103e46e4SHarish Mahendrakar 
222*103e46e4SHarish Mahendrakar     if (e > 0)  // EOF
223*103e46e4SHarish Mahendrakar       return -1;
224*103e46e4SHarish Mahendrakar 
225*103e46e4SHarish Mahendrakar     arrow_pos = line.find(kArrow);
226*103e46e4SHarish Mahendrakar 
227*103e46e4SHarish Mahendrakar     if (arrow_pos == std::string::npos)  // not a timings line
228*103e46e4SHarish Mahendrakar       return -1;
229*103e46e4SHarish Mahendrakar   }
230*103e46e4SHarish Mahendrakar 
231*103e46e4SHarish Mahendrakar   e = ParseTimingsLine(&line, arrow_pos, &cue->start_time, &cue->stop_time,
232*103e46e4SHarish Mahendrakar                        &cue->settings);
233*103e46e4SHarish Mahendrakar 
234*103e46e4SHarish Mahendrakar   if (e)  // error
235*103e46e4SHarish Mahendrakar     return e;
236*103e46e4SHarish Mahendrakar 
237*103e46e4SHarish Mahendrakar   // The cue payload comprises all the non-empty
238*103e46e4SHarish Mahendrakar   // lines that follow the timings line.
239*103e46e4SHarish Mahendrakar 
240*103e46e4SHarish Mahendrakar   Cue::payload_t& p = cue->payload;
241*103e46e4SHarish Mahendrakar   p.clear();
242*103e46e4SHarish Mahendrakar 
243*103e46e4SHarish Mahendrakar   for (;;) {
244*103e46e4SHarish Mahendrakar     e = GetLine(&line);
245*103e46e4SHarish Mahendrakar 
246*103e46e4SHarish Mahendrakar     if (e < 0)  // error
247*103e46e4SHarish Mahendrakar       return e;
248*103e46e4SHarish Mahendrakar 
249*103e46e4SHarish Mahendrakar     if (line.empty())
250*103e46e4SHarish Mahendrakar       break;
251*103e46e4SHarish Mahendrakar 
252*103e46e4SHarish Mahendrakar     p.push_back(line);
253*103e46e4SHarish Mahendrakar   }
254*103e46e4SHarish Mahendrakar 
255*103e46e4SHarish Mahendrakar   if (p.empty())
256*103e46e4SHarish Mahendrakar     return -1;
257*103e46e4SHarish Mahendrakar 
258*103e46e4SHarish Mahendrakar   return 0;  // success
259*103e46e4SHarish Mahendrakar }
260*103e46e4SHarish Mahendrakar 
GetChar(char * c)261*103e46e4SHarish Mahendrakar int Parser::GetChar(char* c) {
262*103e46e4SHarish Mahendrakar   if (unget_ >= 0) {
263*103e46e4SHarish Mahendrakar     *c = static_cast<char>(unget_);
264*103e46e4SHarish Mahendrakar     unget_ = -1;
265*103e46e4SHarish Mahendrakar     return 0;
266*103e46e4SHarish Mahendrakar   }
267*103e46e4SHarish Mahendrakar 
268*103e46e4SHarish Mahendrakar   return reader_->GetChar(c);
269*103e46e4SHarish Mahendrakar }
270*103e46e4SHarish Mahendrakar 
UngetChar(char c)271*103e46e4SHarish Mahendrakar void Parser::UngetChar(char c) { unget_ = static_cast<unsigned char>(c); }
272*103e46e4SHarish Mahendrakar 
ParseBOM()273*103e46e4SHarish Mahendrakar int Parser::ParseBOM() {
274*103e46e4SHarish Mahendrakar   // Explanation of UTF-8 BOM:
275*103e46e4SHarish Mahendrakar   // http://en.wikipedia.org/wiki/Byte_order_mark
276*103e46e4SHarish Mahendrakar 
277*103e46e4SHarish Mahendrakar   static const char BOM[] = "\xEF\xBB\xBF";  // UTF-8 BOM
278*103e46e4SHarish Mahendrakar 
279*103e46e4SHarish Mahendrakar   for (int i = 0; i < 3; ++i) {
280*103e46e4SHarish Mahendrakar     char c;
281*103e46e4SHarish Mahendrakar     int e = GetChar(&c);
282*103e46e4SHarish Mahendrakar 
283*103e46e4SHarish Mahendrakar     if (e < 0)  // error
284*103e46e4SHarish Mahendrakar       return e;
285*103e46e4SHarish Mahendrakar 
286*103e46e4SHarish Mahendrakar     if (e > 0)  // EOF
287*103e46e4SHarish Mahendrakar       return 1;
288*103e46e4SHarish Mahendrakar 
289*103e46e4SHarish Mahendrakar     if (c != BOM[i]) {
290*103e46e4SHarish Mahendrakar       if (i == 0) {  // we don't have a BOM
291*103e46e4SHarish Mahendrakar         UngetChar(c);
292*103e46e4SHarish Mahendrakar         return 0;  // success
293*103e46e4SHarish Mahendrakar       }
294*103e46e4SHarish Mahendrakar 
295*103e46e4SHarish Mahendrakar       // We started a BOM, so we must finish the BOM.
296*103e46e4SHarish Mahendrakar       return -1;  // error
297*103e46e4SHarish Mahendrakar     }
298*103e46e4SHarish Mahendrakar   }
299*103e46e4SHarish Mahendrakar 
300*103e46e4SHarish Mahendrakar   return 0;  // success
301*103e46e4SHarish Mahendrakar }
302*103e46e4SHarish Mahendrakar 
ParseTimingsLine(std::string * line_ptr,std::string::size_type arrow_pos,Time * start_time,Time * stop_time,Cue::settings_t * settings)303*103e46e4SHarish Mahendrakar int Parser::ParseTimingsLine(std::string* line_ptr,
304*103e46e4SHarish Mahendrakar                              std::string::size_type arrow_pos, Time* start_time,
305*103e46e4SHarish Mahendrakar                              Time* stop_time, Cue::settings_t* settings) {
306*103e46e4SHarish Mahendrakar   if (line_ptr == NULL)
307*103e46e4SHarish Mahendrakar     return -1;
308*103e46e4SHarish Mahendrakar 
309*103e46e4SHarish Mahendrakar   std::string& line = *line_ptr;
310*103e46e4SHarish Mahendrakar 
311*103e46e4SHarish Mahendrakar   if (arrow_pos == std::string::npos || arrow_pos >= line.length())
312*103e46e4SHarish Mahendrakar     return -1;
313*103e46e4SHarish Mahendrakar 
314*103e46e4SHarish Mahendrakar   // Place a NUL character at the start of the arrow token, in
315*103e46e4SHarish Mahendrakar   // order to demarcate the start time from remainder of line.
316*103e46e4SHarish Mahendrakar   line[arrow_pos] = kNUL;
317*103e46e4SHarish Mahendrakar   std::string::size_type idx = 0;
318*103e46e4SHarish Mahendrakar 
319*103e46e4SHarish Mahendrakar   int e = ParseTime(line, &idx, start_time);
320*103e46e4SHarish Mahendrakar   if (e)  // error
321*103e46e4SHarish Mahendrakar     return e;
322*103e46e4SHarish Mahendrakar 
323*103e46e4SHarish Mahendrakar   // Detect any junk that follows the start time,
324*103e46e4SHarish Mahendrakar   // but precedes the arrow symbol.
325*103e46e4SHarish Mahendrakar 
326*103e46e4SHarish Mahendrakar   while (char c = line[idx]) {
327*103e46e4SHarish Mahendrakar     if (c != kSPACE && c != kTAB)
328*103e46e4SHarish Mahendrakar       return -1;
329*103e46e4SHarish Mahendrakar     ++idx;
330*103e46e4SHarish Mahendrakar   }
331*103e46e4SHarish Mahendrakar 
332*103e46e4SHarish Mahendrakar   // Place a NUL character at the end of the line,
333*103e46e4SHarish Mahendrakar   // so the scanner has a place to stop, and begin
334*103e46e4SHarish Mahendrakar   // the scan just beyond the arrow token.
335*103e46e4SHarish Mahendrakar 
336*103e46e4SHarish Mahendrakar   line.push_back(kNUL);
337*103e46e4SHarish Mahendrakar   idx = arrow_pos + 3;
338*103e46e4SHarish Mahendrakar 
339*103e46e4SHarish Mahendrakar   e = ParseTime(line, &idx, stop_time);
340*103e46e4SHarish Mahendrakar   if (e)  // error
341*103e46e4SHarish Mahendrakar     return e;
342*103e46e4SHarish Mahendrakar 
343*103e46e4SHarish Mahendrakar   e = ParseSettings(line, idx, settings);
344*103e46e4SHarish Mahendrakar   if (e)  // error
345*103e46e4SHarish Mahendrakar     return e;
346*103e46e4SHarish Mahendrakar 
347*103e46e4SHarish Mahendrakar   return 0;  // success
348*103e46e4SHarish Mahendrakar }
349*103e46e4SHarish Mahendrakar 
ParseTime(const std::string & line,std::string::size_type * idx_ptr,Time * time)350*103e46e4SHarish Mahendrakar int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr,
351*103e46e4SHarish Mahendrakar                       Time* time) {
352*103e46e4SHarish Mahendrakar   if (idx_ptr == NULL)
353*103e46e4SHarish Mahendrakar     return -1;
354*103e46e4SHarish Mahendrakar 
355*103e46e4SHarish Mahendrakar   std::string::size_type& idx = *idx_ptr;
356*103e46e4SHarish Mahendrakar 
357*103e46e4SHarish Mahendrakar   if (idx == std::string::npos || idx >= line.length())
358*103e46e4SHarish Mahendrakar     return -1;
359*103e46e4SHarish Mahendrakar 
360*103e46e4SHarish Mahendrakar   if (time == NULL)
361*103e46e4SHarish Mahendrakar     return -1;
362*103e46e4SHarish Mahendrakar 
363*103e46e4SHarish Mahendrakar   // Consume any whitespace that precedes the timestamp.
364*103e46e4SHarish Mahendrakar 
365*103e46e4SHarish Mahendrakar   while (char c = line[idx]) {
366*103e46e4SHarish Mahendrakar     if (c != kSPACE && c != kTAB)
367*103e46e4SHarish Mahendrakar       break;
368*103e46e4SHarish Mahendrakar     ++idx;
369*103e46e4SHarish Mahendrakar   }
370*103e46e4SHarish Mahendrakar 
371*103e46e4SHarish Mahendrakar   // WebVTT timestamp syntax comes in three flavors:
372*103e46e4SHarish Mahendrakar   //  SS[.sss]
373*103e46e4SHarish Mahendrakar   //  MM:SS[.sss]
374*103e46e4SHarish Mahendrakar   //  HH:MM:SS[.sss]
375*103e46e4SHarish Mahendrakar 
376*103e46e4SHarish Mahendrakar   // Parse a generic number value.  We don't know which component
377*103e46e4SHarish Mahendrakar   // of the time we have yet, until we do more parsing.
378*103e46e4SHarish Mahendrakar 
379*103e46e4SHarish Mahendrakar   int val = ParseNumber(line, &idx);
380*103e46e4SHarish Mahendrakar 
381*103e46e4SHarish Mahendrakar   if (val < 0)  // error
382*103e46e4SHarish Mahendrakar     return val;
383*103e46e4SHarish Mahendrakar 
384*103e46e4SHarish Mahendrakar   Time& t = *time;
385*103e46e4SHarish Mahendrakar 
386*103e46e4SHarish Mahendrakar   // The presence of a colon character indicates that we have
387*103e46e4SHarish Mahendrakar   // an [HH:]MM:SS style syntax.
388*103e46e4SHarish Mahendrakar 
389*103e46e4SHarish Mahendrakar   if (line[idx] == ':') {
390*103e46e4SHarish Mahendrakar     // We have either HH:MM:SS or MM:SS
391*103e46e4SHarish Mahendrakar 
392*103e46e4SHarish Mahendrakar     // The value we just parsed is either the hours or minutes.
393*103e46e4SHarish Mahendrakar     // It must be followed by another number value (that is
394*103e46e4SHarish Mahendrakar     // either minutes or seconds).
395*103e46e4SHarish Mahendrakar 
396*103e46e4SHarish Mahendrakar     const int first_val = val;
397*103e46e4SHarish Mahendrakar 
398*103e46e4SHarish Mahendrakar     ++idx;  // consume colon
399*103e46e4SHarish Mahendrakar 
400*103e46e4SHarish Mahendrakar     // Parse second value
401*103e46e4SHarish Mahendrakar 
402*103e46e4SHarish Mahendrakar     val = ParseNumber(line, &idx);
403*103e46e4SHarish Mahendrakar 
404*103e46e4SHarish Mahendrakar     if (val < 0)
405*103e46e4SHarish Mahendrakar       return val;
406*103e46e4SHarish Mahendrakar 
407*103e46e4SHarish Mahendrakar     if (val >= 60)  // either MM or SS
408*103e46e4SHarish Mahendrakar       return -1;
409*103e46e4SHarish Mahendrakar 
410*103e46e4SHarish Mahendrakar     if (line[idx] == ':') {
411*103e46e4SHarish Mahendrakar       // We have HH:MM:SS
412*103e46e4SHarish Mahendrakar 
413*103e46e4SHarish Mahendrakar       t.hours = first_val;
414*103e46e4SHarish Mahendrakar       t.minutes = val;  // vetted above
415*103e46e4SHarish Mahendrakar 
416*103e46e4SHarish Mahendrakar       ++idx;  // consume MM:SS colon
417*103e46e4SHarish Mahendrakar 
418*103e46e4SHarish Mahendrakar       // We have parsed the hours and minutes.
419*103e46e4SHarish Mahendrakar       // We must now parse the seconds.
420*103e46e4SHarish Mahendrakar 
421*103e46e4SHarish Mahendrakar       val = ParseNumber(line, &idx);
422*103e46e4SHarish Mahendrakar 
423*103e46e4SHarish Mahendrakar       if (val < 0)
424*103e46e4SHarish Mahendrakar         return val;
425*103e46e4SHarish Mahendrakar 
426*103e46e4SHarish Mahendrakar       if (val >= 60)  // SS part of HH:MM:SS
427*103e46e4SHarish Mahendrakar         return -1;
428*103e46e4SHarish Mahendrakar 
429*103e46e4SHarish Mahendrakar       t.seconds = val;
430*103e46e4SHarish Mahendrakar     } else {
431*103e46e4SHarish Mahendrakar       // We have MM:SS
432*103e46e4SHarish Mahendrakar 
433*103e46e4SHarish Mahendrakar       // The implication here is that the hour value was omitted
434*103e46e4SHarish Mahendrakar       // from the timestamp (because it was 0).
435*103e46e4SHarish Mahendrakar 
436*103e46e4SHarish Mahendrakar       if (first_val >= 60)  // minutes
437*103e46e4SHarish Mahendrakar         return -1;
438*103e46e4SHarish Mahendrakar 
439*103e46e4SHarish Mahendrakar       t.hours = 0;
440*103e46e4SHarish Mahendrakar       t.minutes = first_val;
441*103e46e4SHarish Mahendrakar       t.seconds = val;  // vetted above
442*103e46e4SHarish Mahendrakar     }
443*103e46e4SHarish Mahendrakar   } else {
444*103e46e4SHarish Mahendrakar     // We have SS (only)
445*103e46e4SHarish Mahendrakar 
446*103e46e4SHarish Mahendrakar     // The time is expressed as total number of seconds,
447*103e46e4SHarish Mahendrakar     // so the seconds value has no upper bound.
448*103e46e4SHarish Mahendrakar 
449*103e46e4SHarish Mahendrakar     t.seconds = val;
450*103e46e4SHarish Mahendrakar 
451*103e46e4SHarish Mahendrakar     // Convert SS to HH:MM:SS
452*103e46e4SHarish Mahendrakar 
453*103e46e4SHarish Mahendrakar     t.minutes = t.seconds / 60;
454*103e46e4SHarish Mahendrakar     t.seconds -= t.minutes * 60;
455*103e46e4SHarish Mahendrakar 
456*103e46e4SHarish Mahendrakar     t.hours = t.minutes / 60;
457*103e46e4SHarish Mahendrakar     t.minutes -= t.hours * 60;
458*103e46e4SHarish Mahendrakar   }
459*103e46e4SHarish Mahendrakar 
460*103e46e4SHarish Mahendrakar   // We have parsed the hours, minutes, and seconds.
461*103e46e4SHarish Mahendrakar   // We must now parse the milliseconds.
462*103e46e4SHarish Mahendrakar 
463*103e46e4SHarish Mahendrakar   char c = line[idx];
464*103e46e4SHarish Mahendrakar 
465*103e46e4SHarish Mahendrakar   // TODO(matthewjheaney): one option here is to slightly relax the
466*103e46e4SHarish Mahendrakar   // syntax rules for WebVTT timestamps, to permit the comma character
467*103e46e4SHarish Mahendrakar   // to also be used as the seconds/milliseconds separator.  This
468*103e46e4SHarish Mahendrakar   // would handle streams that use localization conventions for
469*103e46e4SHarish Mahendrakar   // countries in Western Europe.  For now we obey the rules specified
470*103e46e4SHarish Mahendrakar   // in the WebVTT spec (allow "full stop" only).
471*103e46e4SHarish Mahendrakar 
472*103e46e4SHarish Mahendrakar   const bool have_milliseconds = (c == '.');
473*103e46e4SHarish Mahendrakar 
474*103e46e4SHarish Mahendrakar   if (!have_milliseconds) {
475*103e46e4SHarish Mahendrakar     t.milliseconds = 0;
476*103e46e4SHarish Mahendrakar   } else {
477*103e46e4SHarish Mahendrakar     ++idx;  // consume FULL STOP
478*103e46e4SHarish Mahendrakar 
479*103e46e4SHarish Mahendrakar     val = ParseNumber(line, &idx);
480*103e46e4SHarish Mahendrakar 
481*103e46e4SHarish Mahendrakar     if (val < 0)
482*103e46e4SHarish Mahendrakar       return val;
483*103e46e4SHarish Mahendrakar 
484*103e46e4SHarish Mahendrakar     if (val >= 1000)
485*103e46e4SHarish Mahendrakar       return -1;
486*103e46e4SHarish Mahendrakar 
487*103e46e4SHarish Mahendrakar     if (val < 10)
488*103e46e4SHarish Mahendrakar       t.milliseconds = val * 100;
489*103e46e4SHarish Mahendrakar     else if (val < 100)
490*103e46e4SHarish Mahendrakar       t.milliseconds = val * 10;
491*103e46e4SHarish Mahendrakar     else
492*103e46e4SHarish Mahendrakar       t.milliseconds = val;
493*103e46e4SHarish Mahendrakar   }
494*103e46e4SHarish Mahendrakar 
495*103e46e4SHarish Mahendrakar   // We have parsed the time proper.  We must check for any
496*103e46e4SHarish Mahendrakar   // junk that immediately follows the time specifier.
497*103e46e4SHarish Mahendrakar 
498*103e46e4SHarish Mahendrakar   c = line[idx];
499*103e46e4SHarish Mahendrakar 
500*103e46e4SHarish Mahendrakar   if (c != kNUL && c != kSPACE && c != kTAB)
501*103e46e4SHarish Mahendrakar     return -1;
502*103e46e4SHarish Mahendrakar 
503*103e46e4SHarish Mahendrakar   return 0;  // success
504*103e46e4SHarish Mahendrakar }
505*103e46e4SHarish Mahendrakar 
ParseSettings(const std::string & line,std::string::size_type idx,Cue::settings_t * settings)506*103e46e4SHarish Mahendrakar int Parser::ParseSettings(const std::string& line, std::string::size_type idx,
507*103e46e4SHarish Mahendrakar                           Cue::settings_t* settings) {
508*103e46e4SHarish Mahendrakar   settings->clear();
509*103e46e4SHarish Mahendrakar 
510*103e46e4SHarish Mahendrakar   if (idx == std::string::npos || idx >= line.length())
511*103e46e4SHarish Mahendrakar     return -1;
512*103e46e4SHarish Mahendrakar 
513*103e46e4SHarish Mahendrakar   for (;;) {
514*103e46e4SHarish Mahendrakar     // We must parse a line comprising a sequence of 0 or more
515*103e46e4SHarish Mahendrakar     // NAME:VALUE pairs, separated by whitespace.  The line iself is
516*103e46e4SHarish Mahendrakar     // terminated with a NUL char (indicating end-of-line).
517*103e46e4SHarish Mahendrakar 
518*103e46e4SHarish Mahendrakar     for (;;) {
519*103e46e4SHarish Mahendrakar       const char c = line[idx];
520*103e46e4SHarish Mahendrakar 
521*103e46e4SHarish Mahendrakar       if (c == kNUL)  // end-of-line
522*103e46e4SHarish Mahendrakar         return 0;  // success
523*103e46e4SHarish Mahendrakar 
524*103e46e4SHarish Mahendrakar       if (c != kSPACE && c != kTAB)
525*103e46e4SHarish Mahendrakar         break;
526*103e46e4SHarish Mahendrakar 
527*103e46e4SHarish Mahendrakar       ++idx;  // consume whitespace
528*103e46e4SHarish Mahendrakar     }
529*103e46e4SHarish Mahendrakar 
530*103e46e4SHarish Mahendrakar     // We have consumed the whitespace, and have not yet reached
531*103e46e4SHarish Mahendrakar     // end-of-line, so there is something on the line for us to parse.
532*103e46e4SHarish Mahendrakar 
533*103e46e4SHarish Mahendrakar     settings->push_back(Setting());
534*103e46e4SHarish Mahendrakar     Setting& s = settings->back();
535*103e46e4SHarish Mahendrakar 
536*103e46e4SHarish Mahendrakar     // Parse the NAME part of the settings pair.
537*103e46e4SHarish Mahendrakar 
538*103e46e4SHarish Mahendrakar     for (;;) {
539*103e46e4SHarish Mahendrakar       const char c = line[idx];
540*103e46e4SHarish Mahendrakar 
541*103e46e4SHarish Mahendrakar       if (c == ':')  // we have reached end of NAME part
542*103e46e4SHarish Mahendrakar         break;
543*103e46e4SHarish Mahendrakar 
544*103e46e4SHarish Mahendrakar       if (c == kNUL || c == kSPACE || c == kTAB)
545*103e46e4SHarish Mahendrakar         return -1;
546*103e46e4SHarish Mahendrakar 
547*103e46e4SHarish Mahendrakar       s.name.push_back(c);
548*103e46e4SHarish Mahendrakar 
549*103e46e4SHarish Mahendrakar       ++idx;
550*103e46e4SHarish Mahendrakar     }
551*103e46e4SHarish Mahendrakar 
552*103e46e4SHarish Mahendrakar     if (s.name.empty())
553*103e46e4SHarish Mahendrakar       return -1;
554*103e46e4SHarish Mahendrakar 
555*103e46e4SHarish Mahendrakar     ++idx;  // consume colon
556*103e46e4SHarish Mahendrakar 
557*103e46e4SHarish Mahendrakar     // Parse the VALUE part of the settings pair.
558*103e46e4SHarish Mahendrakar 
559*103e46e4SHarish Mahendrakar     for (;;) {
560*103e46e4SHarish Mahendrakar       const char c = line[idx];
561*103e46e4SHarish Mahendrakar 
562*103e46e4SHarish Mahendrakar       if (c == kNUL || c == kSPACE || c == kTAB)
563*103e46e4SHarish Mahendrakar         break;
564*103e46e4SHarish Mahendrakar 
565*103e46e4SHarish Mahendrakar       if (c == ':')  // suspicious when part of VALUE
566*103e46e4SHarish Mahendrakar         return -1;  // TODO(matthewjheaney): verify this behavior
567*103e46e4SHarish Mahendrakar 
568*103e46e4SHarish Mahendrakar       s.value.push_back(c);
569*103e46e4SHarish Mahendrakar 
570*103e46e4SHarish Mahendrakar       ++idx;
571*103e46e4SHarish Mahendrakar     }
572*103e46e4SHarish Mahendrakar 
573*103e46e4SHarish Mahendrakar     if (s.value.empty())
574*103e46e4SHarish Mahendrakar       return -1;
575*103e46e4SHarish Mahendrakar   }
576*103e46e4SHarish Mahendrakar }
577*103e46e4SHarish Mahendrakar 
ParseNumber(const std::string & line,std::string::size_type * idx_ptr)578*103e46e4SHarish Mahendrakar int Parser::ParseNumber(const std::string& line,
579*103e46e4SHarish Mahendrakar                         std::string::size_type* idx_ptr) {
580*103e46e4SHarish Mahendrakar   if (idx_ptr == NULL)
581*103e46e4SHarish Mahendrakar     return -1;
582*103e46e4SHarish Mahendrakar 
583*103e46e4SHarish Mahendrakar   std::string::size_type& idx = *idx_ptr;
584*103e46e4SHarish Mahendrakar 
585*103e46e4SHarish Mahendrakar   if (idx == std::string::npos || idx >= line.length())
586*103e46e4SHarish Mahendrakar     return -1;
587*103e46e4SHarish Mahendrakar 
588*103e46e4SHarish Mahendrakar   if (!isdigit(line[idx]))
589*103e46e4SHarish Mahendrakar     return -1;
590*103e46e4SHarish Mahendrakar 
591*103e46e4SHarish Mahendrakar   int result = 0;
592*103e46e4SHarish Mahendrakar 
593*103e46e4SHarish Mahendrakar   while (isdigit(line[idx])) {
594*103e46e4SHarish Mahendrakar     const char c = line[idx];
595*103e46e4SHarish Mahendrakar     const int i = c - '0';
596*103e46e4SHarish Mahendrakar 
597*103e46e4SHarish Mahendrakar     if (result > INT_MAX / 10)
598*103e46e4SHarish Mahendrakar       return -1;
599*103e46e4SHarish Mahendrakar 
600*103e46e4SHarish Mahendrakar     result *= 10;
601*103e46e4SHarish Mahendrakar 
602*103e46e4SHarish Mahendrakar     if (result > INT_MAX - i)
603*103e46e4SHarish Mahendrakar       return -1;
604*103e46e4SHarish Mahendrakar 
605*103e46e4SHarish Mahendrakar     result += i;
606*103e46e4SHarish Mahendrakar 
607*103e46e4SHarish Mahendrakar     ++idx;
608*103e46e4SHarish Mahendrakar   }
609*103e46e4SHarish Mahendrakar 
610*103e46e4SHarish Mahendrakar   return result;
611*103e46e4SHarish Mahendrakar }
612*103e46e4SHarish Mahendrakar 
operator ==(const Time & rhs) const613*103e46e4SHarish Mahendrakar bool Time::operator==(const Time& rhs) const {
614*103e46e4SHarish Mahendrakar   if (hours != rhs.hours)
615*103e46e4SHarish Mahendrakar     return false;
616*103e46e4SHarish Mahendrakar 
617*103e46e4SHarish Mahendrakar   if (minutes != rhs.minutes)
618*103e46e4SHarish Mahendrakar     return false;
619*103e46e4SHarish Mahendrakar 
620*103e46e4SHarish Mahendrakar   if (seconds != rhs.seconds)
621*103e46e4SHarish Mahendrakar     return false;
622*103e46e4SHarish Mahendrakar 
623*103e46e4SHarish Mahendrakar   return (milliseconds == rhs.milliseconds);
624*103e46e4SHarish Mahendrakar }
625*103e46e4SHarish Mahendrakar 
operator <(const Time & rhs) const626*103e46e4SHarish Mahendrakar bool Time::operator<(const Time& rhs) const {
627*103e46e4SHarish Mahendrakar   if (hours < rhs.hours)
628*103e46e4SHarish Mahendrakar     return true;
629*103e46e4SHarish Mahendrakar 
630*103e46e4SHarish Mahendrakar   if (hours > rhs.hours)
631*103e46e4SHarish Mahendrakar     return false;
632*103e46e4SHarish Mahendrakar 
633*103e46e4SHarish Mahendrakar   if (minutes < rhs.minutes)
634*103e46e4SHarish Mahendrakar     return true;
635*103e46e4SHarish Mahendrakar 
636*103e46e4SHarish Mahendrakar   if (minutes > rhs.minutes)
637*103e46e4SHarish Mahendrakar     return false;
638*103e46e4SHarish Mahendrakar 
639*103e46e4SHarish Mahendrakar   if (seconds < rhs.seconds)
640*103e46e4SHarish Mahendrakar     return true;
641*103e46e4SHarish Mahendrakar 
642*103e46e4SHarish Mahendrakar   if (seconds > rhs.seconds)
643*103e46e4SHarish Mahendrakar     return false;
644*103e46e4SHarish Mahendrakar 
645*103e46e4SHarish Mahendrakar   return (milliseconds < rhs.milliseconds);
646*103e46e4SHarish Mahendrakar }
647*103e46e4SHarish Mahendrakar 
operator >(const Time & rhs) const648*103e46e4SHarish Mahendrakar bool Time::operator>(const Time& rhs) const { return rhs.operator<(*this); }
649*103e46e4SHarish Mahendrakar 
operator <=(const Time & rhs) const650*103e46e4SHarish Mahendrakar bool Time::operator<=(const Time& rhs) const { return !this->operator>(rhs); }
651*103e46e4SHarish Mahendrakar 
operator >=(const Time & rhs) const652*103e46e4SHarish Mahendrakar bool Time::operator>=(const Time& rhs) const { return !this->operator<(rhs); }
653*103e46e4SHarish Mahendrakar 
presentation() const654*103e46e4SHarish Mahendrakar presentation_t Time::presentation() const {
655*103e46e4SHarish Mahendrakar   const presentation_t h = 1000LL * 3600LL * presentation_t(hours);
656*103e46e4SHarish Mahendrakar   const presentation_t m = 1000LL * 60LL * presentation_t(minutes);
657*103e46e4SHarish Mahendrakar   const presentation_t s = 1000LL * presentation_t(seconds);
658*103e46e4SHarish Mahendrakar   const presentation_t result = h + m + s + milliseconds;
659*103e46e4SHarish Mahendrakar   return result;
660*103e46e4SHarish Mahendrakar }
661*103e46e4SHarish Mahendrakar 
presentation(presentation_t d)662*103e46e4SHarish Mahendrakar Time& Time::presentation(presentation_t d) {
663*103e46e4SHarish Mahendrakar   if (d < 0) {  // error
664*103e46e4SHarish Mahendrakar     hours = 0;
665*103e46e4SHarish Mahendrakar     minutes = 0;
666*103e46e4SHarish Mahendrakar     seconds = 0;
667*103e46e4SHarish Mahendrakar     milliseconds = 0;
668*103e46e4SHarish Mahendrakar 
669*103e46e4SHarish Mahendrakar     return *this;
670*103e46e4SHarish Mahendrakar   }
671*103e46e4SHarish Mahendrakar 
672*103e46e4SHarish Mahendrakar   seconds = static_cast<int>(d / 1000);
673*103e46e4SHarish Mahendrakar   milliseconds = static_cast<int>(d - 1000 * seconds);
674*103e46e4SHarish Mahendrakar 
675*103e46e4SHarish Mahendrakar   minutes = seconds / 60;
676*103e46e4SHarish Mahendrakar   seconds -= 60 * minutes;
677*103e46e4SHarish Mahendrakar 
678*103e46e4SHarish Mahendrakar   hours = minutes / 60;
679*103e46e4SHarish Mahendrakar   minutes -= 60 * hours;
680*103e46e4SHarish Mahendrakar 
681*103e46e4SHarish Mahendrakar   return *this;
682*103e46e4SHarish Mahendrakar }
683*103e46e4SHarish Mahendrakar 
operator +=(presentation_t rhs)684*103e46e4SHarish Mahendrakar Time& Time::operator+=(presentation_t rhs) {
685*103e46e4SHarish Mahendrakar   const presentation_t d = this->presentation();
686*103e46e4SHarish Mahendrakar   const presentation_t dd = d + rhs;
687*103e46e4SHarish Mahendrakar   this->presentation(dd);
688*103e46e4SHarish Mahendrakar   return *this;
689*103e46e4SHarish Mahendrakar }
690*103e46e4SHarish Mahendrakar 
operator +(presentation_t d) const691*103e46e4SHarish Mahendrakar Time Time::operator+(presentation_t d) const {
692*103e46e4SHarish Mahendrakar   Time t(*this);
693*103e46e4SHarish Mahendrakar   t += d;
694*103e46e4SHarish Mahendrakar   return t;
695*103e46e4SHarish Mahendrakar }
696*103e46e4SHarish Mahendrakar 
operator -=(presentation_t d)697*103e46e4SHarish Mahendrakar Time& Time::operator-=(presentation_t d) { return this->operator+=(-d); }
698*103e46e4SHarish Mahendrakar 
operator -(const Time & t) const699*103e46e4SHarish Mahendrakar presentation_t Time::operator-(const Time& t) const {
700*103e46e4SHarish Mahendrakar   const presentation_t rhs = t.presentation();
701*103e46e4SHarish Mahendrakar   const presentation_t lhs = this->presentation();
702*103e46e4SHarish Mahendrakar   const presentation_t result = lhs - rhs;
703*103e46e4SHarish Mahendrakar   return result;
704*103e46e4SHarish Mahendrakar }
705*103e46e4SHarish Mahendrakar 
706*103e46e4SHarish Mahendrakar }  // namespace libwebvtt
707