Line data Source code
1 : // Protocol Buffers - Google's data interchange format
2 : // Copyright 2008 Google Inc. All rights reserved.
3 : // https://developers.google.com/protocol-buffers/
4 : //
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following disclaimer
13 : // in the documentation and/or other materials provided with the
14 : // distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived from
17 : // this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : // Author: kenton@google.com (Kenton Varda)
32 : // Based on original Protocol Buffers design by
33 : // Sanjay Ghemawat, Jeff Dean, and others.
34 : //
35 : // Here we have a hand-written lexer. At first you might ask yourself,
36 : // "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
37 : // yes I am crazy, but that's beside the point. There are actually reasons
38 : // why I ended up writing this this way.
39 : //
40 : // The traditional approach to lexing is to use lex to generate a lexer for
41 : // you. Unfortunately, lex's output is ridiculously ugly and difficult to
42 : // integrate cleanly with C++ code, especially abstract code or code meant
43 : // as a library. Better parser-generators exist but would add dependencies
44 : // which most users won't already have, which we'd like to avoid. (GNU flex
45 : // has a C++ output option, but it's still ridiculously ugly, non-abstract,
46 : // and not library-friendly.)
47 : //
48 : // The next approach that any good software engineer should look at is to
49 : // use regular expressions. And, indeed, I did. I have code which
50 : // implements this same class using regular expressions. It's about 200
51 : // lines shorter. However:
52 : // - Rather than error messages telling you "This string has an invalid
53 : // escape sequence at line 5, column 45", you get error messages like
54 : // "Parse error on line 5". Giving more precise errors requires adding
55 : // a lot of code that ends up basically as complex as the hand-coded
56 : // version anyway.
57 : // - The regular expression to match a string literal looks like this:
58 : // kString = new RE("(\"([^\"\\\\]|" // non-escaped
59 : // "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape
60 : // "\\\\x[0-9a-fA-F])*\"|" // hex escape
61 : // "\'([^\'\\\\]|" // Also support single-quotes.
62 : // "\\\\[abfnrtv?\"'\\\\0-7]|"
63 : // "\\\\x[0-9a-fA-F])*\')");
64 : // Verifying the correctness of this line noise is actually harder than
65 : // verifying the correctness of ConsumeString(), defined below. I'm not
66 : // even confident that the above is correct, after staring at it for some
67 : // time.
68 : // - PCRE is fast, but there's still more overhead involved than the code
69 : // below.
70 : // - Sadly, regular expressions are not part of the C standard library, so
71 : // using them would require depending on some other library. For the
72 : // open source release, this could be really annoying. Nobody likes
73 : // downloading one piece of software just to find that they need to
74 : // download something else to make it work, and in all likelihood
75 : // people downloading Protocol Buffers will already be doing so just
76 : // to make something else work. We could include a copy of PCRE with
77 : // our code, but that obligates us to keep it up-to-date and just seems
78 : // like a big waste just to save 200 lines of code.
79 : //
80 : // On a similar but unrelated note, I'm even scared to use ctype.h.
81 : // Apparently functions like isalpha() are locale-dependent. So, if we used
82 : // that, then if this code is being called from some program that doesn't
83 : // have its locale set to "C", it would behave strangely. We can't just set
84 : // the locale to "C" ourselves since we might break the calling program that
85 : // way, particularly if it is multi-threaded. WTF? Someone please let me
86 : // (Kenton) know if I'm missing something here...
87 : //
88 : // I'd love to hear about other alternatives, though, as this code isn't
89 : // exactly pretty.
90 :
91 : #include <google/protobuf/io/tokenizer.h>
92 : #include <google/protobuf/stubs/common.h>
93 : #include <google/protobuf/stubs/logging.h>
94 : #include <google/protobuf/stubs/stringprintf.h>
95 : #include <google/protobuf/io/strtod.h>
96 : #include <google/protobuf/io/zero_copy_stream.h>
97 : #include <google/protobuf/stubs/strutil.h>
98 : #include <google/protobuf/stubs/stl_util.h>
99 :
100 : namespace google {
101 : namespace protobuf {
102 : namespace io {
103 : namespace {
104 :
105 : // As mentioned above, I don't trust ctype.h due to the presence of "locales".
106 : // So, I have written replacement functions here. Someone please smack me if
107 : // this is a bad idea or if there is some way around this.
108 : //
109 : // These "character classes" are designed to be used in template methods.
110 : // For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
111 : // whitespace.
112 :
113 : // Note: No class is allowed to contain '\0', since this is used to mark end-
114 : // of-input and is handled specially.
115 :
// Defines a "character class": a class NAME whose static InClass(c) method
// returns true iff the char c satisfies EXPRESSION.  These classes are meant
// to be passed as template arguments, e.g. ConsumeZeroOrMore<Whitespace>().
#define CHARACTER_CLASS(NAME, EXPRESSION)      \
  class NAME {                                 \
   public:                                     \
    static inline bool InClass(char c) {       \
      return EXPRESSION;                       \
    }                                          \
  }

CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
                            c == '\r' || c == '\v' || c == '\f');
CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
                                     c == '\r' || c == '\v' || c == '\f');

// Control characters other than '\0' ('\0' marks end-of-input and is
// handled specially, so no class may contain it).
CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');

CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
                          ('a' <= c && c <= 'f') ||
                          ('A' <= c && c <= 'F'));

// Characters that may start an identifier (underscore counts as a letter).
CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
                        ('A' <= c && c <= 'Z') ||
                        (c == '_'));

// Characters that may appear after the first character of an identifier.
CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
                              ('A' <= c && c <= 'Z') ||
                              ('0' <= c && c <= '9') ||
                              (c == '_'));

// Characters that may legally follow a backslash in a string literal
// (simple one-character escapes; octal/hex/unicode are handled separately).
CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                        c == 'r' || c == 't' || c == 'v' || c == '\\' ||
                        c == '?' || c == '\'' || c == '\"');

#undef CHARACTER_CLASS
151 :
152 : // Given a char, interpret it as a numeric digit and return its value.
153 : // This supports any number base up to 36.
// Given a char, interpret it as a numeric digit and return its value.
// Supports any number base up to 36; returns -1 for characters that are
// not valid digits in any such base.
inline int DigitValue(char digit) {
  if (digit >= '0' && digit <= '9') {
    return digit - '0';
  }
  if (digit >= 'a' && digit <= 'z') {
    return 10 + (digit - 'a');
  }
  if (digit >= 'A' && digit <= 'Z') {
    return 10 + (digit - 'A');
  }
  return -1;  // Not a digit in any base <= 36.
}
160 :
161 : // Inline because it's only used in one place.
// Maps the character following a backslash to the character it represents
// (e.g. 'n' -> '\n').  Escape sequences are expected to have been validated
// separately; unknown escapes map to '?'.  Inline because it's only used in
// one place.
inline char TranslateEscape(char c) {
  // Table of (escape letter, translated character) pairs.
  static const char kEscapes[][2] = {
    {'a', '\a'}, {'b', '\b'}, {'f', '\f'}, {'n', '\n'},
    {'r', '\r'}, {'t', '\t'}, {'v', '\v'}, {'\\', '\\'},
    {'?', '\?'},  // Trigraphs = :(
    {'\'', '\''}, {'"', '\"'},
  };
  const int kNumEscapes = sizeof(kEscapes) / sizeof(kEscapes[0]);
  for (int i = 0; i < kNumEscapes; ++i) {
    if (kEscapes[i][0] == c) {
      return kEscapes[i][1];
    }
  }
  return '?';  // Unrecognized escape; validation happens elsewhere.
}
180 :
181 : } // anonymous namespace
182 :
// Out-of-line destructor anchors the ErrorCollector interface's vtable here.
ErrorCollector::~ErrorCollector() {}
184 :
185 : // ===================================================================
186 :
// Constructs a tokenizer reading from 'input', reporting errors to
// 'error_collector'.  Neither pointer is owned; both must outlive this
// object.
Tokenizer::Tokenizer(ZeroCopyInputStream* input,
                     ErrorCollector* error_collector)
  : input_(input),
    error_collector_(error_collector),
    buffer_(NULL),
    buffer_size_(0),
    buffer_pos_(0),
    read_error_(false),
    line_(0),
    column_(0),
    record_target_(NULL),
    record_start_(-1),
    allow_f_after_float_(false),
    comment_style_(CPP_COMMENT_STYLE),
    require_space_after_number_(true),
    allow_multiline_strings_(false) {

  current_.line = 0;
  current_.column = 0;
  current_.end_column = 0;
  current_.type = TYPE_START;

  // Pull the first buffer from the stream so current_char_ is valid.
  Refresh();
}
211 :
Tokenizer::~Tokenizer() {
  // If we had any buffer left unread, return it to the underlying stream
  // so that someone else can read it.
  if (buffer_size_ > buffer_pos_) {
    input_->BackUp(buffer_size_ - buffer_pos_);
  }
}
219 :
220 : // -------------------------------------------------------------------
221 : // Internal helpers.
222 :
// Consume the current character and advance to the next, keeping line_ and
// column_ in sync with the new position.
void Tokenizer::NextChar() {
  // Update our line and column counters based on the character being
  // consumed.
  if (current_char_ == '\n') {
    ++line_;
    column_ = 0;
  } else if (current_char_ == '\t') {
    // A tab advances the column to the next multiple of kTabWidth.
    column_ += kTabWidth - column_ % kTabWidth;
  } else {
    ++column_;
  }

  // Advance to the next character.
  ++buffer_pos_;
  if (buffer_pos_ < buffer_size_) {
    current_char_ = buffer_[buffer_pos_];
  } else {
    // Ran off the end of the current buffer; fetch the next one.
    Refresh();
  }
}
243 :
// Fetch the next non-empty buffer from the input stream and point
// current_char_ at its first byte.  On end of stream (or read error) sets
// read_error_ and leaves current_char_ == '\0'.
void Tokenizer::Refresh() {
  if (read_error_) {
    current_char_ = '\0';
    return;
  }

  // If we're in a token, append the rest of the buffer to it.
  if (record_target_ != NULL && record_start_ < buffer_size_) {
    record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
    record_start_ = 0;
  }

  const void* data = NULL;
  buffer_ = NULL;
  buffer_pos_ = 0;
  do {
    if (!input_->Next(&data, &buffer_size_)) {
      // end of stream (or read error)
      buffer_size_ = 0;
      read_error_ = true;
      current_char_ = '\0';
      return;
    }
  } while (buffer_size_ == 0);  // ZeroCopyInputStream may return empty buffers.

  buffer_ = static_cast<const char*>(data);

  current_char_ = buffer_[0];
}
273 :
// Begin copying all characters consumed from here on into *target, until
// StopRecording() is called.  Refresh() flushes partial buffers into the
// target as they are exhausted.
inline void Tokenizer::RecordTo(string* target) {
  record_target_ = target;
  record_start_ = buffer_pos_;
}
278 :
// Stop the recording started by RecordTo(), flushing any not-yet-appended
// portion of the current buffer into the target.
inline void Tokenizer::StopRecording() {
  // Note: The if() is necessary because some STL implementations crash when
  //   you call string::append(NULL, 0), presumably because they are trying to
  //   be helpful by detecting the NULL pointer, even though there's nothing
  //   wrong with reading zero bytes from NULL.
  if (buffer_pos_ != record_start_) {
    record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
  }
  record_target_ = NULL;
  record_start_ = -1;
}
290 :
// Begin a new token at the current position; its text accumulates into
// current_.text via the recording mechanism until EndToken().
inline void Tokenizer::StartToken() {
  current_.type = TYPE_START;    // Just for the sake of initializing it.
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  RecordTo(&current_.text);
}
298 :
// Finish the token started by StartToken(), capturing its end column.
inline void Tokenizer::EndToken() {
  StopRecording();
  current_.end_column = column_;
}
303 :
304 : // -------------------------------------------------------------------
305 : // Helper methods that consume characters.
306 :
// Returns true if the current character belongs to CharacterClass, without
// consuming it.
template<typename CharacterClass>
inline bool Tokenizer::LookingAt() {
  return CharacterClass::InClass(current_char_);
}
311 :
312 : template<typename CharacterClass>
313 32253 : inline bool Tokenizer::TryConsumeOne() {
314 102139 : if (CharacterClass::InClass(current_char_)) {
315 16735 : NextChar();
316 13341 : return true;
317 : } else {
318 : return false;
319 : }
320 : }
321 :
322 : inline bool Tokenizer::TryConsume(char c) {
323 143538 : if (current_char_ == c) {
324 20114 : NextChar();
325 : return true;
326 : } else {
327 : return false;
328 : }
329 : }
330 :
// Consume characters for as long as they belong to CharacterClass (possibly
// none at all).
template<typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() {
  while (CharacterClass::InClass(current_char_)) {
    NextChar();
  }
}
337 :
// Consume one or more characters of CharacterClass.  If the current
// character does not qualify, report 'error' and consume nothing.
template<typename CharacterClass>
inline void Tokenizer::ConsumeOneOrMore(const char* error) {
  if (!CharacterClass::InClass(current_char_)) {
    AddError(error);
  } else {
    do {
      NextChar();
    } while (CharacterClass::InClass(current_char_));
  }
}
348 :
349 : // -------------------------------------------------------------------
350 : // Methods that read whole patterns matching certain kinds of tokens
351 : // or comments.
352 :
// Consume the remainder of a string literal, the opening 'delimiter' having
// already been consumed.  Escape sequences are syntax-checked here but not
// translated; the raw text (including the closing delimiter) is recorded by
// the enclosing token machinery.
void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
        AddError("Unexpected end of string.");
        return;

      case '\n': {
        if (!allow_multiline_strings_) {
          AddError("String literals cannot cross line boundaries.");
          return;
        }
        NextChar();
        break;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid single-character escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // An octal escape.  Possibly followed by two more octal digits,
          // but these will just be consumed by the main loop anyway so we
          // don't need to do so explicitly here.
        } else if (TryConsume('x') || TryConsume('X')) {
          // A hex escape: requires at least one hex digit.
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          // \u requires exactly four hex digits.
          if (!TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal, hence the forced "00" prefix and 0/1 third digit.
          if (!TryConsume('0') ||
              !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected eight hex digits up to 10ffff for \\U escape "
                     "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        if (current_char_ == delimiter) {
          // Closing quote: consume it and finish.
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}
421 :
// Consume the remainder of an integer or floating-point literal and return
// its token type.  The first character has already been consumed:
// started_with_zero means it was '0' (possible hex/octal prefix);
// started_with_dot means it was '.' (so this is definitely a float).
Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      // An '8' or '9' appeared after the octal digits.
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      TryConsume('-') || TryConsume('+');  // Optional exponent sign.
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  if (LookingAt<Letter>() && require_space_after_number_) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    if (is_float) {
      AddError(
        "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}
476 :
477 4025 : void Tokenizer::ConsumeLineComment(string* content) {
478 4025 : if (content != NULL) RecordTo(content);
479 :
480 199458 : while (current_char_ != '\0' && current_char_ != '\n') {
481 195433 : NextChar();
482 : }
483 : TryConsume('\n');
484 :
485 4025 : if (content != NULL) StopRecording();
486 4025 : }
487 :
// Consume a block comment (the "/*" introducer already having been
// consumed).  If 'content' is non-NULL the comment text is recorded into it,
// with each line's leading whitespace/'*' decoration and the trailing "*/"
// stripped.  Reports errors for nested "/*" and for EOF inside the comment.
void Tokenizer::ConsumeBlockComment(string* content) {
  int start_line = line_;
  int start_column = column_ - 2;  // Position of the already-consumed "/*".

  if (content != NULL) RecordTo(content);

  while (true) {
    // Scan to the next character that could matter: '*', '/', newline, or
    // end of input.
    while (current_char_ != '\0' &&
           current_char_ != '*' &&
           current_char_ != '/' &&
           current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      if (content != NULL) StopRecording();

      // Consume leading whitespace and asterisk;
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note: We didn't consume the '*' because if there is a '/' after it
      // we want to interpret that as the end of the comment.
      AddError(
        "\"/*\" inside block comment. Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(
        start_line, start_column, " Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}
537 :
// If the upcoming characters begin a comment in the configured style,
// consume the introducer ("//", "/*", or "#") and report which kind was
// found.  In C++ style, a lone '/' is turned into a ready-made symbol token
// and reported as SLASH_NOT_COMMENT.
Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
    if (TryConsume('/')) {
      return LINE_COMMENT;
    } else if (TryConsume('*')) {
      return BLOCK_COMMENT;
    } else {
      // Oops, it was just a slash.  Return it.
      current_.type = TYPE_SYMBOL;
      current_.text = "/";
      current_.line = line_;
      current_.column = column_ - 1;
      current_.end_column = column_;
      return SLASH_NOT_COMMENT;
    }
  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
    return LINE_COMMENT;
  } else {
    return NO_COMMENT;
  }
}
559 :
560 : // -------------------------------------------------------------------
561 :
// Advance to the next token, skipping whitespace and comments.  Returns
// false at end of input, in which case current_ becomes a TYPE_END token.
bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    // Skip comments; a bare '/' in C++ style is already a complete token.
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too.  But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error.  We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        // Leading zero: may be hex ("0x...") or octal.
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            error_collector_->AddError(line_, column_ - 2,
              "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Check if the high order bit is set: warn on non-ASCII bytes that
        // are about to be treated as symbols.
        if (current_char_ & 0x80) {
          error_collector_->AddError(line_, column_,
            StringPrintf("Interpreting non ascii codepoint %d.",
                         static_cast<unsigned char>(current_char_)));
        }
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}
655 :
656 : namespace {
657 :
658 : // Helper class for collecting comments and putting them in the right places.
659 : //
660 : // This basically just buffers the most recent comment until it can be decided
661 : // exactly where that comment should be placed. When Flush() is called, the
662 : // current comment goes into either prev_trailing_comments or detached_comments.
663 : // When the CommentCollector is destroyed, the last buffered comment goes into
664 : // next_leading_comments.
class CommentCollector {
 public:
  // Any of the three output pointers may be NULL if the caller does not
  // want that category of comments; all non-NULL outputs are cleared here.
  CommentCollector(string* prev_trailing_comments,
                   vector<string>* detached_comments,
                   string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForBlockComment() {
    // Block comments never merge with a previously-buffered comment.
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  // Discard the currently-buffered comment entirely.
  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.  The first flushed comment may attach to
  // the previous token as a trailing comment; later ones become detached.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  // After this, no comment can be attached to the previous token.
  void DetachFromPrev() {
    can_attach_to_prev_ = false;
  }

 private:
  string* prev_trailing_comments_;
  vector<string>* detached_comments_;
  string* next_leading_comments_;

  // Accumulates the comment currently being read.
  string comment_buffer_;

  // True if any comments were read into comment_buffer_.  This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};
756 :
757 : } // namespace
758 :
// Like Next(), but also collects comments: those trailing the previous
// token, free-standing (detached) comments, and those leading the returned
// token.  Any output pointer may be NULL to discard that category.
bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                                 vector<string>* detached_comments,
                                 string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // Ignore unicode byte order mark(BOM) if it appears at the file
    // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
    if (TryConsume((char)0xEF)) {
      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
                 "Only UTF-8 is accepted for proto file.");
        return false;
      }
    }
    // There is no previous token to attach comments to.
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line.  If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line.  There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line: ends any trailing-comment attachment.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result ||
              current_.text == "}" ||
              current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope.  In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          return result;
        }
        break;
    }
  }
}
853 :
854 : // -------------------------------------------------------------------
855 : // Token-parsing helpers. Remember that these don't need to report
856 : // errors since any errors should already have been reported while
857 : // tokenizing. Also, these can assume that whatever text they
858 : // are given is text that the tokenizer actually parsed as a token
859 : // of the given type.
860 :
// Parses 'text' (which must have been tokenized as a TYPE_INTEGER, so a
// valid decimal, hex "0x...", or octal "0..." literal) into *output.
// Returns false on overflow past max_value, leaving *output untouched.
bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
                             uint64* output) {
  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
  // is non-standard.  I hate the C standard library.  :(

  // return strtoull(text.c_str(), NULL, 0);

  const char* ptr = text.c_str();
  int base = 10;
  if (ptr[0] == '0') {
    if (ptr[1] == 'x' || ptr[1] == 'X') {
      // This is hex.
      base = 16;
      ptr += 2;
    } else {
      // This is octal.
      base = 8;
    }
  }

  uint64 result = 0;
  for (; *ptr != '\0'; ptr++) {
    int digit = DigitValue(*ptr);
    GOOGLE_LOG_IF(DFATAL, digit < 0 || digit >= base)
      << " Tokenizer::ParseInteger() passed text that could not have been"
         " tokenized as an integer: " << CEscape(text);
    // Overflow check before multiplying: would result*base+digit exceed
    // max_value?
    if (digit > max_value || result > (max_value - digit) / base) {
      // Overflow.
      return false;
    }
    result = result * base + digit;
  }

  *output = result;
  return true;
}
897 :
// Parses 'text' (which must have been tokenized as a TYPE_FLOAT) and
// returns its value, using a locale-independent strtod.
double Tokenizer::ParseFloat(const string& text) {
  const char* start = text.c_str();
  char* end;
  double result = NoLocaleStrtod(start, &end);

  // "1e" is not a valid float, but if the tokenizer reads it, it will
  // report an error but still return it as a valid token.  We need to
  // accept anything the tokenizer could possibly return, error or not.
  if (*end == 'e' || *end == 'E') {
    ++end;
    if (*end == '-' || *end == '+') ++end;
  }

  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
  // suffixed with the letter 'f'.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
    << " Tokenizer::ParseFloat() passed text that could not have been"
       " tokenized as a float: " << CEscape(text);
  return result;
}
922 :
923 : // Helper to append a Unicode code point to a string as UTF8, without bringing
924 : // in any external dependencies.
925 0 : static void AppendUTF8(uint32 code_point, string* output) {
926 0 : uint32 tmp = 0;
927 0 : int len = 0;
928 0 : if (code_point <= 0x7f) {
929 0 : tmp = code_point;
930 0 : len = 1;
931 0 : } else if (code_point <= 0x07ff) {
932 : tmp = 0x0000c080 |
933 0 : ((code_point & 0x07c0) << 2) |
934 0 : (code_point & 0x003f);
935 0 : len = 2;
936 0 : } else if (code_point <= 0xffff) {
937 : tmp = 0x00e08080 |
938 0 : ((code_point & 0xf000) << 4) |
939 0 : ((code_point & 0x0fc0) << 2) |
940 0 : (code_point & 0x003f);
941 0 : len = 3;
942 0 : } else if (code_point <= 0x1fffff) {
943 : tmp = 0xf0808080 |
944 0 : ((code_point & 0x1c0000) << 6) |
945 0 : ((code_point & 0x03f000) << 4) |
946 0 : ((code_point & 0x000fc0) << 2) |
947 0 : (code_point & 0x003f);
948 0 : len = 4;
949 : } else {
950 : // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
951 : // normally only defined up to there as well.
952 0 : StringAppendF(output, "\\U%08x", code_point);
953 0 : return;
954 : }
955 0 : tmp = ghtonl(tmp);
956 0 : output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
957 : }
958 :
959 : // Try to read <len> hex digits from ptr, and stuff the numeric result into
960 : // *result. Returns true if that many digits were successfully consumed.
961 0 : static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
962 0 : *result = 0;
963 0 : if (len == 0) return false;
964 0 : for (const char* end = ptr + len; ptr < end; ++ptr) {
965 0 : if (*ptr == '\0') return false;
966 0 : *result = (*result << 4) + DigitValue(*ptr);
967 : }
968 : return true;
969 : }
970 :
// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
// surrogate. These numbers are in a reserved range of Unicode code points, so
// if we encounter such a pair we know how to parse it and convert it into a
// single code point.
// Note: despite the names, the "Max" constants are *exclusive* upper bounds —
// the ranges are [0xd800, 0xdc00) for head and [0xdc00, 0xe000) for trail
// surrogates.
static const uint32 kMinHeadSurrogate = 0xd800;
static const uint32 kMaxHeadSurrogate = 0xdc00;
static const uint32 kMinTrailSurrogate = 0xdc00;
static const uint32 kMaxTrailSurrogate = 0xe000;
980 :
981 : static inline bool IsHeadSurrogate(uint32 code_point) {
982 0 : return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
983 : }
984 :
985 : static inline bool IsTrailSurrogate(uint32 code_point) {
986 0 : return (code_point >= kMinTrailSurrogate) &&
987 : (code_point < kMaxTrailSurrogate);
988 : }
989 :
990 : // Combine a head and trail surrogate into a single Unicode code point.
991 : static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
992 : GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
993 : GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
994 0 : return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
995 0 : (trail_surrogate - kMinTrailSurrogate));
996 : }
997 :
// Convert the escape sequence parameter to a number of expected hex digits:
// \u expects four, \U expects eight, and anything else is not a Unicode
// escape (zero digits).
static inline int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
1004 :
// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
// to parse that sequence. On success, returns a pointer to the first char
// beyond that sequence, and fills in *code_point. On failure, returns ptr
// itself (so the caller can tell nothing was consumed).
static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
  const char* p = ptr;
  // Fetch the code point: 'u' means 4 hex digits follow, 'U' means 8.
  const int len = UnicodeLength(*p++);
  if (!ReadHexDigits(p, len, code_point))
    return ptr;
  p += len;

  // Check if the code point we read is a "head surrogate." If so, then we
  // expect it to be immediately followed by another code point which is a valid
  // "trail surrogate," and together they form a UTF-16 pair which decodes into
  // a single Unicode point. Trail surrogates may only use \u, not \U.
  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
    uint32 trail_surrogate;
    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
        IsTrailSurrogate(trail_surrogate)) {
      *code_point = AssembleUTF16(*code_point, trail_surrogate);
      // Consume the trail escape: 2 chars for "\u" plus 4 hex digits.
      p += 6;
    }
    // If this failed, then we just emit the head surrogate as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}
1034 :
// Interprets a quoted string token (processing escape sequences) and appends
// the result to *output. The text string must begin and end with single or
// double quote characters; any malformed escapes were already reported as
// errors during tokenizing, so this routine only needs to produce *some*
// deterministic output for them, not a valid one.
void Tokenizer::ParseStringAppend(const string& text, string* output) {
  // Reminder: text[0] is always a quote character. (If text is
  // empty, it's invalid, so we'll just return).
  const size_t text_size = text.size();
  if (text_size == 0) {
    GOOGLE_LOG(DFATAL)
      << " Tokenizer::ParseStringAppend() passed text that could not"
         " have been tokenized as a string: " << CEscape(text);
    return;
  }

  // Reserve room for new string. The branch is necessary because if
  // there is already space available the reserve() call might
  // downsize the output.
  const size_t new_len = text_size + output->size();
  if (new_len > output->capacity()) {
    output->reserve(new_len);
  }

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences. We start at text.c_str() + 1 to skip
  // the opening quote. Note that any invalid escape sequences or other
  // errors were already reported while tokenizing. In this case we do
  // not need to produce valid results.
  for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
    if (*ptr == '\\' && ptr[1] != '\0') {
      // An escape sequence; advance onto the character after the backslash.
      ++ptr;

      if (OctalDigit::InClass(*ptr)) {
        // An octal escape. May be one, two, or three digits; each extra
        // digit is consumed only if the next character is also octal.
        int code = DigitValue(*ptr);
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'x') {
        // A hex escape. May have zero, one, or two digits. (The zero case
        // will have been caught as an error earlier.)
        int code = 0;
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = DigitValue(*ptr);
        }
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'u' || *ptr == 'U') {
        // A Unicode escape (\uXXXX or \UXXXXXXXX), possibly a UTF-16
        // surrogate pair; the decoded code point is appended as UTF-8.
        uint32 unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
          output->push_back(*ptr);
        } else {
          AppendUTF8(unicode, output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }
      } else {
        // Some other escape code (\n, \t, etc.).
        output->push_back(TranslateEscape(*ptr));
      }

    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.
    } else {
      // Plain character: copy through unchanged.
      output->push_back(*ptr);
    }
  }
}
1114 :
// Returns true if every character of s belongs to CharacterClass
// (vacuously true for the empty string).
template<typename CharacterClass>
static bool AllInClass(const string& s) {
  // Range-for avoids the signed/unsigned comparison the old index loop
  // had ("int i < s.size()").
  for (const char c : s) {
    if (!CharacterClass::InClass(c))
      return false;
  }
  return true;
}
1123 :
1124 34 : bool Tokenizer::IsIdentifier(const string& text) {
1125 : // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1126 34 : if (text.size() == 0)
1127 : return false;
1128 68 : if (!Letter::InClass(text.at(0)))
1129 : return false;
1130 68 : if (!AllInClass<Alphanumeric>(text.substr(1)))
1131 : return false;
1132 34 : return true;
1133 : }
1134 :
1135 : } // namespace io
1136 : } // namespace protobuf
1137 : } // namespace google
|