LCOV - code coverage report
Current view: top level - third_party/protobuf/src/google/protobuf/io - tokenizer.h (source / functions) Hit Total Coverage
Test: tmp.zDYK9MVh93 Lines: 7 12 58.3 %
Date: 2015-10-10 Functions: 1 2 50.0 %

          Line data    Source code
       1             : // Protocol Buffers - Google's data interchange format
       2             : // Copyright 2008 Google Inc.  All rights reserved.
       3             : // https://developers.google.com/protocol-buffers/
       4             : //
       5             : // Redistribution and use in source and binary forms, with or without
       6             : // modification, are permitted provided that the following conditions are
       7             : // met:
       8             : //
       9             : //     * Redistributions of source code must retain the above copyright
      10             : // notice, this list of conditions and the following disclaimer.
      11             : //     * Redistributions in binary form must reproduce the above
      12             : // copyright notice, this list of conditions and the following disclaimer
      13             : // in the documentation and/or other materials provided with the
      14             : // distribution.
      15             : //     * Neither the name of Google Inc. nor the names of its
      16             : // contributors may be used to endorse or promote products derived from
      17             : // this software without specific prior written permission.
      18             : //
      19             : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
      20             : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
      21             : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
      22             : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
      23             : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
      24             : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
      25             : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
      26             : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
      27             : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
      28             : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
      29             : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      30             : 
      31             : // Author: kenton@google.com (Kenton Varda)
      32             : //  Based on original Protocol Buffers design by
      33             : //  Sanjay Ghemawat, Jeff Dean, and others.
      34             : //
      35             : // Class for parsing tokenized text from a ZeroCopyInputStream.
      36             : 
      37             : #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
      38             : #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
      39             : 
      40             : #include <string>
      41             : #include <vector>
      42             : #include <google/protobuf/stubs/common.h>
      43             : #include <google/protobuf/stubs/logging.h>
      44             : 
      45             : namespace google {
      46             : namespace protobuf {
      47             : namespace io {
      48             : 
      49             : class ZeroCopyInputStream;     // zero_copy_stream.h
      50             : 
      51             : // Defined in this file.
      52             : class ErrorCollector;
      53             : class Tokenizer;
      54             : 
      55             : // Abstract interface for an object which collects the errors that occur
      56             : // during parsing.  A typical implementation might simply print the errors
      57             : // to stdout.
      58             : class LIBPROTOBUF_EXPORT ErrorCollector {
      59             :  public:
      60         113 :   inline ErrorCollector() {}
      61             :   virtual ~ErrorCollector();
      62             : 
      63             :   // Indicates that there was an error in the input at the given line and
      64             :   // column numbers.  The numbers are zero-based, so you may want to add
      65             :   // 1 to each before printing them.
      66             :   virtual void AddError(int line, int column, const string& message) = 0;
      67             : 
      68             :   // Indicates that there was a warning in the input at the given line and
      69             :   // column numbers.  The numbers are zero-based, so you may want to add
      70             :   // 1 to each before printing them.
      71           0 :   virtual void AddWarning(int /* line */, int /* column */,
      72           0 :                           const string& /* message */) { }
      73             : 
      74             :  private:
      75             :   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
      76             : };
      77             : 
      78             : // This class converts a stream of raw text into a stream of tokens for
      79             : // the protocol definition parser to parse.  The tokens recognized are
      80             : // similar to those that make up the C language; see the TokenType enum for
      81             : // precise descriptions.  Whitespace and comments are skipped.  By default,
      82             : // C- and C++-style comments are recognized, but other styles can be used by
      83             : // calling set_comment_style().
      84             : class LIBPROTOBUF_EXPORT Tokenizer {
      85             :  public:
      86             :   // Construct a Tokenizer that reads and tokenizes text from the given
      87             :   // input stream and writes errors to the given error_collector.
      88             :   // The caller keeps ownership of input and error_collector.
      89             :   Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
      90             :   ~Tokenizer();
      91             : 
      92             :   enum TokenType {
      93             :     TYPE_START,       // Next() has not yet been called.
      94             :     TYPE_END,         // End of input reached.  "text" is empty.
      95             : 
      96             :     TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
      97             :                       // starting with a digit.  It is an error for a number
      98             :                       // to be followed by an identifier with no space in
      99             :                       // between.
     100             :     TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
     101             :                       // the digits are decimal, but a prefix of "0x" indicates
     102             :                       // a hex number and a leading zero indicates octal, just
     103             :                       // like with C numeric literals.  A leading negative sign
     104             :                       // is NOT included in the token; it's up to the parser to
     105             :                       // interpret the unary minus operator on its own.
     106             :     TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
     107             :                       // an exponent.  Always in decimal.  Again, never
     108             :                       // negative.
     109             :     TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
     110             :                       // or double quotes can be used, but they must match.
     111             :                       // A string literal cannot cross a line break.
     112             :     TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
     113             :                       // Symbols are always a single character, so "!+$%" is
     114             :                       // four tokens.
     115             :   };
     116             : 
     117             :   // Structure representing a token read from the token stream.
     118       71478 :   struct Token {
     119             :     TokenType type;
     120             :     string text;       // The exact text of the token as it appeared in
     121             :                        // the input.  e.g. tokens of TYPE_STRING will still
     122             :                        // be escaped and in quotes.
     123             : 
     124             :     // "line" and "column" specify the position of the first character of
     125             :     // the token within the input stream.  They are zero-based.
     126             :     int line;
     127             :     int column;
     128             :     int end_column;
     129             :   };
     130             : 
     131             :   // Get the current token.  This is updated when Next() is called.  Before
     132             :   // the first call to Next(), current() has type TYPE_START and no contents.
     133             :   const Token& current();
     134             : 
     135             :   // Return the previous token -- i.e. what current() returned before the
     136             :   // previous call to Next().
     137             :   const Token& previous();
     138             : 
     139             :   // Advance to the next token.  Returns false if the end of the input is
     140             :   // reached.
     141             :   bool Next();
     142             : 
     143             :   // Like Next(), but also collects comments which appear between the previous
     144             :   // and next tokens.
     145             :   //
     146             :   // Comments which appear to be attached to the previous token are stored
     147             :   // in *prev_trailing_comments.  Comments which appear to be attached to the
     148             :   // next token are stored in *next_leading_comments.  Comments appearing in
     149             :   // between which do not appear to be attached to either will be added to
     150             :   // *detached_comments.  Any of these parameters can be NULL to simply discard
     151             :   // the comments.
     152             :   //
     153             :   // A series of line comments appearing on consecutive lines, with no other
     154             :   // tokens appearing on those lines, will be treated as a single comment.
     155             :   //
     156             :   // Only the comment content is returned; comment markers (e.g. //) are
     157             :   // stripped out.  For block comments, leading whitespace and an asterisk will
     158             :   // be stripped from the beginning of each line other than the first.  Newlines
     159             :   // are included in the output.
     160             :   //
     161             :   // Examples:
     162             :   //
     163             :   //   optional int32 foo = 1;  // Comment attached to foo.
     164             :   //   // Comment attached to bar.
     165             :   //   optional int32 bar = 2;
     166             :   //
     167             :   //   optional string baz = 3;
     168             :   //   // Comment attached to baz.
     169             :   //   // Another line attached to baz.
     170             :   //
     171             :   //   // Comment attached to qux.
     172             :   //   //
     173             :   //   // Another line attached to qux.
     174             :   //   optional double qux = 4;
     175             :   //
     176             :   //   // Detached comment.  This is not attached to qux or corge
     177             :   //   // because there are blank lines separating it from both.
     178             :   //
     179             :   //   optional string corge = 5;
     180             :   //   /* Block comment attached
     181             :   //    * to corge.  Leading asterisks
     182             :   //    * will be removed. */
     183             :   //   /* Block comment attached to
     184             :   //    * grault. */
     185             :   //   optional int32 grault = 6;
     186             :   bool NextWithComments(string* prev_trailing_comments,
     187             :                         vector<string>* detached_comments,
     188             :                         string* next_leading_comments);
     189             : 
     190             :   // Parse helpers ---------------------------------------------------
     191             : 
     192             :   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
     193             :   // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
     194             :   // result is undefined (possibly an assert failure).
     195             :   static double ParseFloat(const string& text);
     196             : 
     197             :   // Parses a TYPE_STRING token.  This never fails, so long as the text actually
     198             :   // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
     199             :   // result is undefined (possibly an assert failure).
     200             :   static void ParseString(const string& text, string* output);
     201             : 
     202             :   // Identical to ParseString, but appends to output.
     203             :   static void ParseStringAppend(const string& text, string* output);
     204             : 
     205             :   // Parses a TYPE_INTEGER token.  Returns false if the result would be
     206             :   // greater than max_value.  Otherwise, returns true and sets *output to the
     207             :   // result.  If the text is not from a Token of type TYPE_INTEGER originally
     208             :   // parsed by a Tokenizer, the result is undefined (possibly an assert
     209             :   // failure).
     210             :   static bool ParseInteger(const string& text, uint64 max_value,
     211             :                            uint64* output);
     212             : 
     213             :   // Options ---------------------------------------------------------
     214             : 
     215             :   // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
     216             :   // which would otherwise be integers but which have the 'f' suffix will be
     217             :   // forced to be interpreted as floats.  For all other purposes, the 'f' is
     218             :   // ignored.
     219           9 :   void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
     220             : 
     221             :   // Valid values for set_comment_style().
     222             :   enum CommentStyle {
     223             :     // Line comments begin with "//", block comments are delimited by "/*" and
     224             :     // "*/".
     225             :     CPP_COMMENT_STYLE,
     226             :     // Line comments begin with "#".  No way to write block comments.
     227             :     SH_COMMENT_STYLE
     228             :   };
     229             : 
     230             :   // Sets the comment style.
     231           9 :   void set_comment_style(CommentStyle style) { comment_style_ = style; }
     232             : 
     233             :   // Whether to require whitespace between a number and a field name.
     234             :   // Default is true. Do not use this; for Google-internal cleanup only.
     235             :   void set_require_space_after_number(bool require) {
     236           0 :     require_space_after_number_ = require;
     237             :   }
     238             : 
     239             :   // Whether to allow string literals to span multiple lines. Default is false.
     240             :   // Do not use this; for Google-internal cleanup only.
     241             :   void set_allow_multiline_strings(bool allow) {
     242           0 :     allow_multiline_strings_ = allow;
     243             :   }
     244             : 
     245             :   // External helper: validate an identifier.
     246             :   static bool IsIdentifier(const string& text);
     247             : 
     248             :   // -----------------------------------------------------------------
     249             :  private:
     250             :   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
     251             : 
     252             :   Token current_;           // Returned by current().
     253             :   Token previous_;          // Returned by previous().
     254             : 
     255             :   ZeroCopyInputStream* input_;
     256             :   ErrorCollector* error_collector_;
     257             : 
     258             :   char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
     259             :   const char* buffer_;      // Current buffer returned from input_.
     260             :   int buffer_size_;         // Size of buffer_.
     261             :   int buffer_pos_;          // Current position within the buffer.
     262             :   bool read_error_;         // Did we previously encounter a read error?
     263             : 
     264             :   // Line and column number of current_char_ within the whole input stream.
     265             :   int line_;
     266             :   int column_;
     267             : 
     268             :   // String to which text should be appended as we advance through it.
     269             :   // Call RecordTo(&str) to start recording and StopRecording() to stop.
     270             :   // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
     271             :   // position within the current buffer where recording started.
     272             :   string* record_target_;
     273             :   int record_start_;
     274             : 
     275             :   // Options.
     276             :   bool allow_f_after_float_;
     277             :   CommentStyle comment_style_;
     278             :   bool require_space_after_number_;
     279             :   bool allow_multiline_strings_;
     280             : 
     281             :   // Since we count columns we need to interpret tabs somehow.  We'll take
     282             :   // the standard 8-character definition for lack of any way to do better.
     283             :   static const int kTabWidth = 8;
     284             : 
     285             :   // -----------------------------------------------------------------
     286             :   // Helper methods.
     287             : 
     288             :   // Consume this character and advance to the next one.
     289             :   void NextChar();
     290             : 
     291             :   // Read a new buffer from the input.
     292             :   void Refresh();
     293             : 
     294             :   inline void RecordTo(string* target);
     295             :   inline void StopRecording();
     296             : 
     297             :   // Called when the current character is the first character of a new
     298             :   // token (not including whitespace or comments).
     299             :   inline void StartToken();
     300             :   // Called when the current character is the first character after the
     301             :   // end of the last token.  After this returns, current_.text will
     302             :   // contain all text consumed since StartToken() was called.
     303             :   inline void EndToken();
     304             : 
     305             :   // Convenience method to add an error at the current line and column.
     306             :   void AddError(const string& message) {
     307           0 :     error_collector_->AddError(line_, column_, message);
     308             :   }
     309             : 
     310             :   // -----------------------------------------------------------------
     311             :   // The following four methods are used to consume tokens of specific
     312             :   // types.  They are actually used to consume all characters *after*
     313             :   // the first, since the calling function consumes the first character
     314             :   // in order to decide what kind of token is being read.
     315             : 
     316             :   // Read and consume a string, ending when the given delimiter is
     317             :   // consumed.
     318             :   void ConsumeString(char delimiter);
     319             : 
     320             :   // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
     321             :   // depending on what was read.  This needs to know if the first
     322             :   // character was a zero in order to correctly recognize hex and octal
     323             :   // numbers.
     324             :   // It also needs to know if the first character was a '.' to parse floating
     325             :   // point correctly.
     326             :   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
     327             : 
     328             :   // Consume the rest of a line.
     329             :   void ConsumeLineComment(string* content);
     330             :   // Consume until "*/".
     331             :   void ConsumeBlockComment(string* content);
     332             : 
     333             :   enum NextCommentStatus {
     334             :     // Started a line comment.
     335             :     LINE_COMMENT,
     336             : 
     337             :     // Started a block comment.
     338             :     BLOCK_COMMENT,
     339             : 
     340             :     // Consumed a slash, then realized it wasn't a comment.  current_ has
     341             :     // been filled in with a slash token.  The caller should return it.
     342             :     SLASH_NOT_COMMENT,
     343             : 
     344             :     // We do not appear to be starting a comment here.
     345             :     NO_COMMENT
     346             :   };
     347             : 
     348             :   // If we're at the start of a new comment, consume it and return what kind
     349             :   // of comment it is.
     350             :   NextCommentStatus TryConsumeCommentStart();
     351             : 
     352             :   // -----------------------------------------------------------------
     353             :   // These helper methods make the parsing code more readable.  The
     354             :   // "character classes" referred to are defined at the top of the .cc file.
     355             :   // Basically, each one is a C++ class with one method:
     356             :   //   static bool InClass(char c);
     357             :   // The method returns true if c is a member of this "class", like "Letter"
     358             :   // or "Digit".
     359             : 
     360             :   // Returns true if the current character is of the given character
     361             :   // class, but does not consume anything.
     362             :   template<typename CharacterClass>
     363             :   inline bool LookingAt();
     364             : 
     365             :   // If the current character is in the given class, consume it and return
     366             :   // true.  Otherwise return false.
     367             :   // e.g. TryConsumeOne<Letter>()
     368             :   template<typename CharacterClass>
     369             :   inline bool TryConsumeOne();
     370             : 
     371             :   // Like above, but try to consume the specific character indicated.
     372             :   inline bool TryConsume(char c);
     373             : 
     374             :   // Consume zero or more of the given character class.
     375             :   template<typename CharacterClass>
     376             :   inline void ConsumeZeroOrMore();
     377             : 
     378             :   // Consume one or more of the given character class or log the given
     379             :   // error message.
     380             :   // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
     381             :   template<typename CharacterClass>
     382             :   inline void ConsumeOneOrMore(const char* error);
     383             : };
     384             : 
     385             : // inline methods ====================================================
     386             : inline const Tokenizer::Token& Tokenizer::current() {
     387             :   return current_;
     388             : }
     389             : 
     390             : inline const Tokenizer::Token& Tokenizer::previous() {
     391             :   return previous_;
     392             : }
     393             : 
     394        1242 : inline void Tokenizer::ParseString(const string& text, string* output) {
     395             :   output->clear();
     396        1242 :   ParseStringAppend(text, output);
     397        1242 : }
     398             : 
     399             : }  // namespace io
     400             : }  // namespace protobuf
     401             : 
     402             : }  // namespace google
     403             : #endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__

Generated by: LCOV version 1.10