diff options
Diffstat (limited to 'NorthstarDedicatedTest/include/protobuf/io/tokenizer.h')
-rw-r--r-- | NorthstarDedicatedTest/include/protobuf/io/tokenizer.h | 440 |
1 files changed, 440 insertions, 0 deletions
diff --git a/NorthstarDedicatedTest/include/protobuf/io/tokenizer.h b/NorthstarDedicatedTest/include/protobuf/io/tokenizer.h new file mode 100644 index 00000000..a4059081 --- /dev/null +++ b/NorthstarDedicatedTest/include/protobuf/io/tokenizer.h @@ -0,0 +1,440 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// Class for parsing tokenized text from a ZeroCopyInputStream. + +#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__ +#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__ + + +#include <string> +#include <vector> + +#include <stubs/common.h> +#include <stubs/logging.h> +#include <port_def.inc> + +namespace google { +namespace protobuf { +namespace io { + +class ZeroCopyInputStream; // zero_copy_stream.h + +// Defined in this file. +class ErrorCollector; +class Tokenizer; + +// By "column number", the proto compiler refers to a count of the number +// of bytes before a given byte, except that a tab character advances to +// the next multiple of 8 bytes. Note in particular that column numbers +// are zero-based, while many user interfaces use one-based column numbers. +typedef int ColumnNumber; + +// Abstract interface for an object which collects the errors that occur +// during parsing. A typical implementation might simply print the errors +// to stdout. +class PROTOBUF_EXPORT ErrorCollector { + public: + inline ErrorCollector() {} + virtual ~ErrorCollector(); + + // Indicates that there was an error in the input at the given line and + // column numbers. The numbers are zero-based, so you may want to add + // 1 to each before printing them. + virtual void AddError(int line, ColumnNumber column, + const std::string& message) = 0; + + // Indicates that there was a warning in the input at the given line and + // column numbers. The numbers are zero-based, so you may want to add + // 1 to each before printing them. + virtual void AddWarning(int /* line */, ColumnNumber /* column */, + const std::string& /* message */) {} + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector); +}; + +// This class converts a stream of raw text into a stream of tokens for +// the protocol definition parser to parse. The tokens recognized are +// similar to those that make up the C language; see the TokenType enum for +// precise descriptions. Whitespace and comments are skipped. By default, +// C- and C++-style comments are recognized, but other styles can be used by +// calling set_comment_style(). +class PROTOBUF_EXPORT Tokenizer { + public: + // Construct a Tokenizer that reads and tokenizes text from the given + // input stream and writes errors to the given error_collector. + // The caller keeps ownership of input and error_collector. + Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector); + ~Tokenizer(); + + enum TokenType { + TYPE_START, // Next() has not yet been called. + TYPE_END, // End of input reached. "text" is empty. + + TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not + // starting with a digit. It is an error for a number + // to be followed by an identifier with no space in + // between. + TYPE_INTEGER, // A sequence of digits representing an integer. Normally + // the digits are decimal, but a prefix of "0x" indicates + // a hex number and a leading zero indicates octal, just + // like with C numeric literals. A leading negative sign + // is NOT included in the token; it's up to the parser to + // interpret the unary minus operator on its own. + TYPE_FLOAT, // A floating point literal, with a fractional part and/or + // an exponent. Always in decimal. Again, never + // negative. + TYPE_STRING, // A quoted sequence of escaped characters. Either single + // or double quotes can be used, but they must match. + // A string literal cannot cross a line break. + TYPE_SYMBOL, // Any other printable character, like '!' or '+'. + // Symbols are always a single character, so "!+$%" is + // four tokens. + TYPE_WHITESPACE, // A sequence of whitespace. This token type is only + // produced if report_whitespace() is true. It is not + // reported for whitespace within comments or strings. + TYPE_NEWLINE, // A newline (\n). This token type is only + // produced if report_whitespace() is true and + // report_newlines() is true. It is not reported for + // newlines in comments or strings. + }; + + // Structure representing a token read from the token stream. + struct Token { + TokenType type; + std::string text; // The exact text of the token as it appeared in + // the input. e.g. tokens of TYPE_STRING will still + // be escaped and in quotes. + + // "line" and "column" specify the position of the first character of + // the token within the input stream. They are zero-based. + int line; + ColumnNumber column; + ColumnNumber end_column; + }; + + // Get the current token. This is updated when Next() is called. Before + // the first call to Next(), current() has type TYPE_START and no contents. + const Token& current(); + + // Return the previous token -- i.e. what current() returned before the + // previous call to Next(). + const Token& previous(); + + // Advance to the next token. Returns false if the end of the input is + // reached. + bool Next(); + + // Like Next(), but also collects comments which appear between the previous + // and next tokens. + // + // Comments which appear to be attached to the previous token are stored + // in *prev_tailing_comments. Comments which appear to be attached to the + // next token are stored in *next_leading_comments. Comments appearing in + // between which do not appear to be attached to either will be added to + // detached_comments. Any of these parameters can be NULL to simply discard + // the comments. + // + // A series of line comments appearing on consecutive lines, with no other + // tokens appearing on those lines, will be treated as a single comment. + // + // Only the comment content is returned; comment markers (e.g. //) are + // stripped out. For block comments, leading whitespace and an asterisk will + // be stripped from the beginning of each line other than the first. Newlines + // are included in the output. + // + // Examples: + // + // optional int32 foo = 1; // Comment attached to foo. + // // Comment attached to bar. + // optional int32 bar = 2; + // + // optional string baz = 3; + // // Comment attached to baz. + // // Another line attached to baz. + // + // // Comment attached to qux. + // // + // // Another line attached to qux. + // optional double qux = 4; + // + // // Detached comment. This is not attached to qux or corge + // // because there are blank lines separating it from both. + // + // optional string corge = 5; + // /* Block comment attached + // * to corge. Leading asterisks + // * will be removed. */ + // /* Block comment attached to + // * grault. */ + // optional int32 grault = 6; + bool NextWithComments(std::string* prev_trailing_comments, + std::vector<std::string>* detached_comments, + std::string* next_leading_comments); + + // Parse helpers --------------------------------------------------- + + // Parses a TYPE_FLOAT token. This never fails, so long as the text actually + // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the + // result is undefined (possibly an assert failure). + static double ParseFloat(const std::string& text); + + // Parses a TYPE_STRING token. This never fails, so long as the text actually + // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the + // result is undefined (possibly an assert failure). + static void ParseString(const std::string& text, std::string* output); + + // Identical to ParseString, but appends to output. + static void ParseStringAppend(const std::string& text, std::string* output); + + // Parses a TYPE_INTEGER token. Returns false if the result would be + // greater than max_value. Otherwise, returns true and sets *output to the + // result. If the text is not from a Token of type TYPE_INTEGER originally + // parsed by a Tokenizer, the result is undefined (possibly an assert + // failure). + static bool ParseInteger(const std::string& text, uint64_t max_value, + uint64_t* output); + + // Options --------------------------------------------------------- + + // Set true to allow floats to be suffixed with the letter 'f'. Tokens + // which would otherwise be integers but which have the 'f' suffix will be + // forced to be interpreted as floats. For all other purposes, the 'f' is + // ignored. + void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; } + + // Valid values for set_comment_style(). + enum CommentStyle { + // Line comments begin with "//", block comments are delimited by "/*" and + // "*/". + CPP_COMMENT_STYLE, + // Line comments begin with "#". No way to write block comments. + SH_COMMENT_STYLE + }; + + // Sets the comment style. + void set_comment_style(CommentStyle style) { comment_style_ = style; } + + // Whether to require whitespace between a number and a field name. + // Default is true. Do not use this; for Google-internal cleanup only. + void set_require_space_after_number(bool require) { + require_space_after_number_ = require; + } + + // Whether to allow string literals to span multiple lines. Default is false. + // Do not use this; for Google-internal cleanup only. + void set_allow_multiline_strings(bool allow) { + allow_multiline_strings_ = allow; + } + + // If true, whitespace tokens are reported by Next(). + // Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`. + bool report_whitespace() const; + void set_report_whitespace(bool report); + + // If true, newline tokens are reported by Next(). + // Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`. + bool report_newlines() const; + void set_report_newlines(bool report); + + // External helper: validate an identifier. + static bool IsIdentifier(const std::string& text); + + // ----------------------------------------------------------------- + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer); + + Token current_; // Returned by current(). + Token previous_; // Returned by previous(). + + ZeroCopyInputStream* input_; + ErrorCollector* error_collector_; + + char current_char_; // == buffer_[buffer_pos_], updated by NextChar(). + const char* buffer_; // Current buffer returned from input_. + int buffer_size_; // Size of buffer_. + int buffer_pos_; // Current position within the buffer. + bool read_error_; // Did we previously encounter a read error? + + // Line and column number of current_char_ within the whole input stream. + int line_; + ColumnNumber column_; + + // String to which text should be appended as we advance through it. + // Call RecordTo(&str) to start recording and StopRecording() to stop. + // E.g. StartToken() calls RecordTo(¤t_.text). record_start_ is the + // position within the current buffer where recording started. + std::string* record_target_; + int record_start_; + + // Options. + bool allow_f_after_float_; + CommentStyle comment_style_; + bool require_space_after_number_; + bool allow_multiline_strings_; + bool report_whitespace_ = false; + bool report_newlines_ = false; + + // Since we count columns we need to interpret tabs somehow. We'll take + // the standard 8-character definition for lack of any way to do better. + // This must match the documentation of ColumnNumber. + static const int kTabWidth = 8; + + // ----------------------------------------------------------------- + // Helper methods. + + // Consume this character and advance to the next one. + void NextChar(); + + // Read a new buffer from the input. + void Refresh(); + + inline void RecordTo(std::string* target); + inline void StopRecording(); + + // Called when the current character is the first character of a new + // token (not including whitespace or comments). + inline void StartToken(); + // Called when the current character is the first character after the + // end of the last token. After this returns, current_.text will + // contain all text consumed since StartToken() was called. + inline void EndToken(); + + // Convenience method to add an error at the current line and column. + void AddError(const std::string& message) { + error_collector_->AddError(line_, column_, message); + } + + // ----------------------------------------------------------------- + // The following four methods are used to consume tokens of specific + // types. They are actually used to consume all characters *after* + // the first, since the calling function consumes the first character + // in order to decide what kind of token is being read. + + // Read and consume a string, ending when the given delimiter is + // consumed. + void ConsumeString(char delimiter); + + // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER + // depending on what was read. This needs to know if the first + // character was a zero in order to correctly recognize hex and octal + // numbers. + // It also needs to know if the first character was a . to parse floating + // point correctly. + TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot); + + // Consume the rest of a line. + void ConsumeLineComment(std::string* content); + // Consume until "*/". + void ConsumeBlockComment(std::string* content); + + enum NextCommentStatus { + // Started a line comment. + LINE_COMMENT, + + // Started a block comment. + BLOCK_COMMENT, + + // Consumed a slash, then realized it wasn't a comment. current_ has + // been filled in with a slash token. The caller should return it. + SLASH_NOT_COMMENT, + + // We do not appear to be starting a comment here. + NO_COMMENT + }; + + // If we're at the start of a new comment, consume it and return what kind + // of comment it is. + NextCommentStatus TryConsumeCommentStart(); + + // If we're looking at a TYPE_WHITESPACE token and `report_whitespace_` is + // true, consume it and return true. + bool TryConsumeWhitespace(); + + // If we're looking at a TYPE_NEWLINE token and `report_newlines_` is true, + // consume it and return true. + bool TryConsumeNewline(); + + // ----------------------------------------------------------------- + // These helper methods make the parsing code more readable. The + // "character classes" referred to are defined at the top of the .cc file. + // Basically it is a C++ class with one method: + // static bool InClass(char c); + // The method returns true if c is a member of this "class", like "Letter" + // or "Digit". + + // Returns true if the current character is of the given character + // class, but does not consume anything. + template <typename CharacterClass> + inline bool LookingAt(); + + // If the current character is in the given class, consume it and return + // true. Otherwise return false. + // e.g. TryConsumeOne<Letter>() + template <typename CharacterClass> + inline bool TryConsumeOne(); + + // Like above, but try to consume the specific character indicated. + inline bool TryConsume(char c); + + // Consume zero or more of the given character class. + template <typename CharacterClass> + inline void ConsumeZeroOrMore(); + + // Consume one or more of the given character class or log the given + // error message. + // e.g. ConsumeOneOrMore<Digit>("Expected digits."); + template <typename CharacterClass> + inline void ConsumeOneOrMore(const char* error); +}; + +// inline methods ==================================================== +inline const Tokenizer::Token& Tokenizer::current() { return current_; } + +inline const Tokenizer::Token& Tokenizer::previous() { return previous_; } + +inline void Tokenizer::ParseString(const std::string& text, + std::string* output) { + output->clear(); + ParseStringAppend(text, output); +} + +} // namespace io +} // namespace protobuf +} // namespace google + +#include <port_undef.inc> + +#endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__ |