Program Listing for File xh_scanner.h
↰ Return to documentation for file (src/translator/xh_scanner.h)
// https://www.codeproject.com/Articles/14076/Fast-and-Compact-HTML-XML-Scanner-Tokenizer
// BSD license
//|
//| simple and fast XML/HTML scanner/tokenizer
//|
//| (C) Andrew Fedoniouk @ terrainformatica.com
//|
#include <cassert>
#include <cstring>
#include <string_view>
namespace markup {
/// Minimal forward-only character cursor over a caller-supplied range.
///
/// The stream only stores pointers into the range it was given; it never
/// copies the characters, so the underlying buffer has to stay alive for as
/// long as the stream is used.
struct instream {
  const char *p;      // current read position
  const char *begin;  // first character of the range
  const char *end;    // one past the last character of the range

  /// Wraps a NUL-terminated string. A null `src` is treated as empty input
  /// instead of being handed to strlen (which would be undefined behaviour).
  explicit instream(const char *src)
      : p(src), begin(src), end(src != nullptr ? src + std::strlen(src) : src) {}

  /// Wraps the half-open range [begin, end).
  instream(const char *begin, const char *end) : p(begin), begin(begin), end(end) {}

  /// Returns the character at the read position and advances past it.
  /// Returns 0 once the end is reached; note that this sentinel cannot be
  /// told apart from a NUL byte stored inside the range.
  char consume() { return p < end ? *p++ : 0; }

  /// Returns the character at the read position without advancing, or 0 at
  /// the end of the range.
  char peek() const { return p < end ? *p : 0; }

  /// Returns the current read position.
  const char *pos() const { return p; }
};
// Think string_view, but as a plain aggregate: both the pointer and the
// length are public, non-const fields that can be reassigned directly.
// The default member initializers make a default-constructed string_ref an
// empty range rather than leaving both fields indeterminate; brace
// initialization such as string_ref{ptr, len} keeps working.
struct string_ref {
  const char *data = nullptr;  // first character of the referenced range
  std::size_t size = 0;        // number of characters in the range
};
/// Pull-style XML/HTML tokenizer reading from an `instream`.
///
/// Every call to next() runs the currently selected scan routine (scanBody
/// right after construction) and returns the type of the token it produced.
/// The scanner keeps a reference to the stream it was constructed with, so
/// that stream must outlive the scanner.
class Scanner {
 public:
  enum TokenType {
    TT_ERROR = -1,
    TT_EOF = 0,

    // Start of an element, e.g. `<tag ...`.
    TT_TAG_START,

    // End of an element: either a closing tag `</tag>` or the end of a
    // self-closing tag `<tag ... />`.
    TT_TAG_END,

    // An attribute inside a tag, e.g. `<tag attr="value" >`.
    // attribute() and value() will be filled with 'attr' and 'value'.
    TT_ATTRIBUTE,

    // Text content, e.g. the `xxx` in `<tag>xxx</tag>`.
    // Text like `<tag>foo && bar</tag>` is reported at several positions,
    // i.e. as more than one TT_TEXT token.
    // NOTE(review): presumably the split happens around entities (see
    // scanEntity) -- confirm against the implementation.
    // Comes after TT_TAG_START or as the first token if the input
    // begins with text instead of a root element.
    TT_TEXT,

    // Content of a comment (`<!-- foo -->`), of a processing instruction
    // (`<? ... ?>`), or of a `<script>...</script>` / `<style>...</style>`
    // element.
    // Comes after TT_COMMENT_START, TT_PROCESSING_INSTRUCTION_START, or
    // TT_TAG_START if the tag was <script> or <style>.
    TT_DATA,

    // Start of a comment such as `<!-- foo -->`.
    TT_COMMENT_START,

    // End of a comment such as `<!-- foo -->`.
    TT_COMMENT_END,

    // Start of a processing instruction such as `<?xml version="1.0"?>`.
    TT_PROCESSING_INSTRUCTION_START,

    // End of a processing instruction such as `<?xml version="1.0"?>`.
    TT_PROCESSING_INSTRUCTION_END,
  };

  // The initializer list follows the declaration order of the data members,
  // which is the order in which they are actually initialized.
  explicit Scanner(instream &is)
      : value_{nullptr, 0},
        tagName_{nullptr, 0},
        attributeName_{nullptr, 0},
        scanFun_(&Scanner::scanBody),
        input_(is),
        start_(nullptr),
        gotTail_(false) {}

  // get next token
  TokenType next() { return (this->*scanFun_)(); }

  // get value of TT_TEXT, TT_ATTRIBUTE and TT_DATA
  std::string_view value() const;

  // get attribute name
  std::string_view attribute() const;

  // get tag name
  std::string_view tag() const;

  // Start position of a token (see start_).
  const char *start() const { return start_; }

 private: /* methods */
  // Pointer to the member function that produces the next token.
  using ScanPtr = TokenType (Scanner::*)();

  // Consumes the text around and between tags
  TokenType scanBody();

  // Consumes name="attr"
  TokenType scanAttribute();

  // Consumes <!-- ... -->
  TokenType scanComment();

  // Consumes <?name [attrs]?>
  TokenType scanProcessingInstruction();

  // Consumes ...</style> and ...</script>
  TokenType scanSpecial();

  // Consumes <tagname and </tagname>
  TokenType scanTag();

  // Consumes '&' etc, emits parentTokenType
  TokenType scanEntity(TokenType parentTokenType);

  size_t skipWhitespace();

  bool resolveEntity(string_ref const &buffer, string_ref &decoded) const;

  static bool isWhitespace(char c);

 private: /* data */
  string_ref value_;
  string_ref tagName_;
  string_ref attributeName_;

  ScanPtr scanFun_;  // current 'reader'

  instream &input_;

  // Start position of a token.
  const char *start_;

  bool gotTail_;  // aux flag used in scanComment, scanSpecial, scanProcessingInstruction
};
} // namespace markup