Program Listing for File xh_scanner.cpp
↰ Return to documentation for file (src/translator/xh_scanner.cpp)
// https://www.codeproject.com/Articles/14076/Fast-and-Compact-HTML-XML-Scanner-Tokenizer
// BSD license
#include "xh_scanner.h"
#include <cassert>
#include <cctype>
#include <cstring>
namespace {
// Stand-in for string_view::ends_with() when the suffix is a compile-time C string.
// The literal's trailing NUL is not part of the comparison.
template <typename Char_t, size_t Len>
inline bool endsWith(markup::string_ref &str, const Char_t (&suffix)[Len]) {
  constexpr size_t suffixLen = Len - 1;
  // A string shorter than the suffix can never end with it.
  if (str.size < suffixLen) return false;
  return std::memcmp(str.data + (str.size - suffixLen), suffix, suffixLen) == 0;
}
// Compares the first `len` characters of `lhs` and `rhs`, ignoring case as defined by
// std::tolower. Returns true when all of them match (trivially true for len == 0).
inline bool equalsCaseInsensitive(const char *lhs, const char *rhs, size_t len) {
  // std::tolower has undefined behaviour for negative values, hence the unsigned char cast.
  const auto lower = [](char c) { return std::tolower(static_cast<unsigned char>(c)); };
  for (const char *lhsEnd = lhs + len; lhs != lhsEnd; ++lhs, ++rhs) {
    if (lower(*lhs) != lower(*rhs)) return false;
  }
  return true;
}
// Convenience overload of the pointer-based comparison for a compile-time C string:
// true only when `lhs` has exactly the literal's length (without its trailing NUL) and
// every character matches ignoring case.
template <size_t Len>
inline bool equalsCaseInsensitive(markup::string_ref &lhs, const char (&rhs)[Len]) {
  constexpr size_t rhsLen = Len - 1;
  if (lhs.size != rhsLen) return false;
  return equalsCaseInsensitive(lhs.data, rhs, rhsLen);
}
// Case-sensitive equality between a string_ref and a compile-time C string.
// The literal's trailing NUL is not part of the comparison.
template <typename Char_t, size_t Len>
bool operator==(markup::string_ref const &str, const Char_t (&str2)[Len]) {
  constexpr size_t literalLen = Len - 1;
  if (str.size != literalLen) return false;
  return std::memcmp(str.data, str2, literalLen) == 0;
}
// Compile-time length of a C string literal, not counting its trailing NUL.
template <size_t N>
constexpr size_t length(char const (&/*unused*/)[N]) {
  constexpr size_t terminator = 1;
  return N - terminator;
}
} // end namespace
namespace markup {
// Accessors for the pieces of the most recently scanned token.
// The returned views are non-owning.
std::string_view Scanner::value() const { return std::string_view(value_.data, value_.size); }
std::string_view Scanner::attribute() const { return std::string_view(attributeName_.data, attributeName_.size); }
std::string_view Scanner::tag() const { return std::string_view(tagName_.data, tagName_.size); }
// Scans the content between tags.
// - At end of input ('\0') returns TT_EOF.
// - At '<' hands over to scanTag() and returns its result.
// - At '&' hands over to scanEntity(TT_TEXT) and returns its result.
// - Otherwise consumes characters up to (not including) the next '\0', '<' or '&',
//   leaves that run in value_ and returns TT_TEXT.
Scanner::TokenType Scanner::scanBody() {
  value_ = string_ref{input_.pos(), 0};
  start_ = input_.pos();

  const auto first = input_.peek();
  if (first == '\0') return TT_EOF;
  if (first == '<') return scanTag();
  if (first == '&') return scanEntity(TT_TEXT);

  // Plain text: extend value_ until something that needs its own token shows up.
  for (auto next = first; next != '\0' && next != '<' && next != '&'; next = input_.peek()) {
    input_.consume();
    ++value_.size;
  }
  return TT_TEXT;
}
// Scans one step inside an open tag, after the tag name:
// <tag attr="value">...</tag>
//     |------------|
// Each call handles exactly one of:
// - '>'  : end of the open tag. Switches to scanSpecial for elements whose content is
//          treated as opaque (script, style, textarea, ...) or to scanBody otherwise,
//          and returns whatever that scanner returns.
// - "/>" : end of a self-closing tag; switches to scanBody.
// - one attribute, with or without a (quoted or unquoted) value. The name ends up in
//   attributeName_, the value in value_ (value_ is {nullptr, 0} when there is no value).
// Returns:
// - TT_ATTRIBUTE if an attribute is read
// - TT_TAG_END if self-closing tag
// - TT_ERROR if '/' is not followed by '>', if '<' occurs inside an attribute name, or if
//   the input ends inside a quoted attribute value
// - TT_EOF if the input ends inside an attribute name or an unquoted attribute value
// - the result of scanSpecial or scanBody when the open tag ends with '>'
Scanner::TokenType Scanner::scanAttribute() {
  // Skip all whitespace between tag name or last attribute and next attribute or '>'
  skipWhitespace();

  // Does the open tag end here?
  switch (input_.peek()) {
    case '>':
      input_.consume();
      // Treat some elements as opaque, e.g. <script>, <style>
      if (/*equalsCaseInsensitive(tagName_, "title") ||*/ equalsCaseInsensitive(tagName_, "script") ||
          equalsCaseInsensitive(tagName_, "style") || equalsCaseInsensitive(tagName_, "textarea") ||
          equalsCaseInsensitive(tagName_, "iframe") || equalsCaseInsensitive(tagName_, "noembed") ||
          equalsCaseInsensitive(tagName_, "noscript") || equalsCaseInsensitive(tagName_, "noframes")) {
        // script is special because we want to parse the attributes,
        // but not the content
        scanFun_ = &Scanner::scanSpecial;
        return scanSpecial();
      }
      scanFun_ = &Scanner::scanBody;
      return scanBody();
    case '/':
      input_.consume();
      if (input_.peek() != '>') return TT_ERROR;
      // self closing tag
      input_.consume();
      scanFun_ = &Scanner::scanBody;
      return TT_TAG_END;
    default:
      break;
  }

  attributeName_ = string_ref{input_.pos(), 0};
  value_ = string_ref{nullptr, 0};

  // attribute name...
  while (input_.peek() != '=') {
    switch (input_.peek()) {
      case '\0':
        return TT_EOF;
      case '>':
        return TT_ATTRIBUTE;  // attribute without value (HTML style) at end of tag
      case '<':
        return TT_ERROR;
      default:
        if (skipWhitespace()) {
          if (input_.peek() == '=') {
            break;  // leaves the switch; the loop condition then sees '=' and stops
          } else {
            return TT_ATTRIBUTE;  // attribute without value (HTML style) but not yet at end of tag
          }
        }
        input_.consume();
        ++attributeName_.size;
        break;
    }
  }

  // consume '=' and any following whitespace
  input_.consume();
  skipWhitespace();

  // attribute value...
  const char first = input_.peek();
  if (first == '"' || first == '\'') {
    // Quoted value: everything up to the matching quote character.
    const char quote = input_.consume();
    value_ = string_ref{input_.pos(), 0};
    while (true) {
      if (input_.peek() == '\0') {
        return TT_ERROR;
      } else if (input_.peek() == quote) {
        input_.consume();
        return TT_ATTRIBUTE;
      } else {
        input_.consume();
        ++value_.size;
      }
    }
  }

  // Unquoted value: everything up to the next whitespace or '>'.
  value_ = string_ref{input_.pos(), 0};
  while (true) {
    const char next = input_.peek();
    // End of input inside the value. Without this check the loop has no exit condition
    // once the input is exhausted, since '\0' is neither whitespace nor '>'.
    if (next == '\0') return TT_EOF;
    if (isWhitespace(next)) return TT_ATTRIBUTE;
    if (next == '>') return TT_ATTRIBUTE;  // '>' will be consumed next round
    input_.consume();
    ++value_.size;
  }
}
// Scans the name of an open or closing tag:
// <tag attr="value">...</tag>
// |--|              |----|
// Returns:
// - TT_TAG_START once the name of an open tag is read (attributes are scanned next)
// - TT_TAG_END for a complete closing tag "</name>"
// - TT_COMMENT_START as soon as the name reads "!--"
// - TT_PROCESSING_INSTRUCTION_START as soon as the name reads "?"
// - TT_EOF if the input ends before the tag name is terminated
// - TT_ERROR if the input does not start with '<', or a closing tag name is not
//   followed by '>'
Scanner::TokenType Scanner::scanTag() {
  start_ = input_.pos();
  if (input_.consume() != '<') return TT_ERROR;

  const bool closing = (input_.peek() == '/');
  if (closing) input_.consume();

  tagName_ = string_ref{input_.pos(), 0};
  for (;;) {
    if (input_.peek() == '\0') break;
    // Whitespace, '/' and '>' all terminate the name.
    if (skipWhitespace() > 0) break;
    const auto next = input_.peek();
    if (next == '/' || next == '>') break;

    input_.consume();
    ++tagName_.size;

    // These checks run after every character, so they match eagerly:
    // "<?xml" is recognised the moment the name equals "?".
    if (tagName_ == "!--") {
      scanFun_ = &Scanner::scanComment;
      return TT_COMMENT_START;
    }
    if (tagName_ == "?") {
      scanFun_ = &Scanner::scanProcessingInstruction;
      return TT_PROCESSING_INSTRUCTION_START;
    }
  }

  if (input_.peek() == '\0') return TT_EOF;

  if (closing) {
    if (input_.consume() == '>') return TT_TAG_END;
    return TT_ERROR;
  }

  scanFun_ = &Scanner::scanAttribute;
  return TT_TAG_START;
}
// Scans a character reference such as "&amp;" starting at the current '&'.
// The scanned span (`entity`) includes the leading '&' and, when present, the closing ';'.
// Only alphabetic names are followed; any other character ends the scan without a ';'.
// If the span ends in ';' and resolveEntity() knows it, value_ becomes the decoded
// character. Otherwise value_ is the raw span, to be interpreted as text.
// Returns TT_ERROR if the input does not start with '&', parentTokenType in all other cases.
Scanner::TokenType Scanner::scanEntity(TokenType parentTokenType) {
  start_ = input_.pos();
  string_ref entity{input_.pos(), 0};

  if (input_.consume() != '&') return TT_ERROR;
  ++entity.size;  // Account for the consumed '&'

  // Consume the entity name up to and including ';'
  bool hasEnd = false;
  for (char next = input_.peek(); next != '\0'; next = input_.peek()) {
    if (next == ';') {
      input_.consume();
      ++entity.size;
      hasEnd = true;
      break;
    }
    // cast to unsigned char otherwise std::isalpha has undefined behaviour for
    // negative values (e.g. bytes of a multi-byte UTF-8 sequence)
    if (!std::isalpha(static_cast<unsigned char>(next))) break;
    input_.consume();
    ++entity.size;
  }

  // If we can decode the entity, do so
  if (hasEnd && resolveEntity(entity, value_)) return parentTokenType;

  // Otherwise, just yield the whole thing undecoded, interpret it as text
  value_ = entity;
  return parentTokenType;
}
// Decodes a named character reference.
// `buffer` is the complete reference including the leading '&' and the trailing ';'.
// On a match, `decoded` is pointed at a single static character and true is returned.
// On no match, `decoded` is left untouched and false is returned.
// Recognised: &lt; &gt; &amp; &quot; &apos; &nbsp;
bool Scanner::resolveEntity(string_ref const &buffer, string_ref &decoded) const {
  // Static storage so that `decoded` can keep pointing at the character after we return.
  static char lessThan = '<', greaterThan = '>', ampersand = '&', doubleQuote = '"', singleQuote = '\'', space = ' ';

  if (buffer == "&lt;") {
    decoded = string_ref{&lessThan, 1};
    return true;
  }
  if (buffer == "&gt;") {
    decoded = string_ref{&greaterThan, 1};
    return true;
  }
  if (buffer == "&amp;") {
    decoded = string_ref{&ampersand, 1};
    return true;
  }
  if (buffer == "&quot;") {
    decoded = string_ref{&doubleQuote, 1};
    return true;
  }
  if (buffer == "&apos;") {
    decoded = string_ref{&singleQuote, 1};
    return true;
  }
  if (buffer == "&nbsp;") {
    decoded = string_ref{&space, 1};  // TODO: handle non-breaking spaces better than just converting them to spaces
    return true;
  }
  return false;
}
// Consumes consecutive whitespace characters (as defined by isWhitespace) at the
// current input position.
// Returns the number of characters consumed; 0 if the next character is not whitespace.
size_t Scanner::skipWhitespace() {
  size_t count = 0;
  for (; isWhitespace(input_.peek()); ++count) {
    input_.consume();
  }
  return count;
}
// True for space, tab, line feed, carriage return and form feed; false for everything else.
bool Scanner::isWhitespace(char c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;
    default:
      return false;
  }
}
// Scans the inside of a comment, in two calls:
// 1. Consumes everything up to and including "-->", leaves the text before "-->" in
//    value_ and returns TT_DATA. Returns TT_EOF if the input ends first.
// 2. On the following call (gotTail_ set) reports TT_COMMENT_END, with start_ pointing
//    at the "-->", and hands control back to scanBody.
Scanner::TokenType Scanner::scanComment() {
  constexpr size_t terminatorLen = length("-->");

  if (!gotTail_) {
    start_ = input_.pos();
    value_ = string_ref{input_.pos(), 0};
    do {
      if (input_.consume() == '\0') return TT_EOF;
      ++value_.size;
    } while (!endsWith(value_, "-->"));

    // The terminator is not part of the comment text.
    gotTail_ = true;
    value_.size -= terminatorLen;
    return TT_DATA;
  }

  start_ = input_.pos() - terminatorLen;  // minus "-->"
  scanFun_ = &Scanner::scanBody;
  gotTail_ = false;
  return TT_COMMENT_END;
}
// Scans the inside of a processing instruction, in two calls:
// 1. Consumes everything up to and including "?>", leaves the text before "?>" in
//    value_ and returns TT_DATA. Returns TT_EOF if the input ends first.
// 2. On the following call (gotTail_ set) reports TT_PROCESSING_INSTRUCTION_END, with
//    start_ pointing at the "?>", and hands control back to scanBody.
Scanner::TokenType Scanner::scanProcessingInstruction() {
  constexpr size_t terminatorLen = length("?>");

  if (!gotTail_) {
    start_ = input_.pos();
    value_ = string_ref{input_.pos(), 0};
    do {
      if (input_.consume() == '\0') return TT_EOF;
      ++value_.size;
    } while (!endsWith(value_, "?>"));

    // The terminator is not part of the instruction text.
    gotTail_ = true;
    value_.size -= terminatorLen;
    return TT_DATA;
  }

  start_ = input_.pos() - terminatorLen;
  scanFun_ = &Scanner::scanBody;
  gotTail_ = false;
  return TT_PROCESSING_INSTRUCTION_END;
}
// Scans the content of an element whose body is not parsed (entered from scanAttribute
// for script, style, ...), in two calls:
// 1. Consumes everything up to and including the matching "</tag>", leaves the text
//    before it in value_ and returns TT_DATA. Returns TT_EOF if the input ends first.
//    The tag name is matched case-insensitively and no whitespace is accepted inside
//    the closing tag.
// 2. On the following call (gotTail_ set) reports TT_TAG_END, with start_ pointing at
//    the "</tag>", and hands control back to scanBody.
Scanner::TokenType Scanner::scanSpecial() {
  // Length of "</" + tag name + ">"
  const size_t closerLen = tagName_.size + length("</>");

  if (gotTail_) {
    start_ = input_.pos() - closerLen;
    scanFun_ = &Scanner::scanBody;
    gotTail_ = false;
    return TT_TAG_END;
  }

  start_ = input_.pos();
  value_ = string_ref{input_.pos(), 0};
  for (;;) {
    if (input_.consume() == '\0') return TT_EOF;
    ++value_.size;

    // Test for </tag>
    // TODO: no whitespaces allowed? Is that okay?
    if (value_.size < closerLen) continue;
    const char *closer = value_.data + (value_.size - closerLen);
    if (closer[closerLen - 1] != '>') continue;
    // Test for the "</" bit of "</tag>"
    if (std::memcmp(closer, "</", length("</")) != 0) continue;
    // Test for the "tag" bit of "</tag>". Case insensitive because <I>...</i> is okay.
    if (!equalsCaseInsensitive(closer + length("</"), tagName_.data, tagName_.size)) continue;

    // The closing tag is not part of the element's content.
    gotTail_ = true;
    value_.size -= closerLen;
    break;
  }
  return TT_DATA;
}
} // namespace markup