Program Listing for File html.cpp
↰ Return to documentation for file (src/translator/html.cpp)
#include "html.h"
#include <algorithm>
#include "response.h"
#include "translator/definitions.h"
#include "xh_scanner.h"
namespace {
using marian::bergamot::AnnotatedText;
using marian::bergamot::ByteRange;
using marian::bergamot::HTML;
using marian::bergamot::Response;
void encodeEntities(marian::string_view const &input, std::string &output) {
output.clear();
output.reserve(input.size()); // assumes there are no entities in most cases
for (char it : input) {
switch (it) {
case '&':
output.append("&");
break;
case '<':
output.append("<");
break;
case '>':
output.append(">");
break;
// case ???:
// output.append(" ");
// break;
// case '"':
// output.append(""");
// break;
// case '\'':
// output.append("'");
// break;
default:
output.push_back(it);
break;
}
}
}
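// For illustration: encoding the input `if (a < b && b > c)` yields
// `if (a &amp;lt; b &amp;amp;&amp;amp; b &amp;gt; c)`; quotes and apostrophes are deliberately
// left untouched (see the commented-out cases above).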
size_t countPrefixWhitespaces(marian::string_view const &input) {
size_t size = 0;
while (size < input.size() && std::isspace(static_cast<unsigned char>(input[size]))) ++size;
return size;
}
std::string toLowerCase(std::string_view const &input) {
std::string out;
out.resize(input.size());
std::transform(input.begin(), input.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });
return out;
}
std::string format(std::string const &formatTemplate) { return formatTemplate; }
template <typename Arg>
std::string format(std::string const &formatTemplate, Arg arg) {
std::ostringstream os;
auto index = formatTemplate.find("{}");
assert(index != std::string::npos);
os << formatTemplate.substr(0, index) << arg << formatTemplate.substr(index + 2);
return os.str();
}
template <typename Arg, typename... Args>
std::string format(std::string const &formatTemplate, Arg arg, Args... args) {
std::ostringstream os;
auto index = formatTemplate.find("{}");
assert(index != std::string::npos);
os << formatTemplate.substr(0, index) << arg << format(formatTemplate.substr(index + 2), std::forward<Args>(args)...);
return os.str();
}
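// For illustration: format("<{}{}>", "b", " id=\"x\"") produces `<b id="x">`;
// every "{}" placeholder is substituted left-to-right by the next argument.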
template <typename T>
class Reversed {
public:
using iterator = typename T::const_reverse_iterator;
explicit Reversed(T const &container) : container_(container){};
iterator begin() const { return container_.rbegin(); }
iterator end() const { return container_.rend(); }
private:
T const &container_;
};
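// For illustration: `for (auto *tag : Reversed(closing))` visits the most
// recently opened (innermost) tag first, which is the order in which closing
// tags need to be emitted.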
void diffTags(HTML::TagStack const &prev, HTML::TagStack const &curr, HTML::TagStack &opening,
HTML::TagStack &closing) {
opening.clear();
closing.clear();
size_t i = 0;
// Find first difference
for (; i < prev.size(); ++i)
if (i >= curr.size() || prev[i] != curr[i]) break;
// Only nodes of type ELEMENT can have children and thus would need a closing tag.
// NOLINTNEXTLINE(bugprone-narrowing-conversions)
std::copy_if(prev.begin() + i, prev.end(), std::back_inserter(closing),
[&](HTML::Tag *tag) { return tag->type == HTML::Tag::ELEMENT; });
// NOLINTNEXTLINE(bugprone-narrowing-conversions)
opening.insert(opening.end(), curr.begin() + i, curr.end());
}
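// For illustration: with prev = {<p>, <b>} and curr = {<p>, <i>} the first
// difference is at index 1, so closing = {<b>} (only ELEMENT nodes need a
// closing tag) and opening = {<i>}.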
bool intersects(ByteRange const &range, HTML::Span const &span) {
return range.begin <= span.end && range.end >= span.begin;
};
bool contains(HTML::TagNameSet const &set, std::string_view const &name) { return set.find(name) != set.end(); }
bool contains(HTML::TagStack const &stack, HTML::Tag const *tag) {
return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend();
}
bool extends(HTML::TagStack const &b, HTML::TagStack const &a) {
if (a.size() > b.size()) return false;
for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j)
if (*i != *j) return false;
return true;
}
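// For illustration: extends(b, a) is true iff `a` is a prefix of `b`, so
// extends({<p>, <b>}, {<p>}) holds while extends({<p>}, {<p>, <b>}) does not.
// Tags are compared by pointer, so both stacks must refer to the same Tag
// objects from the pool.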
bool hasAlignments(Response const &response) {
// Test each sentence individually, as a sentence may be empty (or there might
// be no sentences at all), so just testing for alignments.empty() would not be
// sufficient.
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
// If response.alignments is just empty, this might catch it.
if (response.alignments.size() <= sentenceIdx ||
response.alignments[sentenceIdx].size() != response.target.numWords(sentenceIdx))
return false;
// If response.alignments is "empty" because the model did not provide alignments,
// it still has entries for each target word. But all these entries are empty.
for (size_t wordIdx = 0; wordIdx < response.target.numWords(sentenceIdx); ++wordIdx)
if (response.alignments[sentenceIdx][wordIdx].size() != response.source.numWords(sentenceIdx)) return false;
}
return true;
}
class TokenFormatter {
public:
explicit TokenFormatter(marian::string_view token)
: offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
// Do encoding of any entities that popped up in the translation
encodeEntities(token, html_);
}
std::string &&html() { return std::move(html_); }
// Append the markup necessary for moving from `prev` set of tags to `curr`.
void append(HTML::TagStack const &prev, HTML::TagStack const &curr) {
HTML::TagStack opening, closing;
diffTags(prev, curr, opening, closing);
for (HTML::Tag const *tag : Reversed(closing)) {
assert(tag->type == HTML::Tag::ELEMENT);
std::string closeTag = format("</{}>", tag->name);
html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
offset_ += closeTag.size();
if (closeLeft_) whitespaceOffset_ += closeTag.size();
}
for (HTML::Tag const *tag : opening) {
std::string openTag;
switch (tag->type) {
case HTML::Tag::ELEMENT:
case HTML::Tag::VOID_ELEMENT:
openTag = format("<{}{}>{}", tag->name, tag->attributes, tag->data);
break;
case HTML::Tag::COMMENT:
openTag = format("<!--{}-->", tag->data);
break;
case HTML::Tag::PROCESSING_INSTRUCTION:
openTag = format("<?{}?>", tag->data);
break;
case HTML::Tag::WHITESPACE: {
// Try to eat two newlines (paragraph break) from our segment
auto pos = html_.find("\n\n", whitespaceOffset_);
if (pos != std::string::npos && pos < whitespaceOffset_ + whitespaceSize_) {
html_.erase(pos, 2);
whitespaceSize_ -= 2;
}
} break;
}
html_.insert(offset_ + whitespaceSize_, openTag);
offset_ += openTag.size();
closeLeft_ = closeLeft_ && openTag.empty();
}
}
private:
std::string html_; // Output html
size_t offset_; // Size added by prepending HTML
size_t whitespaceOffset_; // position of prefix whitespace characters
// (it moves as closing tags are prepended)
size_t whitespaceSize_; // number of prefix whitespace characters
// Closing tags we prefer to place to the left of (before) the token, while
// opening tags ideally come directly after any prefix whitespace. However,
// some tokens match multiple spans. If a previous span has added an opening
// tag after the whitespace, and the next span closes said tag again, we need
// to close it after the whitespace as well. So after the first opening tag,
// any closing tag should also align right (after the whitespace), not before
// it. Hence this bool.
bool closeLeft_;
};
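// For illustration: given the token " world", calling append({}, {<b>}) on a
// fresh TokenFormatter inserts the opening tag after the prefix whitespace,
// so html() returns " <b>world". All markup ends up in the token's prefix
// (before or after the leading whitespace); the token text itself always
// stays at the end.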
size_t debugCountTokens(AnnotatedText const &text) {
size_t tokens = 1; // for the ending gap
for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {
tokens += 1 + text.numWords(sentenceIdx); // pre-sentence prefix/gap + each word
}
return tokens;
}
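// For illustration: a text with two sentences of 3 and 4 words respectively
// counts 1 + (1 + 3) + (1 + 4) = 10 tokens: each sentence contributes a
// prefix gap plus its words, plus one trailing gap at the very end.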
// Consumes an element that should be ignored in its entirety: it reads
// everything up to the matching closing tag and stores it in tag.data, which
// ends right before the last `</name>`. Assumes TT_TAG_START is already
// consumed, which was necessary to know this tag should be ignored at all.
void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) {
// Only full elements can be consumed this way. With void tags we don't know
// where to stop scanning. All other types cannot be nested anyway.
assert(tag.type == HTML::Tag::ELEMENT);
// TT_TAG_START is already consumed.
markup::Scanner::TokenType token;
size_t inside = 0;
// Consume the full open tag, i.e. all its attributes
while (!inside) {
token = scanner.next();
switch (token) {
case markup::Scanner::TT_ERROR:
ABORT("HTML parse error");
case markup::Scanner::TT_EOF:
ABORT("Did not find closing tag </{}>", name);
case markup::Scanner::TT_ATTRIBUTE:
tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
break;
default:
// Not an attribute! Must be something inside the body or the closing
// tag already. Time to jump to the next loop.
++inside;
break;
}
}
// Last token was something that would have triggered Scanner::scanBody(),
// which sets value() to start pointing at the body.
const char *start = scanner.start();
// Consume the rest of the HTML until (including) the final closing tag. We
// start with the token that caused the previous loop to fall into the default
// case.
while (inside) {
switch (token) {
case markup::Scanner::TT_ERROR:
ABORT("HTML parse error");
case markup::Scanner::TT_EOF:
ABORT("Did not find closing tag </{}>");
case markup::Scanner::TT_TAG_START:
// Note: Looking specifically for only our own type of tag so we don't
// have to care about whether other tags we encounter are void tags or
// not. Does assume the HTML is valid, as no stack is kept.
if (toLowerCase(scanner.tag()) == name) ++inside;
break;
case markup::Scanner::TT_TAG_END:
if (toLowerCase(scanner.tag()) == name) --inside;
break;
default:
break;
}
// Only continue scanning if we're still inside. We could have just read the
// TT_TAG_END token that ended this element, and we don't want to continue
// consuming tokens at that point.
if (inside) token = scanner.next();
}
// Only a TAG_END could have stopped the previous loop. We take the start
// of the final closing tag as the end of our data.
assert(token == markup::Scanner::TT_TAG_END);
const char *end = scanner.start();
// All data between the end of the first open element, and the start of the
// last close element, we just treat as raw data that will be printed when
// this tag is eventually printed.
assert(end >= start);
tag.data = std::string_view(start, end - start);
}
} // namespace
namespace marian::bergamot {
std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
if (tag == nullptr) return out << "[nullptr]";
switch (tag->type) {
case HTML::Tag::ELEMENT:
return out << '<' << tag->name << tag->attributes << '>';
case HTML::Tag::VOID_ELEMENT:
return out << '<' << tag->name << tag->attributes << "/>";
case HTML::Tag::COMMENT:
return out << "<!--" << tag->data << "-->";
case HTML::Tag::PROCESSING_INSTRUCTION:
return out << "<?" << tag->data << "?>";
case HTML::Tag::WHITESPACE:
return out << "[inserted space]";
}
return out << "[Unknown tag type]";
}
std::ostream &operator<<(std::ostream &out, HTML::TagStack const &tags) {
for (auto it = tags.begin(); it != tags.end(); ++it) {
if (it != tags.begin()) out << ' ';
out << *it;
}
return out;
}
HTML::HTML(std::string &&source, bool processMarkup, Options &&options) : options_(std::move(options)) {
if (!processMarkup) return;
std::string original = std::move(source);
markup::instream in(original.data(), original.data() + original.size());
markup::Scanner scanner(in);
source.clear(); // source is moved out of, so should be clear anyway
Tag *tag = nullptr; // current tag (after opening at least)
TagStack stack; // stack of currently open tags
bool addSentenceBreak = false; // whether to add a sentence break next text segment
bool addWordBreak = false; // whether to add a word break next text segment
// Starting point: an empty span with no open tags.
spans_.push_back(Span{0, 0, {}});
bool stop = false;
while (!stop) {
switch (scanner.next()) {
case markup::Scanner::TT_ERROR:
ABORT("HTML parse error");
case markup::Scanner::TT_EOF:
stop = true;
break;
case markup::Scanner::TT_TEXT: {
// If the previous segment was the open or close tag of a block element
// we treat the text after it as a new sentence.
if (addSentenceBreak) {
// If there isn't already a \n\n at the end of source...
if (source.size() >= 2 && source.substr(source.size() - 2) != "\n\n") {
stack.push_back(makeTag({Tag::WHITESPACE}));
// Important: span->size() == 0 to make it behave as a void element.
// Also important: position before the \n\n tokens, not after, to
// make it easier to remove them later through apply().
spans_.push_back(Span{source.size(), source.size(), stack});
source.append("\n\n"); // Should work with ssplit-mode = wrapped_text
stack.pop_back();
}
addSentenceBreak = false;
}
// If the previous segment was an open or close tag, it might be best
// to add a space to make sure we don't append to the previous word.
if (addWordBreak) {
// Only add the space when it would be inside a word. Do not add it if
// it would be between a word and punctuation.
if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) {
source.push_back(' ');
}
addWordBreak = false;
}
// Store which tags were open when this span of text was encountered.
auto begin = source.size();
source.append(scanner.value());
spans_.push_back(Span{begin, source.size(), stack});
} break;
case markup::Scanner::TT_TAG_START: {
std::string name = toLowerCase(scanner.tag());
// Tag *tag is used by attribute parsing
auto type = contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT;
tag = makeTag({type, std::string(scanner.tag())});
stack.push_back(tag);
// Empty elements (e.g. <img>) are not applicable to a span of text
// so instead we "apply" them to an empty span in between, and then
// immediately remove them again from the stack.
if (tag->type == Tag::VOID_ELEMENT) {
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
}
// Ignored tags have same semantics as void tags with regards to moving
// them around with the rest of the content.
if (contains(options_.ignoredTags, name)) {
consumeIgnoredTag(scanner, *tag, name);
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
}
// Treat non-inline HTML tags as spaces that break up words.
if (!contains(options_.inlineTags, name)) {
addSentenceBreak = true;
} else if (!contains(options_.inWordTags, name)) {
addWordBreak = true;
}
} break;
case markup::Scanner::TT_TAG_END: {
std::string tagName = toLowerCase(scanner.tag());
// If this is the closing bit of a void tag, i.e. triggered by the "/>"
// bit of "<img/>", then completely ignore it.
if (contains(options_.voidTags, tagName)) break;
ABORT_IF(stack.empty(), "Encountered more closing tags ({}) than opening tags", scanner.tag());
ABORT_IF(toLowerCase(stack.back()->name) != toLowerCase(scanner.tag()),
"Encountered unexpected closing tag </{}>, stack is {}", scanner.tag(), stack);
// What to do with "<u></u>" case, where tag is immediately closed
// so it never makes it into the taint of any of the spans? This adds
// an empty span so it still gets recorded in spans_.
if (spans_.empty() || !contains(spans_.back().tags, stack.back()))
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
// Add space if necessary
if (!contains(options_.inlineTags, tagName)) {
addSentenceBreak = true;
} else if (!contains(options_.inWordTags, tagName)) {
addWordBreak = true;
}
} break;
case markup::Scanner::TT_ATTRIBUTE:
assert(tag != nullptr);
tag->attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
break;
case markup::Scanner::TT_COMMENT_START:
// Tag *tag is used when TT_DATA is seen to add the comment's content.
tag = makeTag({Tag::COMMENT});
stack.push_back(tag);
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
break;
case markup::Scanner::TT_PROCESSING_INSTRUCTION_START:
// Tag *tag is used when TT_DATA is seen to add the PI's content.
tag = makeTag({Tag::PROCESSING_INSTRUCTION});
stack.push_back(tag);
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
break;
case markup::Scanner::TT_COMMENT_END:
case markup::Scanner::TT_PROCESSING_INSTRUCTION_END:
tag = nullptr;
break;
case markup::Scanner::TT_DATA:
assert(tag != nullptr);
tag->data = scanner.value();
break;
default:
ABORT("Unsupported scanner token type");
}
}
ABORT_IF(!stack.empty(), "Not all tags were closed: {}", stack);
// Add a trailing span (that's empty) to signify all closed tags.
spans_.emplace_back(Span{source.size(), source.size(), stack});
}
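// For illustration, assuming <b> is configured as an inline tag, <p> is not,
// and no space is substituted before "you": parsing "<p>hi <b>you</b></p>"
// leaves source as "hi you" and records spans_ roughly as
// [{0,0,{}}, {0,3,{<p>}}, {3,6,{<p>,<b>}}, {6,6,{}}]: the empty starting span,
// one span per stretch of text with the tags open at that point, and the
// empty trailing span.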
void HTML::restore(Response &response) {
// No-op if process_markup was false (and thus spans_ is empty)
// TODO: replace this with optional<HTML> at a higher level
if (spans_.empty()) return;
// We need alignment info to transfer the HTML tags from the input to the
// translation. If those are not available, no HTML in translations for you.
ABORT_UNLESS(hasAlignments(response),
"Response object does not contain alignments. TranslationModel or ResponseOptions is misconfigured?");
// Reconstruction of HTML tags:
// 1. Map each token to a Span
// 2. Reconstruct the source HTML with these tainted tokens
// 3. Transfer the spans from the source tokens to the target tokens using alignment information
// 4. For spans that represent empty elements (e.g. <img>) figure out their position
// 5. Reconstruct the target HTML with these tainted tokens
// sourceTokenSpans is a vector with a pointer to a span for each token. We
// use iterators here to point to these positions so we can easily compare if
// one span comes before or after another, information we'll need when we need
// to figure out whether we've skipped spans (of empty elements) when
// reconstructing HTML in response.target.
std::vector<SpanIterator> sourceTokenSpans;
// RestoreSource re-inserts HTML into the source text, but also identifies
// which span each source token fits into best.
AnnotatedText source = restoreSource(response.source, sourceTokenSpans);
assert(sourceTokenSpans.size() == debugCountTokens(response.source));
// Find for every token in target the token in source that best matches.
std::vector<std::vector<size_t>> alignments;
hardAlignments(response, alignments, sourceTokenSpans);
std::vector<SpanIterator> targetTokenSpans;
copyTagStack(response, alignments, sourceTokenSpans, targetTokenSpans);
assert(targetTokenSpans.size() == debugCountTokens(response.target));
// Take the spans, and use them to make a taint for every word in the
// translation. Optionally add extra tags, like quality score metadata.
std::vector<HTML::TagStack> targetTokenTags;
annotateTagStack(response, targetTokenSpans, targetTokenTags);
AnnotatedText target = restoreTarget(response.target, targetTokenSpans, targetTokenTags);
response.source = source;
response.target = target;
}
AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans) {
auto spanIt = spans_.begin();
auto prevIt = spans_.begin(); // safe because the first span is always the empty span,
// and the while-loop below will do the rest
assert(prevIt == spans_.end() || prevIt->tags.empty());
return in.apply([&](ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// Potential issue: spans and tokens can intersect, e.g.
//
// text <p> h <u> e </u> ll o </p>
// spans |1| |2| |3333| (so only 2 is tainted with <p><u>, others only <p>)
// tokens |111111111111111|2|
//
// Now 1 covers span 1 to 3, so what taint should it get? Just `<p>`, or
// `<p><u>`?
// Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
// just insert spaces around all elements, every segment of `hello` will be
// a token.
// Seek to the last span that overlaps with this token
while (true) {
formatter.append(prevIt->tags, spanIt->tags);
prevIt = spanIt;
if (spanIt + 1 != spans_.end() && ((spanIt + 1)->begin < range.end || last)) {
spanIt++;
continue;
}
break;
}
// TODO: This is just the taint of the last span, not the ones in between.
// This makes us lose some markup of parts of tokens as described above.
sourceTokenSpans.emplace_back(prevIt);
return std::move(formatter.html());
});
}
AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans,
std::vector<TagStack> const &targetTokenTags) {
auto prevTags = spans_.cbegin()->tags;
auto stragglerSpanIt = spans_.cbegin();
auto targetSpanIt = targetTokenSpans.begin();
auto targetTagIt = targetTokenTags.begin();
AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// First we scan through spans_ to catch up to the span assigned to this
// token. We're only interested in empty spans (empty and void elements)
for (; stragglerSpanIt < *targetSpanIt; stragglerSpanIt++) {
// We're only interested in empty spans or spans that would otherwise get
// lost because they didn't align with anything between the spans in
// targetSpanIt
// TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
if (stragglerSpanIt->size() != 0 &&
std::find(targetTokenSpans.begin(), targetTokenSpans.end(), stragglerSpanIt) != targetTokenSpans.end())
continue;
formatter.append(prevTags, stragglerSpanIt->tags);
prevTags = stragglerSpanIt->tags;
}
// Now do the same thing but for our target set of tags. Note that we cannot
// combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
// because there is no guarantee that the order in `targetTokenSpans` is
// the same as that of `spans`.
formatter.append(prevTags, *targetTagIt);
// If this is the last token of the response, close all open tags.
if (last) {
// Note: this assert is true due to our current implementation of
// HardAlignments() that always matches the last token of the input with
// the last token of the output. But lets assume someone someday changes
// HardAlignments(), and then this for-loop will be necessary.
// assert((*targetSpanIt)->tags.empty());
formatter.append(*targetTagIt, HTML::TagStack());
}
prevTags = *targetTagIt;
++targetSpanIt;
++targetTagIt;
return std::move(formatter.html());
});
// Assert that we did in fact use all our taints
assert(targetSpanIt == targetTokenSpans.end());
return out;
}
HTML::Tag *HTML::makeTag(Tag &&tag) {
pool_.emplace_front(std::move(tag));
return &pool_.front();
}
void HTML::copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
std::vector<SpanIterator> const &sourceTokenSpans,
std::vector<SpanIterator> &targetTokenSpans) {
size_t offset = 0; // Sentence offset in sourceTokenSpans
// Fill targetTokenSpans based on the alignments we just made up.
// NOTE: this should match the exact order of Apply()
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
targetTokenSpans.push_back(sourceTokenSpans[offset]); // token_tag for sentence ending gap
for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
size_t s = alignments[sentenceIdx][t];
assert(s < response.source.numWords(sentenceIdx));
targetTokenSpans.push_back(sourceTokenSpans[offset + 1 + s]); // +1 for prefix gap
}
offset += response.source.numWords(sentenceIdx) + 1; // +1 for prefix gap
}
assert(offset + 1 == sourceTokenSpans.size());
targetTokenSpans.push_back(sourceTokenSpans[offset]); // token_tag for ending whitespace
}
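// For illustration: for a single sentence with 3 source words,
// sourceTokenSpans is laid out as [prefix gap, word 0, word 1, word 2,
// trailing gap], so a target word aligned to source word s picks up
// sourceTokenSpans[offset + 1 + s].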
void HTML::annotateTagStack(Response const &response, std::vector<SpanIterator> const &targetTokenSpans,
std::vector<HTML::TagStack> &targetTokenTags) {
auto spanIt = targetTokenSpans.begin();
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
// Sentence prefix
targetTokenTags.push_back((*spanIt)->tags);
spanIt++;
// Offset in targetTokenTags at which this sentence's tags start.
size_t tagOffset = targetTokenTags.size();
// Initially, just copy the span's tags to this token
for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
targetTokenTags.emplace_back((*spanIt)->tags);
spanIt++;
}
// If we have quality score information, add that as metadata as well.
if (!response.qualityScores.empty()) {
auto const &sentenceQuality = response.qualityScores[sentenceIdx];
// Create a single <font> tag for this sentence with sentence level info
Tag *sentenceTag = makeTag({Tag::ELEMENT, "font"});
sentenceTag->attributes += format(" x-bergamot-sentence-index=\"{}\" x-bergamot-sentence-score=\"{}\"",
sentenceIdx, sentenceQuality.sentenceScore);
// Add that tag to all tokens in this sentence.
for (size_t tokenIdx = 0; tokenIdx < response.target.numWords(sentenceIdx); ++tokenIdx) {
targetTokenTags[tagOffset + tokenIdx].push_back(sentenceTag);
}
// Add word level <font> tags as well to all tokens that make up a word.
for (size_t wordIdx = 0; wordIdx < sentenceQuality.wordRanges.size(); ++wordIdx) {
Tag *wordTag = makeTag({Tag::ELEMENT, "font"});
wordTag->attributes += format(" x-bergamot-word-index=\"{}\" x-bergamot-word-score=\"{}\"", wordIdx,
sentenceQuality.wordScores[wordIdx]);
auto const &range = sentenceQuality.wordRanges[wordIdx];
for (size_t tokenIdx = range.begin; tokenIdx < range.end; ++tokenIdx) {
targetTokenTags[tagOffset + tokenIdx].push_back(wordTag);
}
}
}
}
// Suffix
targetTokenTags.push_back((*spanIt)->tags);
spanIt++;
assert(spanIt == targetTokenSpans.end());
}
// Reports if token `str` is likely to be a continuation of a word. This is used
// to determine whether we should share the markup, or whether we should see
// this token as a fresh start. This implementation will treat "hello[world]"
// as 4 words, assuming it's tokenised as something like `h ell o [ wor ld ]`.
bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
if (options_.continuationDelimiters.empty()) return false;
if (prev.empty() || str.empty()) return false;
return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
options_.continuationDelimiters.find(prev.back()) == std::string::npos;
}
bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size()));
}
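// For illustration, assuming whitespace and punctuation are among the
// configured continuation delimiters: isContinuation("hel", "lo") is true
// (neither the last character of `prev` nor the first character of `str` is a
// delimiter), while isContinuation("hello", " world") is false.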
void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
std::vector<SpanIterator> const &sourceTokenSpans) {
size_t offset = 0; // sentence offset in sourceTokenSpans
// For each sentence...
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
alignments.emplace_back();
// Hard-align: find for each target token the most prevalent source token
// Note: only search from 0 to N-1 because token N is end-of-sentence token
// that can only align with the end-of-sentence token of the target
for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
alignments.back().push_back(
std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
response.alignments[sentenceIdx][t].begin());
}
// Next, we try to smooth out these selected alignments with a few heuristics
for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
// If this token is a continuation of a previous token, pick the tags from the most
// prevalent token for the whole word.
if (isContinuation(response.target.word(sentenceIdx, t - 1), response.target.word(sentenceIdx, t))) {
// Note: only looking at the previous token since that will already
// have this treatment applied to it.
size_t currSentenceIdx = alignments.back()[t];
size_t prevSentenceIdx = alignments.back()[t - 1];
float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
// If this token has more markup, or a better score than the previous
// token (and they together are part of a word-ish thing) then mark
// this word as aligning. Otherwise just copy the alignment source of
// the previous token.
if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
// Apply this to all previous tokens in the word
for (size_t i = t;; --i) {
alignments.back()[i] = currSentenceIdx;
// Stop if this was the first token or the beginning of the word
if (i == 0 ||
!isContinuation(response.target.word(sentenceIdx, i - 1), response.target.word(sentenceIdx, i)))
break;
}
} else {
alignments.back()[t] = prevSentenceIdx;
}
}
}
// Always align target end with source end
alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
offset += response.source.numWords(sentenceIdx) + 1; // +1 for prefix gap
}
}
} // namespace marian::bergamot