&sourceTokenSpans) {
auto spanIt = spans_.begin();
auto prevIt = spans_.begin(); // safe because first span is always empty span, and
// and the while-loop below will do the rest
assert(prevIt == spans_.end() || prevIt->tags.empty());
return in.apply([&](ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// Potential issue: spans and tokens can intersect, e.g.
//
//    text  <p> h <u> e </u> ll o </p>
//   spans     |1|   |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
//  tokens     |111111111111111|2|
//
// Now token 1 covers spans 1 to 3, so what taint should it get? Just `<p>`, or
// `<p><u>`?
// Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
// just insert spaces around all elements, every segment of `hello` will be
// a token.
// Seek to the last span that overlaps with this token
while (true) {
formatter.append(prevIt->tags, spanIt->tags);
prevIt = spanIt;
if (spanIt + 1 != spans_.end() && ((spanIt + 1)->begin < range.end || last)) {
spanIt++;
continue;
}
break;
}
// TODO: This is just the taint of the last span, not the ones in between.
// This makes us lose some markup of parts of tokens as described above.
sourceTokenSpans.emplace_back(prevIt);
return std::move(formatter.html());
});
}
// Rebuilds the text of `in` token by token, wrapping every token in markup.
//
// `targetTokenSpans` and `targetTokenTags` are consumed in lockstep, one entry
// of each per invocation of the `in.apply()` callback:
//  - the span entry says how far to advance through `spans_` while looking for
//    spans whose tags would otherwise never be emitted;
//  - the tag entry is the tag stack applied to the token itself.
// Returns whatever `in.apply()` builds from the per-token HTML strings.
AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector const &targetTokenSpans,
                                  std::vector const &targetTokenTags) {
  auto prevTags = spans_.cbegin()->tags;
  auto stragglerSpanIt = spans_.cbegin();
  auto targetSpanIt = targetTokenSpans.begin();
  auto targetTagIt = targetTokenTags.begin();

  // Sorted copy of the spans that are assigned to some token. Sorting once up
  // front lets the straggler loop below answer "is this span assigned to any
  // token?" with a binary search instead of a linear scan per span.
  auto assignedSpans = targetTokenSpans;
  std::sort(assignedSpans.begin(), assignedSpans.end());

  AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
    // Both iterators are dereferenced below; running out of entries here would
    // mean the callback is invoked more often than there are entries.
    assert(targetSpanIt != targetTokenSpans.end());
    assert(targetTagIt != targetTokenTags.end());

    TokenFormatter formatter(token);

    // First we scan through spans_ to catch up to the span assigned to this
    // token. Along the way we only emit tags for spans that are empty (empty
    // and void elements), or that would otherwise get lost because no token
    // was assigned to them. Non-empty spans that are assigned to a token are
    // skipped here.
    for (; stragglerSpanIt < *targetSpanIt; ++stragglerSpanIt) {
      if (stragglerSpanIt->size() != 0 &&
          std::binary_search(assignedSpans.begin(), assignedSpans.end(), stragglerSpanIt))
        continue;

      formatter.append(prevTags, stragglerSpanIt->tags);
      prevTags = stragglerSpanIt->tags;
    }

    // Now do the same thing but for our target set of tags. Note that we cannot
    // combine this with the for-loop above (i.e. `stragglerSpanIt <= *targetSpanIt`)
    // because there is no guarantee that the order in `targetTokenSpans` is
    // the same as that of `spans_`.
    formatter.append(prevTags, *targetTagIt);

    // If this is the last token of the response, close all open tags by
    // transitioning to an empty tag stack.
    if (last) {
      // Note: with the current implementation of hardAlignments() the last
      // token of the output is always matched with the last token of the
      // input, so the assert below would hold. It stays disabled, and the
      // explicit close stays in place, so that a future change to
      // hardAlignments() cannot leave tags open.
      // assert((*targetSpanIt)->tags.empty());
      formatter.append(*targetTagIt, HTML::TagStack());
    }

    prevTags = *targetTagIt;
    ++targetSpanIt;
    ++targetTagIt;

    return std::move(formatter.html());
  });

  // Assert that we did in fact use all our taints
  assert(targetSpanIt == targetTokenSpans.end());

  return out;
}
// Moves `tag` to the front of `pool_` and returns the address of the element
// that now lives there.
HTML::Tag *HTML::makeTag(Tag &&tag) {
  pool_.emplace_front(std::move(tag));
  Tag &stored = pool_.front();
  return &stored;
}
// Appends to `targetTokenSpans` one entry of `sourceTokenSpans` per target
// token, chosen through `alignments`.
//
// `sourceTokenSpans` is read as: for every sentence one entry for the gap in
// front of it followed by one entry per source word, and a single trailing
// entry after the last sentence. The output follows the same layout for the
// target side, and should match the exact order of Apply().
void HTML::copyTagStack(Response const &response, std::vector> const &alignments,
                        std::vector const &sourceTokenSpans,
                        std::vector &targetTokenSpans) {
  // Index in sourceTokenSpans of the gap entry that precedes the current sentence.
  size_t sentenceStart = 0;

  for (size_t sentence = 0; sentence < response.target.numSentences(); ++sentence) {
    size_t const sourceWords = response.source.numWords(sentence);
    size_t const targetWords = response.target.numWords(sentence);

    // The gap before the sentence keeps the span of the source-side gap.
    targetTokenSpans.push_back(sourceTokenSpans[sentenceStart]);

    // Each target word takes the span of the source word it is aligned with.
    for (size_t word = 0; word < targetWords; ++word) {
      size_t const alignedSourceWord = alignments[sentence][word];
      assert(alignedSourceWord < sourceWords);
      // +1 skips the gap entry at the start of this sentence.
      targetTokenSpans.push_back(sourceTokenSpans[sentenceStart + 1 + alignedSourceWord]);
    }

    // Step over this sentence's words plus its leading gap entry.
    sentenceStart += sourceWords + 1;
  }

  // Exactly one entry must be left over: the one for the ending whitespace.
  assert(sentenceStart + 1 == sourceTokenSpans.size());
  targetTokenSpans.push_back(sourceTokenSpans[sentenceStart]);
}
// Fills `targetTokenTags` with one tag stack per entry of `targetTokenSpans`,
// starting from a copy of the tags of the span each entry points at.
//
// When `response.qualityScores` is not empty, every word token of a sentence
// additionally gets a sentence-level `font` tag, and every token inside one of
// the sentence's word ranges gets a word-level `font` tag, each carrying the
// index and score as attributes.
void HTML::annotateTagStack(Response const &response, std::vector const &targetTokenSpans,
                            std::vector &targetTokenTags) {
  bool const hasQualityScores = !response.qualityScores.empty();
  auto span = targetTokenSpans.begin();

  for (size_t sentence = 0; sentence < response.target.numSentences(); ++sentence) {
    size_t const wordCount = response.target.numWords(sentence);

    // Entry for the gap in front of the sentence.
    targetTokenTags.push_back((*span)->tags);
    ++span;

    // Position in targetTokenTags of this sentence's first word token.
    size_t const firstWord = targetTokenTags.size();

    // Start every word token off with a copy of its span's tags.
    for (size_t word = 0; word < wordCount; ++word, ++span) {
      targetTokenTags.emplace_back((*span)->tags);
    }

    // Without quality scores there is no metadata to add.
    if (!hasQualityScores) continue;

    auto const &quality = response.qualityScores[sentence];

    // One shared tag with sentence-level info, added to all tokens of the sentence.
    Tag *sentenceTag = makeTag({Tag::ELEMENT, "font"});
    sentenceTag->attributes += format(" x-bergamot-sentence-index=\"{}\" x-bergamot-sentence-score=\"{}\"",
                                      sentence, quality.sentenceScore);
    for (size_t word = 0; word < wordCount; ++word) {
      targetTokenTags[firstWord + word].push_back(sentenceTag);
    }

    // One tag per scored word, added to all tokens that make up that word.
    for (size_t scoredWord = 0; scoredWord < quality.wordRanges.size(); ++scoredWord) {
      Tag *wordTag = makeTag({Tag::ELEMENT, "font"});
      wordTag->attributes += format(" x-bergamot-word-index=\"{}\" x-bergamot-word-score=\"{}\"", scoredWord,
                                    quality.wordScores[scoredWord]);

      auto const &tokens = quality.wordRanges[scoredWord];
      for (size_t token = tokens.begin; token < tokens.end; ++token) {
        targetTokenTags[firstWord + token].push_back(wordTag);
      }
    }
  }

  // Entry for whatever follows the last sentence.
  targetTokenTags.push_back((*span)->tags);
  ++span;

  assert(span == targetTokenSpans.end());
}
// Tells whether token `str` likely continues the word that token `prev` is
// part of. Callers use this to decide whether two adjacent tokens should share
// markup or whether `str` is a fresh start.
//
// `str` counts as a continuation only when neither its first character nor the
// last character of `prev` is one of `options_.continuationDelimiters`. An
// empty delimiter set, or an empty token on either side, never counts as a
// continuation. For example, with `[` and `]` among the delimiters,
// "hello[world]" tokenised as `h ell o [ wor ld ]` is treated as 4 words.
bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
  auto const &delimiters = options_.continuationDelimiters;

  if (delimiters.empty() || prev.empty() || str.empty()) return false;

  // A token that opens with a delimiter always starts something new.
  if (delimiters.find(str[0]) != std::string::npos) return false;

  // Otherwise it continues the word unless the previous token ended on a delimiter.
  return delimiters.find(prev.back()) == std::string::npos;
}
// Overload for marian::string_view: re-wraps both arguments as std::string_view
// over the same characters and defers to the std::string_view overload.
bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
  std::string_view const prevView(prev.data(), prev.size());
  std::string_view const strView(str.data(), str.size());
  return isContinuation(prevView, strView);
}
// Appends to `alignments` one list per target sentence that maps every target
// token to a single source token index.
//
// Per sentence this happens in three steps:
//  1. every target token except the last one is mapped to the source token
//     with the highest score in `response.alignments`;
//  2. tokens that isContinuation() reports as continuing the previous token
//     are smoothed so that the tokens of one word point at the same source token;
//  3. the last target token is always mapped to the last source token.
//
// `sourceTokenSpans` is indexed as `offset + 1 + sourceToken`, where the +1
// skips the gap entry in front of each sentence.
void HTML::hardAlignments(Response const &response, std::vector> &alignments,
                          std::vector const &sourceTokenSpans) {
  // Index in sourceTokenSpans of the gap entry that precedes the current sentence.
  size_t spanOffset = 0;

  for (size_t sentence = 0; sentence < response.target.numSentences(); ++sentence) {
    alignments.emplace_back();
    auto &hard = alignments.back();

    size_t const sourceWords = response.source.numWords(sentence);
    size_t const targetWords = response.target.numWords(sentence);

    // Score row of target token `t`; looked up lazily so nothing is touched
    // for sentences that have no tokens to align.
    auto scoresOf = [&](size_t t) -> auto const & { return response.alignments[sentence][t]; };

    // Whether target token `t` continues the word of target token `t - 1`.
    auto continuesPrevious = [&](size_t t) {
      return isContinuation(response.target.word(sentence, t - 1), response.target.word(sentence, t));
    };

    // Hard-align: pick for each target token the highest scoring source token.
    // The final target token is left out here; it is the end-of-sentence token
    // and gets aligned with the source end below.
    for (size_t t = 0; t + 1 < targetWords; ++t) {
      auto const &scores = scoresOf(t);
      hard.push_back(std::max_element(scores.begin(), scores.end()) - scores.begin());
    }

    // Smooth the picks so that tokens belonging to one word agree.
    for (size_t t = 1; t + 1 < targetWords; ++t) {
      if (!continuesPrevious(t)) continue;

      // Only the previous token needs to be looked at, since it already went
      // through this same treatment.
      size_t const currSource = hard[t];
      size_t const prevSource = hard[t - 1];
      float currScore = scoresOf(t)[currSource];
      float prevScore = scoresOf(t - 1)[prevSource];
      TagStack const &currTagStack = sourceTokenSpans[spanOffset + 1 + currSource]->tags;
      TagStack const &prevTagStack = sourceTokenSpans[spanOffset + 1 + prevSource]->tags;

      // The current token wins when extends() holds for its tag stack against
      // the previous one, or when its score is at least as good.
      bool const currentWins = extends(currTagStack, prevTagStack) || currScore >= prevScore;

      if (!currentWins) {
        // Otherwise just follow the previous token.
        hard[t] = prevSource;
        continue;
      }

      // Apply the current pick to this token and walk back over the earlier
      // tokens of the same word, stopping at the first token of the sentence
      // or at the beginning of the word.
      size_t i = t;
      hard[i] = currSource;
      while (i > 0 && continuesPrevious(i)) {
        --i;
        hard[i] = currSource;
      }
    }

    // Always align target end with source end.
    hard.push_back(sourceWords - 1);

    // Step over this sentence's words plus its leading gap entry.
    spanOffset += sourceWords + 1;
  }
}
} // namespace marian::bergamot