Program Listing for File annotation.cpp

Return to documentation for file (src/translator/annotation.cpp)

#include "annotation.h"

#include <cassert>

namespace marian {
namespace bergamot {

// Takes ownership of `t` as the annotated text.  No sentence has been recorded
// yet, so the last boundary is moved to the end of the text: the whole string
// is one gap until recordExistingSentence starts carving sentences out of it.
// NOTE(review): relies on annotation.token_begin_ being non-empty after default
// construction -- presumably seeded by Annotation's constructor; confirm in
// annotation.h.
AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) {
  auto &trailing_boundary = annotation.token_begin_.back();
  trailing_boundary = text.size();
}

// Appends a new sentence to the end of `text` and records its token boundaries.
//
//   prefix      text that goes between the previous sentence and this one; it
//               is appended first and extends the current trailing gap.
//   [begin,end) tokens of the sentence.  They must be contiguous in memory,
//               because the sentence bytes are copied as the single range from
//               the start of the first token to the end of the last one.
//
// After the call, an empty gap follows the sentence; a later
// appendEndingWhitespace or appendSentence call extends it.
void AnnotatedText::appendSentence(string_view prefix, std::vector<string_view>::iterator begin,
                                   std::vector<string_view>::iterator end) {
  // The trailing boundary must already sit at the end of the text.
  assert(annotation.token_begin_.back() == text.size());

  // The prefix simply extends whatever gap ended the previous sentence.
  appendEndingWhitespace(prefix);

  // Record the end offset of every token; each token starts where the
  // previous boundary left off.
  std::size_t cursor = text.size();
  for (auto token = begin; token != end; ++token) {
    cursor += token->size();
    annotation.token_begin_.push_back(cursor);
  }

  // Copy the sentence bytes in one go (nothing to copy for an empty sentence).
  if (begin != end) {
    const auto last = end - 1;
    const char *const first_byte = begin->data();
    const char *const past_last_byte = last->data() + last->size();
    text.append(first_byte, past_last_byte);
    assert(cursor == text.size());  // Holds only if the tokens were contiguous.
  }

  // The boundary just recorded is where the trailing gap begins; remember its
  // index, then close the (currently empty) gap with a second boundary at the
  // same offset.
  const auto gap_index = annotation.token_begin_.size() - 1;
  annotation.gap_.push_back(gap_index);
  annotation.token_begin_.push_back(cursor);
}

// Appends `whitespace` to the end of `text` and moves the last boundary in
// annotation.token_begin_ to the new end of the text, so the trailing gap
// grows to cover the appended bytes.
void AnnotatedText::appendEndingWhitespace(string_view whitespace) {
  // token_begin_ is not resized below, so this reference stays valid.
  auto &trailing_boundary = annotation.token_begin_.back();
  text.append(whitespace.data(), whitespace.size());
  trailing_boundary = text.size();
}

// Records a sentence whose bytes are already part of `text`; the text itself is
// not modified, only the annotation.
//
//   [begin,end)     tokens of the sentence; each must point into `text` and
//                   they must be contiguous.
//   sentence_begin  position of the sentence inside `text`.  It must equal the
//                   first token's data() when the sentence is non-empty, and
//                   it is what locates an empty sentence.
//
// The preconditions are enforced with assert only, so they are unchecked when
// assertions are compiled out.
void AnnotatedText::recordExistingSentence(std::vector<string_view>::iterator begin,
                                           std::vector<string_view>::iterator end, const char *sentence_begin) {
  const char *const base = text.data();

  assert(sentence_begin >= base);
  assert(sentence_begin <= base + text.size());
  assert(begin == end || sentence_begin == begin->data());
  assert(!annotation.token_begin_.empty());
  assert(annotation.token_begin_.back() == text.size());

  // Temporarily drop the end-of-text boundary; it is restored at the bottom.
  annotation.token_begin_.pop_back();

  // One boundary per token: the byte offset at which the token starts.
  for (auto token = begin; token != end; ++token) {
    assert(token->data() >= base);                                                     // Starts inside text.
    assert(token->data() + token->size() <= base + text.size());                       // Ends inside text.
    assert(token + 1 == end || token->data() + token->size() == (token + 1)->data());  // Contiguous.
    annotation.token_begin_.push_back(token->data() - base);
  }

  // The next boundary pushed is the start of the gap following the sentence;
  // note its index before pushing it.  The gap starts right after the last
  // token, or at sentence_begin when the sentence has no tokens.
  annotation.gap_.push_back(annotation.token_begin_.size());
  const char *const gap_start = (begin == end) ? sentence_begin : (end - 1)->data() + (end - 1)->size();
  annotation.token_begin_.push_back(gap_start - base);

  // Restore the end-of-text boundary.
  annotation.token_begin_.push_back(text.size());
}

}  // namespace bergamot
}  // namespace marian