Program Listing for File text_processor.cpp

Return to documentation for file (src/translator/text_processor.cpp)

#include "text_processor.h"

#include <vector>

#include "annotation.h"
#include "common/cli_helper.h"
#include "common/options.h"
#include "data/types.h"
#include "definitions.h"

namespace marian {
namespace bergamot {

namespace {
ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m) {
  typedef ug::ssplit::SentenceStream::splitmode splitmode;
  if (m == "sentence") {
    return splitmode::one_sentence_per_line;
  } else if (m == "paragraph") {
    return splitmode::one_paragraph_per_line;
  } else if (m == "wrapped_text") {
    return splitmode::wrapped_text;
  } else {
    ABORT("Unknown ssplitmode {}, Please choose one of {sentence,paragraph,wrapped_text}");
  }
}

ug::ssplit::SentenceSplitter loadSplitter(const std::string &ssplitPrefixFile) {
  // Temporarily supports empty, will be removed when mozilla passes ssplitPrefixFile
  ug::ssplit::SentenceSplitter splitter;
  if (ssplitPrefixFile.size()) {
    std::string interpSsplitPrefixFile = marian::cli::interpolateEnvVars(ssplitPrefixFile);
    LOG(info, "Loading protected prefixes for sentence splitting from {}", interpSsplitPrefixFile);
    splitter.load(interpSsplitPrefixFile);
  } else {
    LOG(warn,
        "Missing list of protected prefixes for sentence splitting. "
        "Set with --ssplit-prefix-file.");
  }
  return splitter;
}

ug::ssplit::SentenceSplitter loadSplitter(const AlignedMemory &memory) {
  // Temporarily supports empty, will be removed when mozilla passes memory
  ug::ssplit::SentenceSplitter splitter;
  if (memory.size()) {
    std::string_view serialized(memory.begin(), memory.size());
    splitter.loadFromSerialized(serialized);
  }
  return splitter;
}

}  // namespace

Segment TextProcessor::tokenize(const string_view &segment, std::vector<string_view> &wordRanges) const {
  // vocabs_->sources().front() is invoked as we currently only support one source vocab
  return vocabs_.sources().front()->encodeWithByteRanges(segment, wordRanges, /*addEOS=*/false, /*inference=*/true);
}

TextProcessor::TextProcessor(Ptr<Options> options, const Vocabs &vocabs, const std::string &ssplit_prefix_file)
    : vocabs_(vocabs), ssplit_(loadSplitter(ssplit_prefix_file)) {
  parseCommonOptions(options);
}

TextProcessor::TextProcessor(Ptr<Options> options, const Vocabs &vocabs, const AlignedMemory &memory)
    : vocabs_(vocabs) {
  // This is not the best of the solutions at the moment, but is consistent with what happens among other structures
  // like model, vocabulary or shortlist. First, we check if the bytearray is empty. If not, we load from ByteArray. In
  // case empty, the string based loader which reads from file is called. However, ssplit allows for not supplying
  // ssplit-prefix-file where-in the purely regular expression based splitter is activated.
  //
  // For now, we allow not supplying an ssplit-prefix-file.

  if (memory.begin() == nullptr && memory.size()) {
    ssplit_ = loadSplitter(memory);
  } else {
    ssplit_ = loadSplitter(options->get<std::string>("ssplit-prefix-file", ""));
  }
  parseCommonOptions(options);
}

void TextProcessor::parseCommonOptions(Ptr<Options> options) {
  maxLengthBreak_ = options->get<size_t>("max-length-break");
  ssplitMode_ = string2splitmode(options->get<std::string>("ssplit-mode"));
}

void TextProcessor::process(std::string &&input, AnnotatedText &source, Segments &segments) const {
  source = std::move(AnnotatedText(std::move(input)));
  std::string_view input_converted(source.text.data(), source.text.size());
  auto sentenceStream = ug::ssplit::SentenceStream(input_converted, ssplit_, ssplitMode_);

  std::string_view sentenceStringPiece;

  while (sentenceStream >> sentenceStringPiece) {
    marian::string_view sentence(sentenceStringPiece.data(), sentenceStringPiece.size());

    std::vector<string_view> wordRanges;
    Segment segment = tokenize(sentence, wordRanges);

    // There are some cases where SentencePiece or vocab returns no words
    // after normalization. 0 prevents any empty entries from being added.
    if (segment.size() > 0) {
      // Wrap segment into sentences of at most maxLengthBreak_ tokens and
      // tell source about them.
      wrap(segment, wordRanges, segments, source);
    }
  }
}

void TextProcessor::wrap(Segment &segment, std::vector<string_view> &wordRanges, Segments &segments,
                         AnnotatedText &source) const {
  // There's an EOS token added to the words, manually. SentencePiece/marian-vocab is set to not append EOS. Marian
  // requires EOS to be at the end as a marker to start translating. So while we're supplied maxLengthBreak_ from
  // outside, we need to ensure there's space for EOS in each wrapped segment.
  Word sourceEosId = vocabs_.sources().front()->getEosId();
  size_t wrapStep = maxLengthBreak_ - 1;

  for (size_t offset = 0; offset < segment.size(); offset += wrapStep) {
    auto start = segment.begin() + offset;

    // Restrict the range within bounds.
    size_t left = segment.size() - offset;
    size_t diff = std::min(wrapStep, left);

    segments.emplace_back(start, start + diff);
    segments.back().push_back(sourceEosId);

    auto astart = wordRanges.begin() + offset;

    // Construct a part vector of string_view representing wrapped segment, use the last string_view to create an EOS
    // string_view manually.
    std::vector<string_view> partWordRanges(astart, astart + diff);
    string_view &last = partWordRanges.back();
    const char *end = last.data() + last.size();
    partWordRanges.emplace_back(end, 0);
    // diff > 0
    source.recordExistingSentence(partWordRanges.begin(), partWordRanges.end(), astart->data());
  }
}

void TextProcessor::processFromAnnotation(AnnotatedText &source, Segments &segments) const {
  std::string copySource = source.text;
  AnnotatedText replacement(std::move(copySource));

  for (size_t s = 0; s < source.numSentences(); s++) {
    // This is our sentenceStream
    ByteRange sentenceByteRange = source.sentenceAsByteRange(s);

    // Fool tokenization using ByteRanges into looking at replacement. They're same, so okay.
    marian::string_view sentence{&replacement.text[sentenceByteRange.begin], sentenceByteRange.size()};

    std::vector<string_view> wordRanges;
    Segment segment = tokenize(sentence, wordRanges);

    // Manually add EoS
    Word sourceEosId = vocabs_.sources().front()->getEosId();
    segment.push_back(sourceEosId);

    if (!wordRanges.empty()) {
      string_view &last = wordRanges.back();  // this is a possible segfault if wordRanges is empty. So guard.
      const char *end = last.data() + last.size();
      wordRanges.emplace_back(end, 0);
    } else {
      const char *end = sentence.data() + sentence.size();
      wordRanges.emplace_back(end, 0);
    }

    segments.push_back(std::move(segment));
    replacement.recordExistingSentence(wordRanges.begin(), wordRanges.end(), wordRanges.begin()->data());
  }

  source = replacement;
}

}  // namespace bergamot
}  // namespace marian