Program Listing for File annotation.h

Return to documentation for file (src/translator/annotation.h)

#ifndef BERGAMOT_SENTENCE_RANGES_H_
#define BERGAMOT_SENTENCE_RANGES_H_

#include <algorithm>
#include <cassert>
#include <string>
#include <utility>
#include <vector>

#include "data/types.h"
#include "definitions.h"

namespace marian {
namespace bergamot {

/// Index of sentence, word and gap boundaries, stored purely as offsets.
///
/// Two parallel tables are kept:
///  - `token_begin_[i]` is the starting offset of token `i`; token `i` spans
///    from `token_begin_[i]` up to `token_begin_[i + 1]`.
///  - `gap_[k]` is the token index of the k-th gap.  Sentence `s` consists of
///    the tokens strictly between `gap_[s]` and `gap_[s + 1]`, so gaps and
///    sentences alternate and there is always one more gap than sentences.
///
/// None of the accessors validate their indices; passing an index outside
/// the recorded range indexes the underlying vectors out of bounds.
class Annotation {
 public:
  /// Builds the annotation of an empty text: no sentences and exactly one
  /// gap, which is the empty range starting and ending at offset 0.
  Annotation() : token_begin_{0, 0}, gap_{0} {}

  /// Number of sentences recorded, i.e. one less than the number of gaps.
  size_t numSentences() const {
    const size_t gapCount = gap_.size();
    return gapCount - 1;
  }

  /// Number of words in sentence `sentenceIdx`: the tokens lying between the
  /// gap that precedes the sentence and the gap that follows it.
  size_t numWords(size_t sentenceIdx) const {
    const size_t gapBefore = gap_[sentenceIdx];
    const size_t gapAfter = gap_[sentenceIdx + 1];
    // The leading gap is itself a token, so it is excluded from the count.
    return gapAfter - gapBefore - 1;
  }

  /// Range covered by word `wordIdx` of sentence `sentenceIdx`.
  ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
    // Words of a sentence start right after the gap token that precedes it.
    const size_t firstWordToken = gap_[sentenceIdx] + 1;
    const size_t token = firstWordToken + wordIdx;
    return ByteRange{token_begin_[token], token_begin_[token + 1]};
  }

  /// Range covered by the whole of sentence `sentenceIdx`, excluding the
  /// gaps on either side.
  ByteRange sentence(size_t sentenceIdx) const {
    // Start: where the first word begins (end of the whitespace before).
    const size_t firstWordToken = gap_[sentenceIdx] + 1;
    // End: where the following gap begins (start of the whitespace after).
    const size_t trailingGapToken = gap_[sentenceIdx + 1];
    return ByteRange{token_begin_[firstWordToken], token_begin_[trailingGapToken]};
  }

  /// Range covered by gap `gapIdx`.  Gap `k` precedes sentence `k`; the gap
  /// with index `numSentences()` follows the last sentence.
  ByteRange gap(size_t gapIdx) const {
    const size_t token = gap_[gapIdx];
    return ByteRange{token_begin_[token], token_begin_[token + 1]};
  }

 private:
  friend class AnnotatedText;

  /// Starting offset of every token (gaps included), plus one final entry
  /// marking where the last token ends.
  std::vector<size_t> token_begin_;

  /// Token indices (into `token_begin_`) of the gaps, in order.
  std::vector<size_t> gap_;
};

struct AnnotatedText {
 public:
  std::string text;
  Annotation annotation;

  AnnotatedText() {}

  AnnotatedText(std::string &&text);

  void appendSentence(string_view prefix, std::vector<string_view>::iterator tokens_begin,
                      std::vector<string_view>::iterator tokens_end);

  void appendEndingWhitespace(string_view whitespace);

  void recordExistingSentence(std::vector<string_view>::iterator tokens_begin,
                              std::vector<string_view>::iterator tokens_end, const char *sentence_begin);

  const size_t numSentences() const { return annotation.numSentences(); }

  const size_t numWords(size_t sentenceIdx) const { return annotation.numWords(sentenceIdx); }

  string_view word(size_t sentenceIdx, size_t wordIdx) const {
    return asStringView(annotation.word(sentenceIdx, wordIdx));
  }

  string_view sentence(size_t sentenceIdx) const { return asStringView(annotation.sentence(sentenceIdx)); }

  string_view gap(size_t sentenceIdx) const { return asStringView(annotation.gap(sentenceIdx)); }

  ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const { return annotation.word(sentenceIdx, wordIdx); }

  ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); }

  template <typename Fun>
  AnnotatedText apply(Fun fun) const {
    AnnotatedText out;

    for (size_t sentenceIdx = 0; sentenceIdx < numSentences(); ++sentenceIdx) {
      std::string sentence;
      std::vector<ByteRange> tokens;

      std::string prefix = fun(annotation.gap(sentenceIdx), gap(sentenceIdx), false);

      for (size_t wordIdx = 0; wordIdx < numWords(sentenceIdx); ++wordIdx) {
        std::string token = fun(wordAsByteRange(sentenceIdx, wordIdx), word(sentenceIdx, wordIdx), false);
        tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
        sentence += token;
      }

      // Convert our ByteRanges to string_views since that's what appendSentence
      // expects
      std::vector<marian::string_view> views(tokens.size());
      std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
        return marian::string_view(sentence.data() + range.begin, range.size());
      });

      out.appendSentence(prefix, views.begin(), views.end());
    }

    out.appendEndingWhitespace(fun(annotation.gap(numSentences()), gap(numSentences()), true));

    return out;
  }

 private:
  string_view asStringView(const ByteRange &byteRange) const {
    return string_view(text.data() + byteRange.begin, byteRange.size());
  }
};

}  // namespace bergamot
}  // namespace marian

#endif  //  BERGAMOT_SENTENCE_RANGES_H_