Program Listing for File annotation.h
↰ Return to documentation for file (src/translator/annotation.h)
#ifndef BERGAMOT_SENTENCE_RANGES_H_
#define BERGAMOT_SENTENCE_RANGES_H_
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

#include "data/types.h"
#include "definitions.h"
namespace marian {
namespace bergamot {
/// Byte-offset bookkeeping for the sentences and words of a piece of text.
///
/// The text is described as a flat sequence of tokens. `token_begin_[i]` is the
/// byte offset at which token `i` starts; token `i` extends up to
/// `token_begin_[i + 1]`. Some of those tokens are "gaps" (the whitespace
/// before/after a sentence): `gap_[k]` is the token index of gap `k`. The words
/// of sentence `s` are the tokens strictly between gap `s` and gap `s + 1`, so
/// there is always exactly one more gap than there are sentences.
///
/// The class itself only offers read access; the vectors are filled in by
/// AnnotatedText, which is a friend.
class Annotation {
 public:
  /// Starts with zero sentences and a single empty gap token covering bytes
  /// [0, 0).
  Annotation() {
    token_begin_.push_back(0);  // begin of the initial (empty) gap token
    token_begin_.push_back(0);  // end of that gap token
    gap_.push_back(0);          // gap 0 is token 0
  }

  /// Number of sentences: one fewer than the number of gaps.
  size_t numSentences() const { return gap_.size() - 1; }

  /// Number of words in sentence `sentenceIdx`.
  size_t numWords(size_t sentenceIdx) const {
    assert(sentenceIdx + 1 < gap_.size());
    return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */;
  }

  /// Byte range of word `wordIdx` within sentence `sentenceIdx`.
  ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
    assert(sentenceIdx < gap_.size());
    // Skip over the gap token that precedes the sentence.
    size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx;
    assert(tokenIdx + 1 < token_begin_.size());
    return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
  }

  /// Byte range of sentence `sentenceIdx`, excluding the gaps on either side.
  ByteRange sentence(size_t sentenceIdx) const {
    assert(sentenceIdx + 1 < gap_.size());
    return ByteRange{
        token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */
        token_begin_[gap_[sentenceIdx + 1]]  /*beginning of whitespace after */
    };
  }

  /// Byte range of gap `gapIdx`. Gap `i` precedes sentence `i`; the gap with
  /// index `numSentences()` follows the last sentence.
  ByteRange gap(size_t gapIdx) const {
    assert(gapIdx < gap_.size());
    size_t tokenIdx = gap_[gapIdx];
    assert(tokenIdx + 1 < token_begin_.size());
    return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
  }

 private:
  // AnnotatedText is declared as a struct; use the matching class-key.
  friend struct AnnotatedText;

  /// Byte offset at which each token starts; the final entry closes the last
  /// token.
  std::vector<size_t> token_begin_;

  /// Indices into `token_begin_` of the gap tokens, one more than the number
  /// of sentences.
  std::vector<size_t> gap_;
};
struct AnnotatedText {
public:
std::string text;
Annotation annotation;
AnnotatedText() {}
AnnotatedText(std::string &&text);
void appendSentence(string_view prefix, std::vector<string_view>::iterator tokens_begin,
std::vector<string_view>::iterator tokens_end);
void appendEndingWhitespace(string_view whitespace);
void recordExistingSentence(std::vector<string_view>::iterator tokens_begin,
std::vector<string_view>::iterator tokens_end, const char *sentence_begin);
const size_t numSentences() const { return annotation.numSentences(); }
const size_t numWords(size_t sentenceIdx) const { return annotation.numWords(sentenceIdx); }
string_view word(size_t sentenceIdx, size_t wordIdx) const {
return asStringView(annotation.word(sentenceIdx, wordIdx));
}
string_view sentence(size_t sentenceIdx) const { return asStringView(annotation.sentence(sentenceIdx)); }
string_view gap(size_t sentenceIdx) const { return asStringView(annotation.gap(sentenceIdx)); }
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const { return annotation.word(sentenceIdx, wordIdx); }
ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); }
template <typename Fun>
AnnotatedText apply(Fun fun) const {
AnnotatedText out;
for (size_t sentenceIdx = 0; sentenceIdx < numSentences(); ++sentenceIdx) {
std::string sentence;
std::vector<ByteRange> tokens;
std::string prefix = fun(annotation.gap(sentenceIdx), gap(sentenceIdx), false);
for (size_t wordIdx = 0; wordIdx < numWords(sentenceIdx); ++wordIdx) {
std::string token = fun(wordAsByteRange(sentenceIdx, wordIdx), word(sentenceIdx, wordIdx), false);
tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
sentence += token;
}
// Convert our ByteRanges to string_views since that's what appendSentence
// expects
std::vector<marian::string_view> views(tokens.size());
std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
return marian::string_view(sentence.data() + range.begin, range.size());
});
out.appendSentence(prefix, views.begin(), views.end());
}
out.appendEndingWhitespace(fun(annotation.gap(numSentences()), gap(numSentences()), true));
return out;
}
private:
string_view asStringView(const ByteRange &byteRange) const {
return string_view(text.data() + byteRange.begin, byteRange.size());
}
};
} // namespace bergamot
} // namespace marian
#endif // BERGAMOT_SENTENCE_RANGES_H_