
.. _program_listing_file_src_translator_annotation.h:

Program Listing for File annotation.h
=====================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_translator_annotation.h>` (``src/translator/annotation.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #ifndef BERGAMOT_SENTENCE_RANGES_H_
   #define BERGAMOT_SENTENCE_RANGES_H_

   // NOTE(review): the targets of the standard includes were lost when this
   // listing was extracted; the headers below are the ones required by the names
   // used in this file -- confirm against the real header.
   #include <algorithm>
   #include <cstddef>
   #include <string>
   #include <vector>

   #include "data/types.h"
   #include "definitions.h"

   namespace marian {
   namespace bergamot {

   /// Sentence and token boundaries expressed as byte offsets.
   ///
   /// The class stores offsets only; it holds no text. Tokens are stored as a
   /// flat list of begin offsets, and a second list marks which of those tokens
   /// are "gaps" (the stretch before the first sentence, between two sentences,
   /// or after the last one).
   class Annotation {
    public:
     /// Creates the annotation of an empty text: a single zero-length gap token
     /// and no sentences (numSentences() == 0).
     Annotation() {
       token_begin_.push_back(0);
       token_begin_.push_back(0);
       gap_.push_back(0);
     }

     /// Number of sentences. There is always one more gap than sentences.
     size_t numSentences() const { return gap_.size() - 1; }

     /// Number of words in sentence `sentenceIdx`: the tokens strictly between
     /// the gap before the sentence and the gap after it.
     size_t numWords(size_t sentenceIdx) const {
       return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */;
     }

     /// Byte range of word `wordIdx` within sentence `sentenceIdx`.
     /// No bounds checking is performed on either index.
     ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
       // Skip over the gap token that precedes the sentence.
       size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx;
       return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
     }

     /// Byte range of sentence `sentenceIdx`, excluding the gaps on either side.
     ByteRange sentence(size_t sentenceIdx) const {
       return ByteRange{
           token_begin_[gap_[sentenceIdx] + 1],  /*end of whitespace before */
           token_begin_[gap_[sentenceIdx + 1]]   /*beginning of whitespace after */
       };
     }

     /// Byte range of gap `gapIdx`. Gap `i` precedes sentence `i`; gap
     /// numSentences() follows the last sentence.
     ByteRange gap(size_t gapIdx) const {
       size_t tokenIdx = gap_[gapIdx];
       return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
     }

    private:
     friend class AnnotatedText;

     /// Begin offset of every token. Token `i` spans
     /// [token_begin_[i], token_begin_[i + 1]), so the vector holds one more
     /// entry than there are tokens.
     std::vector<size_t> token_begin_;

     /// Indices into token_begin_ of the tokens that are gaps.
     /// gap_.size() == numSentences() + 1.
     std::vector<size_t> gap_;
   };

   /// A string together with the Annotation describing its sentences and words.
   struct AnnotatedText {
    public:
     std::string text;       ///< The annotated text.
     Annotation annotation;  ///< Byte offsets into `text`.

     /// Empty text with the default (empty) annotation.
     AnnotatedText() {}

     /// Declared here, defined out of line.
     AnnotatedText(std::string &&text);

     /// Declared here, defined out of line. apply() below calls it with the
     /// text preceding a sentence and the sentence's tokens as string_views.
     void appendSentence(string_view prefix, std::vector<string_view>::iterator tokens_begin,
                         std::vector<string_view>::iterator tokens_end);

     /// Declared here, defined out of line. apply() below calls it once, with
     /// the text that follows the last sentence.
     void appendEndingWhitespace(string_view whitespace);

     /// Declared here, defined out of line.
     /// NOTE(review): the iterator's container type was lost in extraction and is
     /// assumed to match appendSentence() -- confirm against the real header.
     void recordExistingSentence(std::vector<string_view>::iterator tokens_begin,
                                 std::vector<string_view>::iterator tokens_end, const char *sentence_begin);

     /// Number of sentences in the annotation.
     size_t numSentences() const { return annotation.numSentences(); }

     /// Number of words in sentence `sentenceIdx`.
     size_t numWords(size_t sentenceIdx) const { return annotation.numWords(sentenceIdx); }

     /// View into `text` of word `wordIdx` of sentence `sentenceIdx`.
     string_view word(size_t sentenceIdx, size_t wordIdx) const {
       return asStringView(annotation.word(sentenceIdx, wordIdx));
     }

     /// View into `text` of sentence `sentenceIdx`.
     string_view sentence(size_t sentenceIdx) const { return asStringView(annotation.sentence(sentenceIdx)); }

     /// View into `text` of the gap preceding sentence `sentenceIdx`
     /// (or following the last sentence when sentenceIdx == numSentences()).
     string_view gap(size_t sentenceIdx) const { return asStringView(annotation.gap(sentenceIdx)); }

     /// Same as word(), but as a byte range rather than a view.
     ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const {
       return annotation.word(sentenceIdx, wordIdx);
     }

     /// Same as sentence(), but as a byte range rather than a view.
     ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); }

     /// Builds a new AnnotatedText by passing every gap and every word through
     /// `fun` and concatenating the results.
     ///
     /// `fun` is invoked as `fun(ByteRange, string_view, bool)` and its result is
     /// used as a std::string. The bool is `true` only for the gap after the last
     /// sentence and `false` for every other gap and for every word.
     template <typename Fun>
     AnnotatedText apply(Fun fun) const {
       AnnotatedText out;
       const size_t sentenceCount = numSentences();

       for (size_t sentenceIdx = 0; sentenceIdx < sentenceCount; ++sentenceIdx) {
         std::string rebuilt;  // transformed words of this sentence, concatenated
         std::vector<ByteRange> tokens;

         std::string prefix = fun(annotation.gap(sentenceIdx), gap(sentenceIdx), false);

         const size_t wordCount = numWords(sentenceIdx);
         tokens.reserve(wordCount);
         for (size_t wordIdx = 0; wordIdx < wordCount; ++wordIdx) {
           std::string token = fun(wordAsByteRange(sentenceIdx, wordIdx), word(sentenceIdx, wordIdx), false);
           // Record offsets rather than views: `rebuilt` may reallocate while it
           // grows, which would invalidate pointers into it.
           tokens.push_back(ByteRange{rebuilt.size(), rebuilt.size() + token.size()});
           rebuilt += token;
         }

         // Convert our ByteRanges to string_views since that's what appendSentence
         // expects
         std::vector<marian::string_view> views(tokens.size());
         std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
           return marian::string_view(rebuilt.data() + range.begin, range.size());
         });

         out.appendSentence(prefix, views.begin(), views.end());
       }

       out.appendEndingWhitespace(fun(annotation.gap(sentenceCount), gap(sentenceCount), true));

       return out;
     }

    private:
     /// View of `text` covering `byteRange`.
     string_view asStringView(const ByteRange &byteRange) const {
       return string_view(text.data() + byteRange.begin, byteRange.size());
     }
   };

   }  // namespace bergamot
   }  // namespace marian

   #endif  // BERGAMOT_SENTENCE_RANGES_H_