Program Listing for File text_processor.h¶
↰ Return to documentation for file (src/translator/text_processor.h
)
#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
#define SRC_BERGAMOT_TEXT_PROCESSOR_H_
#include <vector>
#include "aligned.h"
#include "annotation.h"
#include "data/types.h"
#include "data/vocab.h"
#include "definitions.h"
#include "ssplit.h"
#include "vocabs.h"
namespace marian {
namespace bergamot {
class TextProcessor {
public:
// There are two ways to construct text-processor, different in a file-system
// based prefix file load and a memory based prefix file store. @jerinphilip
// is not doing magic inference inside to determine file-based or memory
// based on one being empty or not.
TextProcessor(Ptr<Options>, const Vocabs &vocabs, const std::string &ssplit_prefix_file);
TextProcessor(Ptr<Options>, const Vocabs &vocabs, const AlignedMemory &memory);
void process(std::string &&blob, AnnotatedText &source, Segments &segments) const;
void processFromAnnotation(AnnotatedText &source, Segments &segments) const;
private:
void parseCommonOptions(Ptr<Options> options);
Segment tokenize(const string_view &input, std::vector<string_view> &tokenRanges) const;
void wrap(Segment &sentence, std::vector<string_view> &tokenRanges, Segments &segments, AnnotatedText &source) const;
const Vocabs &vocabs_;
size_t maxLengthBreak_;
ug::ssplit::SentenceSplitter ssplit_;
ug::ssplit::SentenceStream::splitmode ssplitMode_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_