Program Listing for File text_processor.h

Return to documentation for file (src/translator/text_processor.h)

#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
#define SRC_BERGAMOT_TEXT_PROCESSOR_H_

#include <vector>

#include "aligned.h"
#include "annotation.h"
#include "data/types.h"
#include "data/vocab.h"
#include "definitions.h"
#include "ssplit.h"
#include "vocabs.h"

namespace marian {
namespace bergamot {

/// Prepares input text for translation: fills an AnnotatedText and a list of
/// Segments from either a raw text blob or an existing annotation.
///
/// NOTE(review): only declarations are visible in this header. Judging by the
/// members, the class combines a sentence splitter (ssplit_) with
/// vocabulary-based tokenization (vocabs_); confirm behavioral details against
/// the implementation file.
class TextProcessor {
 public:
  // There are two ways to construct a TextProcessor: one loads the
  // sentence-splitter prefix file from the file system, the other takes the
  // prefix file contents from memory. The caller picks one explicitly through
  // the overload; nothing inside tries to infer file-based vs. memory-based
  // from one of the arguments being empty (@jerinphilip).

  /// File-system based construction.
  ///
  /// The first (unnamed) argument carries the options; `vocabs` is taken by
  /// const reference and `ssplit_prefix_file` names the sentence-splitter
  /// prefix file.
  /// NOTE(review): `vocabs` is presumably bound to the `vocabs_` reference
  /// member, in which case it must outlive this object -- confirm.
  TextProcessor(Ptr<Options>, const Vocabs &vocabs, const std::string &ssplit_prefix_file);

  /// Memory based construction.
  ///
  /// Same as the overload above, except that the sentence-splitter prefix
  /// data is supplied through `memory` instead of a file path.
  /// NOTE(review): whether an empty `memory` is accepted is not visible from
  /// this header -- confirm in the implementation.
  TextProcessor(Ptr<Options>, const Vocabs &vocabs, const AlignedMemory &memory);


  /// Processes a raw text blob.
  ///
  /// `blob` is taken as an rvalue reference, so the caller hands the string
  /// over (pass a temporary or std::move it). `source` and `segments` are
  /// mutable references and act as outputs.
  /// NOTE(review): presumably `source` ends up owning the text together with
  /// its sentence/token annotation, and `segments` receives one entry per
  /// produced sentence -- confirm.
  void process(std::string &&blob, AnnotatedText &source, Segments &segments) const;

  /// Variant of process() that starts from an existing AnnotatedText instead
  /// of a raw blob. `source` is a mutable reference, so it may be modified;
  /// `segments` is an output.
  /// NOTE(review): presumably reuses the sentence boundaries already stored in
  /// `source` rather than running the sentence splitter again -- confirm.
  void processFromAnnotation(AnnotatedText &source, Segments &segments) const;

 private:
  /// Option handling shared by both constructors.
  /// NOTE(review): presumably initializes maxLengthBreak_ and ssplitMode_ from
  /// `options` -- confirm.
  void parseCommonOptions(Ptr<Options> options);

  /// Tokenizes `input` and returns the resulting Segment; `tokenRanges` is a
  /// mutable reference used as an output.
  /// NOTE(review): presumably each string_view in `tokenRanges` refers to the
  /// span of `input` covered by the corresponding token, so the views are only
  /// valid while the underlying text is alive -- confirm.
  Segment tokenize(const string_view &input, std::vector<string_view> &tokenRanges) const;

  /// Adds a tokenized sentence and its token ranges to `segments` / `source`.
  /// NOTE(review): the name and maxLengthBreak_ suggest that over-long
  /// sentences are broken into several segments here -- confirm.
  void wrap(Segment &sentence, std::vector<string_view> &tokenRanges, Segments &segments, AnnotatedText &source) const;

  // Non-owning reference to the vocabularies; the referenced object must
  // outlive this TextProcessor. Being a reference member, it also makes the
  // class non-assignable.
  const Vocabs &vocabs_;
  // NOTE(review): presumably the maximum sentence length (in tokens) before a
  // forced break -- confirm where it is set and used.
  size_t maxLengthBreak_;

  // Sentence splitter instance owned by this object.
  ug::ssplit::SentenceSplitter ssplit_;

  // Split mode used together with ssplit_ (a SentenceStream::splitmode value).
  ug::ssplit::SentenceStream::splitmode ssplitMode_;
};

}  // namespace bergamot
}  // namespace marian

#endif  // SRC_BERGAMOT_TEXT_PROCESSOR_H_