// Program listing for file html.h
// (documentation source: src/translator/html.h)
#ifndef SRC_BERGAMOT_HTML_H_
#define SRC_BERGAMOT_HTML_H_
#include <cstddef>
#include <forward_list>
#include <functional>
#include <set>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "annotation.h"
#include "data/types.h"
#include "definitions.h"
namespace marian::bergamot {
struct Response;
/// HTML handling for a translation request. An instance is constructed from
/// an HTML source string (taken by rvalue reference) and later applied to a
/// `Response` through `restore()`.
///
/// NOTE(review): only declarations are visible in this header; the parsing
/// and restoration behaviour itself is defined in the implementation file.
/// Descriptions below that go beyond the signatures are marked as assumptions.
///
/// The class is move-only: copying is disabled, moving is defaulted.
class HTML {
 public:
  /// Ordered set of tag names. `std::less<>` is a transparent comparator, so
  /// `find`/`count` accept any type comparable with `std::string` (for
  /// example `std::string_view`) without constructing a temporary string.
  using TagNameSet = std::set<std::string, std::less<>>;

  /// Settings controlling how tags are classified and how text is handled.
  ///
  /// NOTE(review): the per-field descriptions are inferred from the field
  /// names and their default values only — confirm against the implementation.
  struct Options {
    /// Tag names classified as "void" (presumably elements for which no
    /// closing tag is expected — TODO confirm).
    TagNameSet voidTags{"area", "base",  "basefont", "bgsound", "br",   "col",   "embed",  "frame", "hr",
                        "img",  "input", "keygen",   "link",    "meta", "param", "source", "track", "wbr"};

    /// Tag names classified as "inline" (presumably elements that do not
    /// break the surrounding sentence — TODO confirm).
    TagNameSet inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
                          "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
                          "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};

    /// Tag names classified as "in-word" (presumably elements that may occur
    /// inside a word — TODO confirm).
    TagNameSet inWordTags{"wbr"};

    /// Tag names classified as "ignored" (exact treatment is not visible in
    /// this header — TODO confirm).
    TagNameSet ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};

    /// Set of delimiter characters; presumably consulted by
    /// `isContinuation()` — verify against the implementation.
    std::string continuationDelimiters = "\n ,.(){}[]";

    /// Defaults to `true`. Presumably controls whether a space is emitted in
    /// place of an inline tag in the extracted text — TODO confirm.
    bool substituteInlineTagsWithSpaces = true;
  };

  /// A single markup node recorded while processing the source.
  struct Tag {
    enum NodeType {
      ELEMENT,                 // <b>...</b>
      VOID_ELEMENT,            // <img>
      COMMENT,                 // <!-- ... -->
      PROCESSING_INSTRUCTION,  // <?...?>
      WHITESPACE,              // A \n\n we inserted to break a sentence.
    };

    // Default-initialised so a default-constructed Tag never carries an
    // indeterminate enum value. ELEMENT is the first enumerator, i.e. the
    // same value that value-initialisation (`Tag{}`) already produced.
    NodeType type = ELEMENT;  // Type of the node
    std::string name;         // Tag name (if type is ELEMENT or VOID_ELEMENT)
    std::string attributes;   // Tag attributes (as raw HTML string, including
                              // entities and prefix whitespace)
    std::string data;         // Raw data of an element that just needs to be
                              // copied as is, e.g. <script> or <style>
  };

  /// Stack of non-owning `Tag` pointers.
  using TagStack = std::vector<Tag *>;

  /// A half-open-looking range `[begin, end)` of the plain text source
  /// together with the tags that apply to it (`size()` is `end - begin`).
  struct Span {
    // Offsets are zero-initialised so a default-constructed Span is a
    // well-defined empty span instead of holding indeterminate values.
    size_t begin = 0;  // Start offset in (plain text) source
    size_t end = 0;    // end offset in source
    TagStack tags;     // Note: non-owning pointers to memory owned by `pool_`.

    /// Length of the span in the same units as `begin`/`end`.
    size_t size() const { return end - begin; }
  };

  /// Constructs with default `Options` by delegating to the three-argument
  /// constructor.
  /// @param source         HTML input; taken by rvalue reference and forwarded
  ///                       with `std::move`.
  /// @param processMarkup  Forwarded unchanged to the three-argument
  ///                       constructor (presumably enables markup handling —
  ///                       TODO confirm).
  explicit HTML(std::string &&source, bool processMarkup) : HTML(std::move(source), processMarkup, HTML::Options{}) {}

  /// Constructs with caller-supplied options. Defined out of line.
  /// @param source         HTML input, taken by rvalue reference.
  /// @param processMarkup  See the two-argument constructor.
  /// @param options        Settings to use for this instance.
  explicit HTML(std::string &&source, bool processMarkup, Options &&options);

  // Move-only. `spans_` stores raw pointers to elements of `pool_`, so a
  // member-wise copy would leave the copy pointing into the original's pool.
  // Moving a std::forward_list transfers its nodes, so the pointers held in
  // `spans_` keep referring to the same Tag objects after a move.
  HTML(const HTML &) = delete;
  HTML &operator=(const HTML &) = delete;
  HTML(HTML &&) = default;
  HTML &operator=(HTML &&) = default;

  /// Applies this object's recorded markup to `response`, which is modified
  /// in place. Defined out of line.
  /// NOTE(review): which fields of `Response` are rewritten is not visible
  /// from this header.
  void restore(Response &response);

 private:
  using SpanIterator = std::vector<HTML::Span>::iterator;
  using AnnotatedText = marian::bergamot::AnnotatedText;

  /// Builds a new `AnnotatedText` from `in` and fills `sourceTokenSpans`
  /// (out-parameter). Presumably yields the source text with markup
  /// re-inserted and one span per source token — TODO confirm.
  AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);

  /// Builds a new `AnnotatedText` from `in` using the per-token spans and
  /// per-token tag stacks supplied by the caller (both read-only).
  AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans,
                              std::vector<HTML::TagStack> const &targetTokenTags);

  /// Overload taking `marian::string_view`. Presumably reports whether `str`
  /// continues the word that `prev` ends with — verify against the
  /// implementation.
  bool isContinuation(marian::string_view prev, marian::string_view str) const;

  /// Overload taking `std::string_view`; same contract as the
  /// `marian::string_view` overload (assumed — TODO confirm).
  bool isContinuation(std::string_view prev, std::string_view str) const;

  /// Reads `response`, `alignments` and `sourceTokenSpans`; writes
  /// `targetTokenSpans` (out-parameter). Presumably maps each target token to
  /// the span of its aligned source token — TODO confirm.
  void copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
                    std::vector<HTML::SpanIterator> const &sourceTokenSpans,
                    std::vector<HTML::SpanIterator> &targetTokenSpans);

  /// Reads `response` and `targetTokenSpans`; writes `targetTokenTags`
  /// (out-parameter).
  void annotateTagStack(Response const &response, std::vector<SpanIterator> const &targetTokenSpans,
                        std::vector<HTML::TagStack> &targetTokenTags);

  /// Reads `response` and `sourceTokenSpans`; writes `alignments`
  /// (out-parameter). Presumably derives a single hard source index per
  /// target token — TODO confirm.
  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                      std::vector<HTML::SpanIterator> const &sourceTokenSpans);

  /// Takes a `Tag` by rvalue reference and returns a raw pointer to a `Tag`.
  /// Presumably stores the tag in `pool_` and returns its address — TODO
  /// confirm. Callers must not delete the returned pointer if so.
  Tag *makeTag(Tag &&tag);

  /// Settings this instance was constructed with.
  Options options_;

  /// Spans recorded for the source text.
  std::vector<Span> spans_;

  /// Storage for `Tag` objects referenced by `Span::tags`. A
  /// `std::forward_list` never relocates existing elements on insertion, so
  /// raw pointers to its elements stay valid as the pool grows.
  std::forward_list<Tag> pool_;
};
}  // namespace marian::bergamot
#endif  // SRC_BERGAMOT_HTML_H_