Program Listing for File html.h

Return to documentation for file (src/translator/html.h)

#ifndef SRC_BERGAMOT_HTML_H_
#define SRC_BERGAMOT_HTML_H_

#include <forward_list>
#include <set>
#include <stdexcept>
#include <string>
#include <string_view>

#include "annotation.h"
#include "data/types.h"
#include "definitions.h"

namespace marian::bergamot {

struct Response;

class HTML {
 public:
  using TagNameSet = std::set<std::string, std::less<>>;

  struct Options {
    TagNameSet voidTags{"area", "base",  "basefont", "bgsound", "br",   "col",   "embed",  "frame", "hr",
                        "img",  "input", "keygen",   "link",    "meta", "param", "source", "track", "wbr"};

    TagNameSet inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
                          "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
                          "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};

    TagNameSet inWordTags{"wbr"};

    TagNameSet ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};

    std::string continuationDelimiters = "\n ,.(){}[]";

    bool substituteInlineTagsWithSpaces = true;
  };

  struct Tag {
    enum NodeType {
      ELEMENT,                 // <b>...</b>
      VOID_ELEMENT,            // <img>
      COMMENT,                 // <!-- ... -->
      PROCESSING_INSTRUCTION,  // <?...?>
      WHITESPACE,              // A \n\n we inserted to break a sentence.
    };

    NodeType type;           // Type of the node
    std::string name;        // Tag name (if type is ELEMENT or VOID_ELEMENT)
    std::string attributes;  // Tag attributes (as raw HTML string, including
                             // entities and prefix whitespace)
    std::string data;        // Raw data of an element that just needs to be
                             // copied as is, e.g. <script> or <style>
  };

  using TagStack = std::vector<Tag *>;

  struct Span {
    size_t begin;   // Start offset in (plain text) source
    size_t end;     // end offset in source
    TagStack tags;  // Note: free pointers to memory owned by `pool_`.
    inline size_t size() const { return end - begin; }
  };

  explicit HTML(std::string &&source, bool processMarkup) : HTML(std::move(source), processMarkup, HTML::Options{}){};
  explicit HTML(std::string &&source, bool processMarkup, Options &&options);

  HTML(const HTML &) = delete;

  HTML(HTML &&) = default;

  void restore(Response &response);

 private:
  using SpanIterator = std::vector<HTML::Span>::iterator;
  using AnnotatedText = marian::bergamot::AnnotatedText;

  AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);

  AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans,
                              std::vector<HTML::TagStack> const &targetTokenTags);

  bool isContinuation(marian::string_view prev, marian::string_view str) const;
  bool isContinuation(std::string_view prev, std::string_view str) const;

  void copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
                    std::vector<HTML::SpanIterator> const &sourceTokenSpans,
                    std::vector<HTML::SpanIterator> &targetTokenSpans);

  void annotateTagStack(Response const &response, std::vector<SpanIterator> const &targetTokenSpans,
                        std::vector<HTML::TagStack> &targetTokenTags);

  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                      std::vector<HTML::SpanIterator> const &sourceTokenSpans);

  Tag *makeTag(Tag &&tag);

  Options options_;

  std::vector<Span> spans_;

  std::forward_list<Tag> pool_;
};

}  // namespace marian::bergamot

#endif  // SRC_BERGAMOT_HTML_H_