Program Listing for File vocabs.h

Return to documentation for file (src/translator/vocabs.h)

#pragma once

namespace marian {
namespace bergamot {

class Vocabs {
 public:
  Vocabs(Ptr<Options> options, std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories) : options_(options) {
    if (!vocabMemories.empty()) {
      // load vocabs from buffer
      load(std::move(vocabMemories));
    } else {
      // load vocabs from file
      auto vocabPaths = options->get<std::vector<std::string>>("vocabs");
      load(vocabPaths);
    }
  }

  const std::vector<Ptr<Vocab const>>& sources() const { return srcVocabs_; }

  const Ptr<Vocab const>& target() const { return trgVocab_; }

 private:
  std::vector<Ptr<Vocab const>> srcVocabs_;  // source vocabularies
  Ptr<Vocab const> trgVocab_;                // target vocabulary
  Ptr<Options> options_;

  // load from buffer
  void load(std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories) {
    // At least two vocabs: src and trg
    ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies.");
    srcVocabs_.resize(vocabMemories.size());
    // hashMap is introduced to avoid double loading the same vocab
    // loading vocabs (either from buffers or files) is the biggest bottleneck of the speed
    // uintptr_t holds unique keys (address) for share_ptr<AlignedMemory>
    std::unordered_map<uintptr_t, Ptr<Vocab>> vmap;
    for (size_t i = 0; i < srcVocabs_.size(); i++) {
      auto m = vmap.emplace(std::make_pair(reinterpret_cast<uintptr_t>(vocabMemories[i].get()), Ptr<Vocab>()));
      if (m.second) {  // new: load the vocab
        m.first->second = New<Vocab>(options_, i);
        m.first->second->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size()));
      }
      srcVocabs_[i] = m.first->second;
    }
    // Initialize target vocab
    trgVocab_ = srcVocabs_.back();
    srcVocabs_.pop_back();
  }

  // load from file
  void load(const std::vector<std::string>& vocabPaths) {
    // with the current setup, we need at least two vocabs: src and trg
    ABORT_IF(vocabPaths.size() < 2, "Insufficient number of vocabularies.");
    srcVocabs_.resize(vocabPaths.size());
    std::unordered_map<std::string, Ptr<Vocab>> vmap;
    for (size_t i = 0; i < srcVocabs_.size(); ++i) {
      auto m = vmap.emplace(std::make_pair(vocabPaths[i], Ptr<Vocab>()));
      if (m.second) {  // new: load the vocab
        m.first->second = New<Vocab>(options_, i);
        m.first->second->load(vocabPaths[i]);
      }
      srcVocabs_[i] = m.first->second;
    }
    // Initialize target vocab
    trgVocab_ = srcVocabs_.back();
    srcVocabs_.pop_back();
  }
};

}  // namespace bergamot
}  // namespace marian