Program Listing for File parser.cpp¶
↰ Return to documentation for file (src/translator/parser.cpp
)
#include "parser.h"
#include <unordered_map>
#include "common/build_info.h"
#include "common/config.h"
#include "common/regex.h"
#include "common/version.h"
namespace marian {
namespace bergamot {
std::shared_ptr<marian::Options> parseOptionsFromFilePath(const std::string &configPath, bool validate /*= true*/) {
// Read entire string and redirect to parseOptionsFromString
std::ifstream readStream(configPath);
std::stringstream buffer;
buffer << readStream.rdbuf();
return parseOptionsFromString(buffer.str(), validate, /*pathsInSameDirAs=*/configPath);
};
std::shared_ptr<marian::Options> parseOptionsFromString(const std::string &configAsString, bool validate /*= true*/,
std::string pathsInSameDirAs /*=""*/) {
marian::Options options;
marian::ConfigParser configParser(cli::mode::translation);
// These are additional options we use to hijack for our own marian-replacement layer (for batching,
// multi-request-compile etc) and hence goes into Ptr<Options>.
configParser.addOption<size_t>("--max-length-break", "Bergamot Options",
"Maximum input tokens to be processed in a single sentence.", 128);
// The following is a complete hijack of an existing option, so no need to add explicitly.
// configParser.addOption<size_t>("--mini-batch-words", "Bergamot Options",
// "Maximum input tokens to be processed in a single sentence.", 1024);
configParser.addOption<std::string>("--ssplit-prefix-file", "Bergamot Options",
"File with nonbreaking prefixes for sentence splitting.");
configParser.addOption<std::string>("--ssplit-mode", "Bergamot Options", "[paragraph, sentence, wrapped_text]",
"paragraph");
configParser.addOption<std::string>("--quality", "Bergamot Options", "File considering Quality Estimation model");
// Parse configs onto defaultConfig. The preliminary merge sets the YAML internal representation with legal values.
const YAML::Node &defaultConfig = configParser.getConfig();
options.merge(defaultConfig);
options.parse(configAsString);
// This is in a marian `.cpp` as of now, and requires explicit copy-here.
// https://github.com/marian-nmt/marian-dev/blob/9fa166be885b025711f27b35453e0f2c00c9933e/src/common/config_parser.cpp#L28
// clang-format off
const std::set<std::string> PATHS = {
"model",
"models",
"train-sets",
"vocabs",
"embedding-vectors",
"valid-sets",
"valid-script-path",
"valid-script-args",
"valid-log",
"valid-translation-output",
"input", // except: 'stdin', handled in makeAbsolutePaths and interpolateEnvVars
"output", // except: 'stdout', handled in makeAbsolutePaths and interpolateEnvVars
"pretrained-model",
"data-weighting",
"log",
"sqlite", // except: 'temporary', handled in the processPaths function
"shortlist", // except: only the first element in the sequence is a path, handled in the
// processPaths function
"ssplit-prefix-file", // added for bergamot
"quality", // added for bergamot
};
// clang-format on
if (!pathsInSameDirAs.empty()) {
YAML::Node configYAML = options.cloneToYamlNode();
marian::cli::makeAbsolutePaths(configYAML, pathsInSameDirAs, PATHS);
options.merge(configYAML, /*overwrite=*/true);
}
// Perform validation on parsed options only when requested
if (validate) {
YAML::Node configYAML = options.cloneToYamlNode();
marian::ConfigValidator validator(configYAML);
validator.validateOptions(marian::cli::mode::translation);
}
return std::make_shared<marian::Options>(options);
}
} // namespace bergamot
} // namespace marian