Program Listing for File quality_estimator.cpp


#include "quality_estimator.h"

namespace marian::bergamot {

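// The unsupervised estimator needs no trained model: it reads the decoder's subword
// log probabilities off the best (top) hypothesis of each history and converts them
// directly into word- and sentence-level quality scores.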
void UnsupervisedQualityEstimator::computeQualityScores(const Histories& histories, Response& response) const {
  for (size_t i = 0; i < histories.size(); ++i) {
    const Result result = histories[i]->top();
    const Hypothesis::PtrType& hypothesis = std::get<1>(result);
    const std::vector<float> logProbs = hypothesis->tracebackWordScores();
    response.qualityScores.push_back(computeSentenceScores(logProbs, response.target, i));
  }
}

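// Scores a single sentence: each word score is the mean of the log probabilities of
// the subwords that form the word, and the sentence score is the mean of the word
// scores. For example, a word split into subwords with log probabilities {-0.1, -0.3}
// receives a word score of -0.2.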
Response::SentenceQualityScore UnsupervisedQualityEstimator::computeSentenceScores(const std::vector<float>& logProbs,
                                                                                   const AnnotatedText& target,
                                                                                   const size_t sentenceIdx) const {
  const std::vector<SubwordRange> wordIndices = mapWords(logProbs, target, sentenceIdx);

  std::vector<float> wordScores;

  for (const SubwordRange& wordIndice : wordIndices) {
    wordScores.push_back(
        std::accumulate(logProbs.begin() + wordIndice.begin, logProbs.begin() + wordIndice.end, float(0.0)) /
        wordIndice.size());
  }

  const float sentenceScore =
      std::accumulate(std::begin(wordScores), std::end(wordScores), float(0.0)) / wordScores.size();

  return {wordScores, wordIndices, sentenceScore};
}

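// Minimal dense matrix backed by a flat vector in row-major order:
// element (row, col) lives at data_[row * cols + col].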
LogisticRegressorQualityEstimator::Matrix::Matrix(const size_t rowsParam, const size_t colsParam)
    : rows(rowsParam), cols(colsParam), data_(rowsParam * colsParam) {}

LogisticRegressorQualityEstimator::Matrix::Matrix(Matrix&& other)
    : rows(other.rows), cols(other.cols), data_(std::move(other.data_)) {}

const float& LogisticRegressorQualityEstimator::Matrix::at(const size_t row, const size_t col) const {
  return data_[row * cols + col];
}

float& LogisticRegressorQualityEstimator::Matrix::at(const size_t row, const size_t col) {
  return data_[row * cols + col];
}

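// The regression model is trained on standardized features f'[i] = (f[i] - mean[i]) / std[i].
// Expanding the linear term gives
//   sum_i w[i] * f'[i] = sum_i (w[i] / std[i]) * f[i] - sum_i (w[i] / std[i]) * mean[i],
// so pre-computing coefficientsByStds_[i] = w[i] / std[i] and
// constantFactor_ = sum_i coefficientsByStds_[i] * mean[i] lets predict() consume raw,
// unscaled features.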
LogisticRegressorQualityEstimator::LogisticRegressorQualityEstimator(Scale&& scale, Array&& coefficients,
                                                                     const float intercept)
    : scale_(std::move(scale)), coefficients_(std::move(coefficients)), intercept_(intercept), coefficientsByStds_() {
  // Pre-compute the standardization, folding the means and standard deviations into
  // the coefficients so that predict() can run on raw (unscaled) features.
  for (size_t i = 0; i < coefficients_.size(); ++i) {
    coefficientsByStds_[i] = coefficients_[i] / scale_.stds[i];
    constantFactor_ += coefficientsByStds_[i] * scale_.means[i];
  }
}

LogisticRegressorQualityEstimator::LogisticRegressorQualityEstimator(LogisticRegressorQualityEstimator&& other)
    : scale_(std::move(other.scale_)),
      coefficients_(std::move(other.coefficients_)),
      intercept_(other.intercept_),
      coefficientsByStds_(std::move(other.coefficientsByStds_)),
      constantFactor_(other.constantFactor_) {}

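// Deserializes a logistic regression model from a binary blob laid out as:
//   Header | stds[lrParametersDims] | means[lrParametersDims] |
//   coefficients[lrParametersDims] | intercept
// where every value after the header is a float.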
LogisticRegressorQualityEstimator LogisticRegressorQualityEstimator::fromAlignedMemory(
    const AlignedMemory& alignedMemory) {
  LOG(info, "[data] Loading Quality Estimator model from buffer");

  const char* ptr = alignedMemory.begin();
  const size_t blobSize = alignedMemory.size();

  ABORT_IF(blobSize < sizeof(Header), "Quality estimation file too small");
  const Header& header = *reinterpret_cast<const Header*>(ptr);

  ABORT_IF(header.magic != BINARY_QE_MODEL_MAGIC, "Incorrect magic bytes for quality estimation file");
  ABORT_IF(header.lrParametersDims <= 0, "The number of LR parameter dimensions cannot be less than or equal to zero");

  const uint64_t expectedSize =
      sizeof(Header) + (numLrParamsWithDimension_ * header.lrParametersDims + numIntercept_) * sizeof(float);
  ABORT_IF(expectedSize != blobSize, "QE header claims file size should be {} bytes but file is {} bytes", expectedSize,
           blobSize);

  ptr += sizeof(Header);
  const float* memoryIndex = reinterpret_cast<const float*>(ptr);

  const float* stds = memoryIndex;
  const float* means = stds + header.lrParametersDims;
  const float* coefficientsMemory = means + header.lrParametersDims;
  const float intercept = *(coefficientsMemory + header.lrParametersDims);

  Scale scale;

  Array coefficients;

  for (size_t i = 0; i < header.lrParametersDims; ++i) {
    scale.stds[i] = *(stds + i);

    ABORT_IF(scale.stds[i] == 0.0f, "Invalid standard deviation of zero");

    scale.means[i] = *(means + i);
    coefficients[i] = *(coefficientsMemory + i);
  }

  return LogisticRegressorQualityEstimator(std::move(scale), std::move(coefficients), intercept);
}

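// Serializes the model into the same binary layout that fromAlignedMemory() reads:
// header, standard deviations, means, coefficients, intercept.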
AlignedMemory LogisticRegressorQualityEstimator::toAlignedMemory() const {
  const size_t lrParametersDims = scale_.means.size();

  const size_t lrSize =
      (scale_.means.size() + scale_.stds.size() + coefficients_.size()) * sizeof(float) + sizeof(intercept_);

  Header header = {BINARY_QE_MODEL_MAGIC, lrParametersDims};
  marian::bergamot::AlignedMemory memory(sizeof(header) + lrSize);

  char* buffer = memory.begin();

  memcpy(buffer, &header, sizeof(header));
  buffer += sizeof(header);

  for (const float std : scale_.stds) {
    memcpy(buffer, &std, sizeof(std));
    buffer += sizeof(std);
  }

  for (const float mean : scale_.means) {
    memcpy(buffer, &mean, sizeof(mean));
    buffer += sizeof(mean);
  }

  for (size_t i = 0; i < lrParametersDims; ++i) {
    const float coefficient = coefficients_[i];
    memcpy(buffer, &coefficient, sizeof(coefficient));
    buffer += sizeof(coefficient);
  }

  memcpy(buffer, &intercept_, sizeof(intercept_));
  buffer += sizeof(intercept_);

  return memory;
}

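// Same driver loop as the unsupervised estimator, but each sentence is scored by
// running word-level features through the trained logistic regression model.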
void LogisticRegressorQualityEstimator::computeQualityScores(const Histories& histories, Response& response) const {
  for (size_t i = 0; i < histories.size(); ++i) {
    const Result result = histories[i]->top();
    const Hypothesis::PtrType& hypothesis = std::get<1>(result);
    const std::vector<float> logProbs = hypothesis->tracebackWordScores();

    response.qualityScores.push_back(computeSentenceScores(logProbs, response.target, i));
  }
}

Response::SentenceQualityScore LogisticRegressorQualityEstimator::computeSentenceScores(
    const std::vector<float>& logProbs, const AnnotatedText& target, const size_t sentenceIdx) const {
  const std::vector<SubwordRange> wordIndices = mapWords(logProbs, target, sentenceIdx);

  const std::vector<float> wordScores = predict(extractFeatures(wordIndices, logProbs));

  const float sentenceScore =
      std::accumulate(std::begin(wordScores), std::end(wordScores), float(0.0)) / wordScores.size();

  return {wordScores, wordIndices, sentenceScore};
}

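// Applies the logistic regression model to a feature matrix. Each row is reduced to
// the linear term z = coefficientsByStds_ . features[i] - constantFactor_ + intercept_
// (see the constructor for the standardization algebra), and the returned score is
// log(1 - sigmoid(z)), i.e. the log probability of the complementary class.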
std::vector<float> LogisticRegressorQualityEstimator::predict(const Matrix& features) const {
  std::vector<float> scores(features.rows);

  for (size_t i = 0; i < features.rows; ++i) {
    for (size_t j = 0; j < features.cols; ++j) {
      scores[i] += features.at(i, j) * coefficientsByStds_[j];
    }
  }

  for (size_t i = 0; i < features.rows; ++i) {
    scores[i] = std::log(1 - (1 / (1 + std::exp(-(scores[i] - constantFactor_ + intercept_)))));
  }

  return scores;
}
// Preprocesses the input data into the features expected by the logistic regression model. There are currently
// four features per word (recall that a word is made of one or more subword tokens): the mean log probability of
// the subwords that make up the word; the minimum log probability among those subwords; the number of subwords
// the word is made of; and the overall mean log probability of the entire sequence.
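// For instance (illustrative values): a word made of subwords with log probabilities
// {-0.2, -0.6}, in a sequence whose overall mean log probability is -0.35, yields the
// feature row {I_MEAN: -0.4, I_MIN: -0.6, I_NUM_SUBWORDS: 2, I_OVERALL_MEAN: -0.35}.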
LogisticRegressorQualityEstimator::Matrix LogisticRegressorQualityEstimator::extractFeatures(
    const std::vector<SubwordRange>& wordIndices, const std::vector<float>& logProbs) const {
  if (wordIndices.empty()) {
    return Matrix(0, 0);
  }
  // The number of features (numFeatures) currently must be 4.
  Matrix features(wordIndices.size(), /*numFeatures =*/4);
  size_t featureRow = 0;
  // I_MEAN = index in the feature vector of the mean log probability of a given word
  // I_MIN = index in the feature vector of the minimum log probability of a given word
  // I_NUM_SUBWORDS = index in the feature vector of the number of subwords that compose a given word
  // I_OVERALL_MEAN = index in the feature vector of the overall mean log probability of the entire sequence
  const size_t I_MEAN{0}, I_MIN{1}, I_NUM_SUBWORDS{2}, I_OVERALL_MEAN{3};

  float overallMean = 0.0;
  size_t numlogProbs = 0;

  for (const SubwordRange& wordIndice : wordIndices) {
    if (wordIndice.begin == wordIndice.end) {
      ++featureRow;
      continue;
    }

    float minScore = std::numeric_limits<float>::max();

    for (size_t i = wordIndice.begin; i < wordIndice.end; ++i) {
      ++numlogProbs;
      overallMean += logProbs[i];
      features.at(featureRow, I_MEAN) += logProbs[i];

      minScore = std::min<float>(logProbs[i], minScore);
    }

    features.at(featureRow, I_MEAN) /= static_cast<float>(wordIndice.size());
    features.at(featureRow, I_MIN) = minScore;
    features.at(featureRow, I_NUM_SUBWORDS) = wordIndice.size();

    ++featureRow;
  }

  if (numlogProbs == 0) {
    return Matrix(0, 0);
  }

  overallMean /= wordIndices.rbegin()->end;

  for (size_t i = 0; i < features.rows; ++i) {
    features.at(i, I_OVERALL_MEAN) = overallMean;
  }

  return features;
}

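// Groups subword positions into word-level ranges using the target text: a subword
// whose first character is whitespace starts a new word. The final entry of logProbs
// belongs to the end-of-sequence token, which is why the loop stops at
// logProbs.size() - 1 and why sequences shorter than two entries are treated as empty.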
std::vector<SubwordRange> mapWords(const std::vector<float>& logProbs, const AnnotatedText& target,
                                   const size_t sentenceIdx) {
  // Ignore empty target
  if ((logProbs.size() < 2) || (target.numWords(sentenceIdx) == 0)) {
    return {};
  }
  // A non-empty translation is expected to contain at least one word
  std::vector<SubwordRange> wordIndices(/*numWords=*/1);

  for (size_t subwordIdx = 0; subwordIdx < (logProbs.size() - 1); ++subwordIdx) {
    ByteRange subword = target.wordAsByteRange(sentenceIdx, subwordIdx);

    const char firstLetter = target.text.at(subword.begin);

    // If the first character is whitespace, it marks the beginning of a new word
    if (isspace(firstLetter)) {
      wordIndices.back().end = subwordIdx;
      wordIndices.emplace_back();
      wordIndices.back().begin = subwordIdx;
    }
  }

  wordIndices.back().end = logProbs.size() - 1;

  return wordIndices;
}

}  // namespace marian::bergamot