15.1.244. tablet_qt/maths/glm.cpp

/*
    Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CamCOPS.

    CamCOPS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CamCOPS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.
*/

/*

[1] https://github.com/wepe/MachineLearning/tree/master/logistic%20regression/use_cpp_and_eigen
    ... gives WRONG ANSWERS
[2] https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_error_function_and_logistic_regression
[3] https://eigen.tuxfamily.org/dox/group__QuickRefPage.html#title2
[4] https://stackoverflow.com/questions/19824293/regularized-logistic-regression-code-in-matlab
[5] http://www.cs.cmu.edu/~ggordon/IRLS-example/
[6] https://stats.stackexchange.com/questions/166958/multinomial-logistic-loss-vs-cross-entropy-vs-square-error
[7] http://eli.thegreenplace.net/2016/logistic-regression/
[8] http://blog.smellthedata.com/2009/06/python-logistic-regression-with-l2.html
[9] https://github.com/PatWie/CppNumericalSolvers
[10] https://bwlewis.github.io/GLM/  <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    - Best algorithmic introduction to GLMs
[11] https://en.wikipedia.org/wiki/Generalized_linear_model#Model_components
[12] http://web.as.uky.edu/statistics/users/pbreheny/760/S13/notes/2-19.pdf

Subsequently:
[13] http://madrury.github.io/jekyll/update/statistics/2016/07/20/lm-in-R.html

-------------------------------------------------------------------------------
First, a general linear model
-------------------------------------------------------------------------------
- Cardinal & Aitken 2006, p379 onwards.

Matrix notation is standard:
    - define matrix(nrows, ncols)
    - address element as m(row, col)

For a single dependent variable:
    n   number of observations
    k   number of predictors (including intercept)

    Y   dependent variable(s), vector (n * 1)
    X   design matrix (predictors), matrix (n * k)
    b   coefficients/parameters, vector (k * 1)
    e   error, vector (n * 1)
        ... expected to be normally distributed with zero mean: e ~ N(0, sigma^2)

Then a general linear model is

    Y = Xb + e

... for which we solve for b.
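
For ordinary least squares (normally distributed errors), the standard
solution is b = (X_T X)^-1 X_T Y; the iterative methods in this file
generalize this to other error distributions via reweighting.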

A generalized linear model extends this with a link function [11]:
    eta = Xb                        // linear predictor
    E(Y) = mu = invlink(eta)        // Y = invlink(eta) or eta = link(y), ignoring error etc.

i.e.

    Y = invlink(Xb + e)             // or Y = invlink(Xb) + e?  In any case, Y_predicted = invlink(Xb)
    link(Y) = Xb + e
    g(Y) = Xb + e                   // the link function is called g()

... so in Wikipedia notation,

    Xb = g(mu) = g(Y)

For logistic regression, then:

    Y = logistic(Xb + e)            // logistic is the INVERSE link function
    logit(Y) = Xb + e               // logit (= inverse logistic) is the link fn

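For reference, the standard definitions are:

    logistic(x) = 1 / (1 + exp(-x))     // maps (-inf, +inf) to (0, 1)
    logit(p)    = ln(p / (1 - p))       // inverse; maps (0, 1) to (-inf, +inf)
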
*/

// #define DEBUG_DESIGN_MATRIX

#include "glm.h"
#include <algorithm>
#include <QDebug>
#include "common/preprocessor_aid.h"  // IWYU pragma: keep
#include "maths/dqrls.h"
#include "maths/eigenfunc.h"
#include "maths/statsfunc.h"
using namespace Eigen;

// Eigen's cross() requires specific dimensions that must be known at compile
// time, or you get THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE.
// - https://stackoverflow.com/questions/43283444/row-wise-cross-product-eigen
// - http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1037
// A more general cross-product (after R's ?crossprod) is t(x) %*% y.
// However, this doesn't work as a template because Eigen produces objects of
// intermediate type, like Eigen::Product<...>, so let's just use the
// preprocessor, being careful with brackets:
#define CROSSPROD(x, y) ((x).matrix().transpose() * (y).matrix())
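// For example, if X is (n,k) and y is (n,1), CROSSPROD(X, y) gives
// t(X) %*% y, of dimension (k,1) -- the same as R's crossprod(X, y).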

// Also helpful to have an svd() macro to match R's.
#define svd(x) ((x).jacobiSvd(ComputeThinU | ComputeThinV))


const double NA = std::numeric_limits<double>::quiet_NaN();
const double INF = std::numeric_limits<double>::infinity();


// ============================================================================
// Constructors
// ============================================================================

Glm::Glm(const LinkFunctionFamily& link_fn_family,
         const SolveMethod solve_method,
         const int max_iterations,
         const double tolerance,
         const RankDeficiencyMethod rank_deficiency_method) :
    m_link_fn_family(link_fn_family),
    m_solve_method(solve_method),
    m_max_iterations(max_iterations),
    m_tolerance(tolerance),
    m_rank_deficiency_method(rank_deficiency_method),
    m_verbose(false)
{
    reset();
}


Glm::Glm(const Eigen::MatrixXd& predictors,
         const Eigen::VectorXd& dependent_variable,
         const LinkFunctionFamily& link_fn_family,
         bool add_intercept,
         SolveMethod solve_method,
         int max_iterations,
         double tolerance,
         RankDeficiencyMethod rank_deficiency_method) :
    // Delegating constructor:
    Glm(
        link_fn_family,
        solve_method,
        max_iterations,
        tolerance,
        rank_deficiency_method
    )
{
    if (add_intercept) {
        fitAddingIntercept(predictors, dependent_variable);
    } else {
        fit(predictors, dependent_variable);
    }
}
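

/*
Illustrative usage sketch (not part of the original source). "logitFamily"
stands in for whatever LinkFunctionFamily instance the caller supplies (e.g.
one configured for logistic regression), and the header's default arguments
are assumed for the remaining constructor parameters:

    Glm glm(logitFamily, Glm::SolveMethod::IRLS_R_glmfit);
    glm.fitAddingIntercept(predictors, depvar);  // predictors n*k; depvar n*1
    if (glm.fitted() && glm.converged()) {
        VectorXd coeffs = glm.coefficients();  // (k+1),1 -- intercept first
        VectorXd y_hat = glm.predict();        // = invlink(X * coeffs)
    }
*/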


// ============================================================================
// Set options
// ============================================================================

void Glm::setVerbose(bool verbose)
{
    m_verbose = verbose;
}


// ============================================================================
// Fit method
// ============================================================================

void Glm::fit(const MatrixXd& predictors,
              const VectorXd& depvar,
              VectorXd* p_weights)
{
    if (m_verbose) {
        qInfo() << "Glm::fit() starting";
    }
    reset();
    m_fit_start_time = QDateTime::currentDateTime();

    // Set up data
    m_predictors = predictors;
    m_dependent_variable = depvar;
    m_p_weights = p_weights;

    // Validate input
    bool ok = true;
    const Eigen::Index n_predictors = nPredictors();
    const Eigen::Index n_observations = nObservations();
    addInfo(QString("Number of observations: %1").arg(n_observations));
    addInfo(QString("Number of predictors: %1").arg(n_predictors));
    if (m_predictors.rows() != n_observations) {  // n
        addError(QString(
                "Mismatch: 'predictors' has %1 rows but 'dependent_variable' "
                "has %2 rows; should match (and be: number of observations)!"
            ).arg(m_predictors.rows()).arg(n_observations));
        ok = false;
    }
    if (m_p_weights && m_p_weights->rows() != n_observations) {
        addError(QString(
                "Mismatch: '*p_weights' has %1 rows but 'dependent_variable' "
                "has %2 rows; should match (and be: number of observations)!"
            ).arg(m_p_weights->rows()).arg(n_observations));
        ok = false;
    }

    // Perform fit
    if (ok) {
        switch (m_solve_method) {
        case SolveMethod::IRLS_KaneLewis:
            fitIRLSKaneLewis();
            break;
        case SolveMethod::IRLS_SVDNewton_KaneLewis:
            fitIRLSSVDNewtonKaneLewis();
            break;
        case SolveMethod::IRLS_R_glmfit:
            fitIRLSRglmfit();
            break;
#ifdef COMPILER_WANTS_DEFAULT_IN_EXHAUSTIVE_SWITCH
        default:
            addError("Unknown solve method!");
            break;
#endif
        }
    }

    m_fit_end_time = QDateTime::currentDateTime();

    // Report any errors
    if (m_verbose && !m_info.isEmpty()) {
        qInfo() << "Info from GLM fit:";
        for (const QString& info : m_info) {
            qInfo().noquote() << "-" << info;
        }
    }
    if (!m_calculation_errors.isEmpty()) {
        qWarning() << "Errors occurred during GLM fit:";
        for (const QString& error : m_calculation_errors) {
            qWarning().noquote() << "-" << error;
        }
    }
    if (!m_fitted) {
        qWarning() << "GLM could not be fitted";
    } else if (!m_converged) {
        qWarning() << "GLM did not converge";
    }
    if (m_verbose) {
        qInfo() << "Glm::fit() finishing";
    }
}


void Glm::fitAddingIntercept(
        const Eigen::MatrixXd& predictors_excluding_intercept,
        const Eigen::VectorXd& dependent_variable)
{
    const MatrixXd predictors = eigenfunc::addOnesAsFirstColumn(
                predictors_excluding_intercept);
    fit(predictors, dependent_variable);
}


// ============================================================================
// Re-retrieve config
// ============================================================================

LinkFunctionFamily Glm::getLinkFunctionFamily() const
{
    return m_link_fn_family;
}


Glm::SolveMethod Glm::getSolveMethod() const
{
    return m_solve_method;
}


int Glm::getMaxIterations() const
{
    return m_max_iterations;
}


double Glm::getTolerance() const
{
    return m_tolerance;
}


Glm::RankDeficiencyMethod Glm::getRankDeficiencyMethod() const
{
    return m_rank_deficiency_method;
}


// ============================================================================
// Design matrix
// ============================================================================

MatrixXd Glm::addInterceptToPredictors(const MatrixXd& x) const
{
    const MatrixXd x_design = eigenfunc::addOnesAsFirstColumn(x);
#ifdef DEBUG_DESIGN_MATRIX
    addInfo("Design matrix: " +
            eigenfunc::qStringFromEigenMatrixOrArray(x_design));
#endif
    return x_design;
}


// ============================================================================
// Re-retrieve input
// ============================================================================

VectorXd Glm::getDependentVariable() const
{
    return m_dependent_variable;
}


MatrixXd Glm::getPredictors() const
{
    return m_predictors;
}


Eigen::VectorXd* Glm::getWeightsPointer() const
{
    return m_p_weights;
}


Eigen::Index Glm::nObservations() const
{
    return m_dependent_variable.rows();
}


Eigen::Index Glm::nPredictors() const
{
    return m_predictors.cols();
}


// ============================================================================
// Get output
// ============================================================================

bool Glm::fitted() const
{
    return m_fitted;
}


bool Glm::converged() const
{
    return m_converged;
}


int Glm::nIterations() const
{
    return m_n_iterations;
}


VectorXd Glm::coefficients() const
{
    return m_coefficients;
}


VectorXd Glm::predict(const MatrixXd& predictors) const
{
    // As per: Y_predicted = invlink(Xb)
    if (!m_fitted || predictors.cols() != nPredictors()) {
        warnReturningGarbage();
        return VectorXd();
    }
    const ArrayXXd eta = predictEta(predictors);
    // y = invlink(eta)
    const ArrayXXd predicted = m_link_fn_family.inv_link_fn(eta);
    return predicted.matrix();
}


VectorXd Glm::predict() const
{
    return predict(m_predictors);
}


Eigen::VectorXd Glm::residuals(const Eigen::MatrixXd& predictors) const
{
    if (!m_fitted || predictors.cols() != nPredictors()) {
        warnReturningGarbage();
        return VectorXd();
    }
    return predict(predictors) - m_dependent_variable;
}


VectorXd Glm::residuals() const
{
    return residuals(m_predictors);
}


Eigen::ArrayXXd Glm::predictEta(const Eigen::MatrixXd& predictors) const
{
    if (!m_fitted || predictors.cols() != nPredictors()) {
        warnReturningGarbage();
        return ArrayXXd();
    }
    // eta = Xb
    return (predictors * m_coefficients).array();
}


Eigen::ArrayXXd Glm::predictEta() const
{
    return predictEta(m_predictors);
}


// ============================================================================
// Dumb stuff
// ============================================================================

VectorXd Glm::retrodictUnivariatePredictor(const VectorXd& y) const
{
    // If there is >1 predictor, this is utterly meaningless.
    if (!m_fitted || m_coefficients.size() != 2) {
        warnReturningGarbage();
        return VectorXd();
    }
    // But on the assumption that the first column of the predictors is an
    // intercept, and the second is a univariate predictor, there is some
    // meaning:
    const double b0 = m_coefficients(0);  // intercept
    const double b1 = m_coefficients(1);  // slope
    // In these circumstances, the GLM is
    //      y = invlink(xb [+ error somewhere]) = invlink(b0 + x * b1)
    //      link(y) = b0 + x * b1
    //      x = (link(y) - b0) / b1
    //      x = (eta - b0) / b1
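    // For example, with a logistic link, x = (logit(y) - b0) / b1 is the
    // predictor value at which the predicted probability is y; at y = 0.5,
    // logit(y) = 0, so x = -b0 / b1 (the classic ED50-style calculation).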
    const ArrayXd eta = m_link_fn_family.link_fn(y.array());
    return (eta - b0) / b1;
}


// ============================================================================
// Get debugging info
// ============================================================================

QStringList Glm::calculationErrors() const
{
    return m_calculation_errors;
}


QStringList Glm::getInfo() const
{
    return m_info;
}


qint64 Glm::timeToFitMs() const
{
    return m_fit_start_time.msecsTo(m_fit_end_time);
}


// ============================================================================
// Internals
// ============================================================================

void Glm::reset()
{
    m_dependent_variable = VectorXd();
    m_predictors = MatrixXd();
    m_p_weights = nullptr;

    m_fitted = false;
    m_converged = false;
    m_n_iterations = 0;
    m_coefficients = VectorXd();

    m_calculation_errors.clear();
    m_info.clear();
    m_fit_start_time = m_fit_end_time = QDateTime();
}


void Glm::warnReturningGarbage() const
{
    QString not_fitted("Not fitted! Returning garbage.");
    qWarning() << not_fitted;
    addError(not_fitted);
}


void Glm::addInfo(const QString& msg) const
{
    m_info.append(msg);
}


void Glm::addError(const QString& msg) const
{
    m_calculation_errors.append(msg);
}


// ============================================================================
// The interesting parts
// ============================================================================

void Glm::fitIRLSKaneLewis()
{
    addInfo("Fitting GLM using iteratively reweighted least squares (IRLS) "
            "estimation");
    // https://bwlewis.github.io/GLM/

    // Renaming:
    // Everyone uses a different notation!
    // Translation table:
    //      Thing   Conventional notation       https://bwlewis.github.io/GLM/
    //      ------------------------------------------------------------------
    //      depvar      Y                               b
    //      predictors  X                               A
    //      coeffs      b                               x
    const MatrixXd& A = m_predictors;   // n,k
    const ArrayXXd& b = m_dependent_variable.array();  // n,1
    const LinkFunctionFamily& family = m_link_fn_family;
    const Eigen::Index n_predictors = nPredictors();
    using statsfunc::svdSolve;

    if (m_p_weights) {
        addError("Warning: weights specified but not supported by "
                 "fitIRLSKaneLewis(); will be IGNORED");
    }

    VectorXd x = VectorXd::Zero(n_predictors);  // k,1
    VectorXd xold = VectorXd::Zero(n_predictors);  // k,1
    for (m_n_iterations = 1;
            m_n_iterations <= m_max_iterations;
            ++m_n_iterations) {
        // Note also, for debugging, that you can inspect matrices, but not
        // arrays, in the Qt debugger.
        const ArrayXXd eta = (A   * x).array();
                           // n,k * k,1  -> n,1
        const ArrayXXd g = family.inv_link_fn(eta);  // apply invlink to eta -> n,1
        const ArrayXXd gprime = family.derivative_inv_link_fn(eta);  // -> n,1
        const ArrayXXd gprime_squared = gprime.square();  // -> n,1
        const VectorXd z = (eta + (b - g) / gprime).matrix();  // n,1
        const ArrayXXd var_g = family.variance_fn(g);
        const MatrixXd W = (gprime_squared / var_g).matrix().asDiagonal();  // n,n
        xold = x;

        // Now the tricky bit.
        // The source has:
        //      Let x[j+1] = (A_T W A)^−1 A_T W z
        // In R, it uses:
        //      x = solve(crossprod(A,W*A), crossprod(A,W*z), tol=2*.Machine$double.eps)
        // R says "solve" solves "a %*% x = b" for x
        // ... i.e.
        //              a * x = b
        //              a_INV * a * x = a_INV * b
        //              x = a_INV * b
        // Therefore, we translate to:
        //      "A" = A_T W A = A.cross(W * A) ?
        //      "b" = A_T W z = A.cross(W * z) ?
        // ... yes, except that we can't use Eigen's "cross" like that; see
        //     instead the preprocessor macro CROSSPROD to create some
        //     shorthand while still allowing compiler optimization for Eigen
        //     code.
        // In Eigen, we solve Ax = b using
        //      x = A.jacobiSvd(options).solve(b)
        // So we end up with:

        x = svdSolve(CROSSPROD(A,     W   * A),
                            // n,k ; (n,n * n,k) -> n,k        --> k,k
                     CROSSPROD(A,     W   * z));
                            // n,k ; (n,n * n,1) -> n,1        --> k,1
        // -> k,1

        double euclidean_norm_of_change = (x - xold).norm();
        // = sqrt(sum of squared values of (x - xold))
        if (euclidean_norm_of_change < m_tolerance) {
            m_converged = true;
            break;
        }
    }

    m_fitted = true;
    m_coefficients = x;  // k,1
}


void Glm::fitIRLSSVDNewtonKaneLewis()
{
    addInfo("Fitting GLM using iteratively reweighted least squares (IRLS) "
            "estimation, SVD (singular value decomposition) Newton variant");
    // https://bwlewis.github.io/GLM/
    // Because of the variability in variable names, for dimensional analysis
    // we'll use nobs, npred.

    // Renaming, as above
    MatrixXd A = m_predictors;  // nobs,npred
    const ArrayXXd& b = m_dependent_variable.array();  // nobs,1
    const LinkFunctionFamily& family = m_link_fn_family;
    const Eigen::Index n_predictors = nPredictors();  // = npred
    const Eigen::Index m = nObservations();  // n (sigh...) = nobs
    const Eigen::Index& n = n_predictors;  // = npred
    using namespace eigenfunc;

    ArrayXd weights(m);  // nobs,1
    // Below, can't use "weights = m_p_weights ? ... : ...", because the
    // resulting template types are not identical. But this works fine:
    if (m_p_weights) {
        weights = m_p_weights->array();
    } else {
        weights = ArrayXd::Ones(m);
    }
    if (weights.rows() != m) {
        addError(QString(
                     "'weights' is of length %1, but should match number of "
                     "observations, %2").arg(
                     QString::number(weights.rows()),
                     QString::number(m)));
        return;
    }

    // If any weights are zero, set corresponding A row to zero
    for (int i = 0; i < m; ++i) {
        if (qFuzzyIsNull(weights(i))) {  // "if (weights(i) == 0)..."
            A.row(i).setConstant(0);
        }
    }

    JacobiSVD<MatrixXd> S = svd(A);
    // In R, the "d" part of an SVD is the vector of singular values; "u" is
    // the matrix of left singular values vectors; "v" is the matrix of right
    // singular values vectors. The original here used S$d.
    // In Eigen, singular values are given by singularValues(), and are always
    // in descending order (I presume they're also in descending order in R!).
    // https://eigen.tuxfamily.org/dox/classEigen_1_1SVDBase.html
    ArrayXd S_d = S.singularValues().array();
    if (S_d.size() == 0) {
        // Before we address d(0)... check it exists!
        addError("Singular values: empty!");
        return;
    }
    IndexArray select_pred_indices = indexSeq(0, n_predictors - 1);
    ArrayXb tiny_singular_values = S_d / S_d(0) < m_tolerance;
    Eigen::Index k = tiny_singular_values.cast<Eigen::Index>().sum();  // number of tiny singular values; ntiny
    if (k > 0) {
        addInfo("Numerically rank-deficient model matrix");
        switch (m_rank_deficiency_method) {
        case RankDeficiencyMethod::SelectColumns:
            addInfo("RankDeficiencyMethod::SelectColumns");
            select_pred_indices = svdsubsel(A, n - k);
            S = svd(subsetByColumnIndex(A, select_pred_indices));
            S_d = S.singularValues().array();  // Since we change S, rewrite S_d
            break;
        case RankDeficiencyMethod::MinimumNorm:
            addInfo("RankDeficiencyMethod::MinimiumNorm");
            // Dealt with at the end; see below
            break;
        case RankDeficiencyMethod::Error:
            addError("Near rank-deficient model matrix");
            return;
#ifdef COMPILER_WANTS_DEFAULT_IN_EXHAUSTIVE_SWITCH
        default:
            addError("Unknown rank deficiency method!");
            return;
#endif
        }
    }

    ArrayXd t = ArrayXd::Zero(m);  // nobs,1  // NB confusing name choice, cf. R's t() for transpose
    MatrixXd s = VectorXd::Zero(select_pred_indices.size());  // npred_unless_subselected,1
    MatrixXd s_old = s;  // npred_unless_subselected,1
    ArrayXb select_pred_bool = selectBoolFromIndices(select_pred_indices, n_predictors);
    ArrayXb good = weights > 0;  // nobs,1
    double two_epsilon = 2.0 * std::numeric_limits<double>::epsilon();

    for (m_n_iterations = 1;
            m_n_iterations <= m_max_iterations;
            ++m_n_iterations) {
        const ArrayXd t_good = subsetByElementBoolean(t, good);  // nobs_where_good,1
        const ArrayXd b_good = subsetByElementBoolean(b, good);  // nobs_where_good,1
        const ArrayXd weights_good = subsetByElementBoolean(weights, good);  // nobs_where_good,1

        const ArrayXd g = family.inv_link_fn(t_good);  // nobs_where_good,1

        const ArrayXd varg = family.variance_fn(g);  // nobs_where_good,1
        if (varg.isNaN().any()) {
            // As per original...
            addError(QString("NAs in variance of the inverse link function "
                             "(iteration %1)").arg(m_n_iterations));
            return;
        }
        // But also (RNC):
        if (varg.isInf().any()) {
            // As per original...
            addError(QString("Infinities in variance of the inverse link "
                             "function (iteration %1)").arg(m_n_iterations));
            return;
        }
        if ((varg == 0).any()) {
            addError(QString("Zero value in variance of the inverse link "
                             "function (iteration %1)").arg(m_n_iterations));
            return;
        }

        const ArrayXd gprime = family.derivative_inv_link_fn(t_good);  // nobs_where_good,1
        if (gprime.isNaN().any()) {
            // As per original...
            addError(QString("NAs in the inverse link function derivative "
                             "(iteration %1)").arg(m_n_iterations));
            return;
        }
        // But also (RNC):
        if (gprime.isInf().any()) {
            // As per original...
            addError(QString("Infinities in the inverse link function "
                             "derivative (iteration %1)").arg(m_n_iterations));
            return;
        }

        ArrayXd z = ArrayXd::Zero(m);  // nobs,1
        ArrayXd W = ArrayXd::Zero(m);  // nobs,1
        ArrayXd to_z_good = t_good + (b_good - g) / gprime;  // nobs_where_good,1
        assignByBooleanSequentially(z, good, to_z_good);
        ArrayXd W_new_good = weights_good * (gprime.square() / varg);  // nobs_where_good,1
        assignByBooleanSequentially(W, good, W_new_good);
        good = W > two_epsilon;
        // --------------------------------------------------------------------
        // NB good changes here; cached versions invalidated
        // --------------------------------------------------------------------
        int n_good = good.cast<int>().sum();
        if (n_good < m) {
            addInfo(QString("Warning: tiny weights encountered (iteration "
                            "%1)").arg(m_n_iterations));
        }
        s_old = s;

        const MatrixXd& S_u = S.matrixU();  // nobs,npred
        const ArrayXXd S_u_good = subsetByRowBoolean(S_u, good);  // nobs_where_good,npred
        // Note that mat[boolvec] gives a 1-d result, whereas
        // mat[boolvec,] gives a 2-d result.
        const ArrayXd W_good = subsetByElementBoolean(W, good);  // nobs_where_good,1
        const ArrayXd z_good = subsetByElementBoolean(z, good);  // nobs_where_good,1
        // Now, about W_good * S_u_good, where S_u_good is e.g. 20x2:
        // In R, if W_good is 20x1, you get a "non-conformable arrays" error,
        // but if W_good is a 20-length vector, it works, applying it across
        // all columns of S_u_good.
        // Let's create a new multiplication function:
        MatrixXd tmp_matrix_to_chol = CROSSPROD(
                    S_u_good,  // nobs_where_good,npred
                    multiply(W_good, S_u_good)  // nobs_where_good,npred
        );  // npred,npred
        MatrixXd C = chol(tmp_matrix_to_chol);  // npred,npred
        MatrixXd tmp_matrix_rhs = CROSSPROD(
                    S_u_good,  // nobs_where_good,npred
                    W_good * z_good  // nobs_where_good,1
        );  // npred,1
        s = forwardsolve(C.transpose(), tmp_matrix_rhs);  // npred,1
        s = backsolve(C, s);  // npred,1
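        // That is, we solve M s = rhs -- where, assuming chol() mirrors R's
        // chol() and returns the upper-triangular factor C with M = C'C --
        // by forward-solving the lower-triangular system C' u = rhs, then
        // back-solving the upper-triangular system C s = u.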

        t = ArrayXd::Zero(m);  // nobs,1
        MatrixXd t_new_good = S_u_good.matrix() * s;  // nobs_where_good,1
        assignByBooleanSequentially(t, good, t_new_good);  // nobs,1

        // Converged?
        double euclidean_norm_of_change = (s - s_old).matrix().norm();
        // = sqrt(sum of squared values of (s - s_old))
        if (euclidean_norm_of_change < m_tolerance) {
            m_converged = true;
            break;
        }
    }

    VectorXd& x = m_coefficients;
    x = VectorXd(n).setConstant(NA);
    if (m_rank_deficiency_method == RankDeficiencyMethod::MinimumNorm) {
        S_d = tiny_singular_values.select(INF, S_d);
    }

    const ArrayXd t_good = subsetByElementBoolean(t, good);  // nobs_where_good,1
    const MatrixXd S_u = S.matrixU();  // nobs,npred
    const MatrixXd S_u_good = subsetByRowBoolean(S_u, good);  // nobs_where_good,npred
    const MatrixXd S_v = S.matrixV();
    const MatrixXd x_possible = S_v * ((1 / S_d) * CROSSPROD(
                                           S_u_good,
                                           t_good).array()).matrix();  // npred,1
    x = select_pred_bool.select(x_possible, x);

    m_fitted = true;
}


eigenfunc::IndexArray Glm::svdsubsel(const MatrixXd& A, Eigen::Index k)
{
    // As per http://bwlewis.github.io/GLM/svdss.html
    // Input:
    //      A: m*p matrix, m >= p
    //      k: number of output columns, k <= p
    // Returns a column array containing the COLUMN INDICES of a subset of
    // the columns of A that *estimates* the k most linearly independent
    // columns of A.
    //
    // Note the differences from the original relating to Eigen being 0-based
    // versus R being 1-based in its indexing.

    using namespace eigenfunc;

    // Input validation as per requirements above:
    Q_ASSERT(A.rows() >= A.cols());
    // ... we will force k, as below

    Eigen::Index index_k = k - 1;
    if (index_k < 0 || index_k >= A.cols()) {
        index_k = A.cols() - 1;
    }
    JacobiSVD<MatrixXd> S = svd(scale(A, false, true));
    ArrayXd d = svd(A).singularValues().array();
    double epsilon = std::numeric_limits<double>::epsilon();
    IndexArray small_sv_indices = which(d < 2.0 * epsilon);
    if (small_sv_indices.size() > 0) {
        Index n = small_sv_indices(0);  // index of first small singular value
        if (index_k >= n) {
            index_k = n - 1;
            addInfo("k was reduced to match the rank of A");
        }
    }
    MatrixXd subsetted = subsetByColumnIndex(S.matrixV(), indexSeq(0, index_k)).transpose();  // k,?
    // The original uses qr(..., LAPACK=TRUE), and R's ?qr says "Using
    // LAPACK... uses column pivoting..." so the Eigen equivalent is probably:
    ColPivHouseholderQR<MatrixXd> Q = subsetted.colPivHouseholderQr();
    // Then, the original uses Q$pivot, which is a list of column indices in
    // R. Thanks to
    // https://stackoverflow.com/questions/26385561/how-to-get-matrix-pivot-index-in-c-eigen
    // we have:

    ArrayXi column_indices_int = Q.colsPermutation().indices();
    IndexArray column_indices = column_indices_int.cast<Index>();
    sort(column_indices);
    return column_indices;
}


void Glm::fitIRLSRglmfit()
{
    addInfo("Fitting GLM using IRLS as implemented by R's glm.fit");

    /*
    In R:
    - ?glm
    - ?glm.control  -- gives default epsilon, maxit
    - ?glm.fit -- default "method" to glm(), and the main fitting function
    */

    using namespace eigenfunc;

    // ------------------------------------------------------------------------
    // Input parameters
    // ------------------------------------------------------------------------

    // Number of observations (values of the dependent variable)
    const Eigen::Index nobs = nObservations();

    // Number of predictor variables (for each observation)
    const Eigen::Index nvars = nPredictors();

    // Predictor values for each y value
    const MatrixXd& x = m_predictors;  // nobs,nvars

    // Dependent variable (observations)
    ArrayXd y = m_dependent_variable.array();  // nobs,1

    // Weights to apply to the dependent variable
    ArrayXd weights(nobs);  // nobs,1
    if (m_p_weights) {
        weights = m_p_weights->array();
    } else {
        // Default weighting: all observations have equal weights
        weights = ArrayXd::Ones(nobs);
    }

    // A priori known component to incorporate in the linear predictor
    ArrayXd offset = ArrayXd::Zero(nobs);  // specifying it is not yet supported

    // Include an intercept term?
    // const bool intercept = true;  // specifying it is not yet supported

    // Link function family (incorporating link function, variance function,
    // etc.)
    const LinkFunctionFamily& family = m_link_fn_family;

    // Shorthands for the various parts of the family:

    // Link function, eta = linkfun(mu)
    const LinkFunctionFamily::LinkFnType linkfun = family.link_fn;

    // Variance function, variance = variance_fn(mu)
    const LinkFunctionFamily::VarianceFnType& variance = family.variance_fn;

    // Inverse link function, mu = linkinv(eta)
    const LinkFunctionFamily::InvLinkFnType& linkinv = family.inv_link_fn;

    // Function to validate eta
    const LinkFunctionFamily::ValidEtaFnType valideta = family.valid_eta_fn;

    // Function to validate mu
    const LinkFunctionFamily::ValidMuFnType validmu = family.valid_mu_fn;

    // Derivative of the inverse link function, d(mu)/d(eta); "mu.eta" in R
    const LinkFunctionFamily::DerivativeInvLinkFnType mu_eta = family.derivative_inv_link_fn;

    // GLM initialization function
    const LinkFunctionFamily::InitializeFnType initialize = family.initialize_fn;

    // Function to calculate the deviance for each observation as a function
    // of (y, mu, wt).
    const LinkFunctionFamily::DevResidsFnType dev_resids = family.dev_resids_fn;

#ifdef LINK_FUNCTION_FAMILY_USE_AIC
    // AIC calculation function
    const LinkFunctionFamily::AICFnType& aic = family.aic_fn;
#endif

    // ------------------------------------------------------------------------
    // Control parameters
    // ------------------------------------------------------------------------

    // Starting values for the parameters in the linear predictor
    ArrayXd start;  // not implemented as a parameter

    // Starting values for the linear predictor
    ArrayXd etastart;  // not implemented as a parameter

    // Starting values for the vector of means
    ArrayXd mustart;  // not implemented as a parameter

    // Maximum number of iterations permitted
    const int& maxit = m_max_iterations;

    // Be verbose?
    const bool& trace = m_verbose;

    // Tolerance (used to determine GLM convergence)
    const double& epsilon = m_tolerance;

    // Tolerance (threshold) for the QR decomposition
    const double qr_tol = std::min(1e-07, epsilon / 1000);

    // ------------------------------------------------------------------------
    // Derived information variables
    // ------------------------------------------------------------------------

    // Does the model have no predictors?
    const bool empty = nvars == 0;

    // ------------------------------------------------------------------------
    // Working variables
    // ------------------------------------------------------------------------

    // Residuals, closely related to (y - mu)
    ArrayXd residuals;  // nobs,1

    // The linear predictor values (in "x space")
    ArrayXd eta;  // nobs,1

    // The means (in "y space")
    ArrayXd mu;  // nobs,1

    // The sum of all deviances
    double dev = 0.0;

    // Array of flags indicating whether each observation is "good" for
    // prediction (e.g. has a weight that is not zero; sometimes other
    // validity checks too).
    ArrayXb good;  // nobs,1

    // w = sqrt( weights * mu'^2 / V(mu) )
    ArrayXd w;  // nobs,1

    // z = (eta - offset) + [(y - mu) / mu']
    ArrayXd z;  // nobs,1

    // Number of trials per observation; set by some families' initialize
    // functions (e.g. binomial)
    ArrayXd n;  // nobs,1

    // Working quantity used by some initialize functions (in R's binomial
    // initialize, m <- weights * y)
    ArrayXd m;  // nobs,1

    // Did the algorithm stop at a boundary value?
    bool boundary = false;

    // ------------------------------------------------------------------------
    // Output variables
    // ------------------------------------------------------------------------

    // Has the model converged?
    bool& conv = m_converged;

    // Number of iterations used
    int& iter = m_n_iterations;

    // Coefficients for the predictors -- what we are trying to fit
    VectorXd& coef = m_coefficients;

    // ------------------------------------------------------------------------
    // Initialize
    // ------------------------------------------------------------------------

    // Initialize as the link family dictates
    if (mustart.size() == 0) {
        initialize(m_calculation_errors, family,
                   y, n, m, weights,
                   start, etastart, mustart);
    } else {
        ArrayXd mukeep(mustart);
        initialize(m_calculation_errors, family,
                   y, n, m, weights,
                   start, etastart, mustart);
        mustart = mukeep;
    }

    // ------------------------------------------------------------------------
    // Main bit
    // ------------------------------------------------------------------------

    if (empty) {
        // No predictors.

        // Set linear predictors.
        eta = offset;
        if (!valideta(eta)) {
            addError("invalid linear predictor values in empty model");
            return;
        }

        // Calculate means from linear predictors
        mu = linkinv(eta);
        if (!validmu(mu)) {
            addError("invalid fitted means in empty model");
            return;
        }

        // Calculate total deviance.
        dev = dev_resids(y, mu, weights).sum();

        // Calculate w = sqrt(weights * mu'^2 / V(mu)), as in the main loop
        // below.
        ArrayXd mu_eta_of_eta = mu_eta(eta);
        w = (
            (weights * mu_eta_of_eta.square()) /
            variance(mu)
        ).sqrt();

        // Calculate residuals.
        residuals = (y - mu) / mu_eta_of_eta;

        // All values are good.
        good = ArrayXb(residuals.size());
        good.setConstant(true);

        // Finished at a boundary value.
        boundary = true;

        // Converged (trivially).
        conv = true;

        // No coefficients.
        coef = VectorXd();

        // No iterations required.
        iter = 0;

    } else {
        // Predictors are present. The normal situation!

        ArrayXd coefold;

        // Deviance on previous iteration.
        double devold;

        // Used to store the results of a least-squares fit operation.
        dqrls::DqrlsResult fit;

        // No prizes for code clarity in R...

        // Set starting values for eta, the linear predictors.
        if (etastart.size() > 0) {
            // The user has given us the starting values.
            eta = etastart;
        } else {
            // Work out some starting values for eta, the linear predictors.
            if (start.size() > 0) {
                // User has given initial coefficients.
                // Use eta = offset + X * initial_coeffs.
                if (start.size() != nvars) {
                    addError(QString(
                                 "length of 'start' should equal %1 and "
                                 "correspond to initial coefs...").arg(nvars));
                    return;
                }
                coefold = start;
                eta = offset + (x * start.matrix()).array();
            } else {
                // Our initialization function will have set mustart.
                // Use eta = link(mustart) as the starting value for eta.
                eta = linkfun(mustart);
            }
        }

        // Set initial values for mu.
        mu = linkinv(eta);

        // Check starting values are OK.
        if (!(validmu(mu) && valideta(eta))) {
            addError("cannot find valid starting values: please specify some");
            return;
        }

        // "Initial" deviance is the deviance based on the starting values of
        // mu.
        devold = dev_resids(y, mu, weights).sum();

        // We've not reached a boundary value (yet).
        boundary = false;

        // We haven't converged yet.
        conv = false;

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // MAIN CALCULATION LOOP
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // Iterate as many times as we're allowed (we'll break if we converge).
        for (iter = 1; iter <= maxit; ++iter) {

            // Checks.
            // For now, "good" means "observation has non-zero weight".
            good = weights > 0;

            // Calculate V(mu).
            ArrayXd varmu = subsetByElementBoolean(variance(mu), good);

            // Check that the variance function, when applied to our current
            // values of mu, hasn't produced silly values. In the R code,
            // varmu, being variance(mu), isn't used for anything else -- but
            // that is because it contains an inefficiency and it recalculates
            // variance(mu) later. Search "glm.fit" code for
            // "variance(mu)[good]" to see this. Note that mu is not
            // recalculated between the two uses.
            if (varmu.isNaN().any()) {
                addError("NAs in V(mu)");
                return;
            }
            if ((varmu == 0).any()) {
                addError("0s in V(mu)");
                return;
            }

            // Calculate d(mu)/d(eta), which we'll refer to as mu'.
            ArrayXd mu_eta_val = mu_eta(eta);
            if (subsetByElementBoolean(mu_eta_val.isNaN(), good).any()) {
                addError("NAs in d(mu)/d(eta)");
                return;
            }

            // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
            // "good" is reset here; don't rely on cached info
            // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
            // Now, we say "good" means "observation has a non-zero weight and
            // a non-zero value of d(mu)/d(eta)".
            good = (weights > 0) && (mu_eta_val != 0);
            if (!good.any()) {
                conv = false;
                addError(QString("no observations informative at iteration "
                                 "%1").arg(iter));
                break;
            }

            // For good values, calculate
            //      z = (eta - offset) + [(y - mu) / mu']
            ArrayXd mu_eta_val_good = subsetByElementBoolean(mu_eta_val, good);
            z = subsetByElementBoolean(eta - offset, good) +
                    subsetByElementBoolean(y - mu, good) /
                    mu_eta_val_good;  // n_good,1

            // For good values, calculate
            //      w = sqrt( weights * mu'^2 / V(mu) )
            w = (
                (subsetByElementBoolean(weights, good) *
                    mu_eta_val_good.square()) /
                varmu
                // R doesn't re-use varmu but recalculates variance(mu)[good],
                // which is less efficient.
                // So we'll use varmu instead of repeating:
                //      subsetByElementBoolean(variance(mu), good)
                // NB: this relies on "good" not having changed since varmu
                // was computed (i.e. no zero values of d(mu)/d(eta) above);
                // otherwise, the dimensions would mismatch.
            ).sqrt();  // n_good,1

            // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
            // Main moment of fitting
            // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

            // x, predictors
            ArrayXXd x_good = subsetByRowBoolean(x, good).array();

            // x * w
            ArrayXXd x_good_times_w = multiply(x_good, w);
            // ... in R, multiplication of a matrix(n_good, nvar) by a vector
            // of length (n_good)

            // Solve an equation of the form XB = Y, for B.
            // Here, we solve (x * w)B = (z * w)
            // The result, B, is returned in fit.coefficients.
            fit = dqrls::Cdqrls(x_good_times_w.matrix(),  // "X"
                                (z * w).matrix(),  // "Y"
                                qr_tol,
                                false);  // check
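            // (Equivalently, a weighted least-squares step: it minimizes
            // sum_i [w_i * (z_i - x_i . b)]^2 over the good observations.)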

            // Checks
            if (!fit.coefficients.array().isFinite().all()) {
                conv = false;
                addError(QString("non-finite coefficients at iteration "
                                 "%1").arg(iter));
                break;
            }
            if (nobs < fit.rank) {
                addError(QString("X matrix has rank %1, but only %2 "
                                 "observation(s)").arg(fit.rank).arg(nobs));
                return;
            }

            // Store our working coefficients in "start".
            //
            // start[fit$pivot] <- fit$coefficients
            // ... fit$pivot contained indices of pivoted columns
            // ... but we're using Eigen::FullPivHouseholderQR to do full
            //     pivoting, i.e. all
            start = fit.coefficients.array();

            // Calculate linear predictors,
            //      eta = X[predictors] * b[coefficients]
            //
            // eta <- drop(x %*% start)
            // ... the drop() bit takes a one-dimensional matrix and makes a vector
            eta = (x * start.matrix()).array();

            // Apply offset to eta.
            //
            // mu <- linkinv(eta <- eta + offset)
            // http://blog.revolutionanalytics.com/2008/12/use-equals-or-arrow-for-assignment.html
            eta = eta + offset;

            // Calculate means from linear predictors.
            mu = linkinv(eta);

            // Calculate deviance, based on the discrepancies between y
            // and mu (potentially weighted).
            dev = dev_resids(y, mu, weights).sum();
            if (trace) {
                addInfo(QString("Deviance = %1 Iterations - %2")
                        .arg(dev).arg(iter));
            }

            // Check validity.
            boundary = false;
            if (!std::isfinite(dev)) {
                // Infinite deviance.
                if (coefold.size() == 0) {
                    addError("no valid set of coefficients has been found: "
                             "please supply starting values");
                    return;
                }
                // Try reducing step size.
                addInfo("step size truncated due to divergence");
                int ii = 1;
                while (!std::isfinite(dev)) {
                    if (ii > maxit) {
                        addError("inner loop 1; cannot correct step size");
                        return;
                    }
                    ii += 1;
                    start = (start + coefold) / 2;
                    eta = (x * start.matrix()).array();
                    eta = eta + offset;
                    mu = linkinv(eta);
                    dev = dev_resids(y, mu, weights).sum();
                }
                boundary = true;
                if (trace) {
                    addInfo(QString("Step halved: new deviance = %1").arg(dev));
                }
            }
            if (!(valideta(eta) && validmu(mu))) {
                // Either the linear predictors or the means are invalid.
                if (coefold.size() == 0) {
                    addError("no valid set of coefficients has been found: "
                             "please supply starting values");
                    return;
                }
                addInfo("step size truncated: out of bounds");
                int ii = 1;
                while (!(valideta(eta) && validmu(mu))) {
                    if (ii > maxit) {
                        addError("inner loop 2; cannot correct step size");
                        return;
                    }
                    ii += 1;
                    start = (start + coefold) / 2;
                    eta = (x * start.matrix()).array();
                    eta = eta + offset;
                    mu = linkinv(eta);
                }
                boundary = true;
                dev = dev_resids(y, mu, weights).sum();
                if (trace) {
                    addInfo(QString("Step halved: new deviance = %1").arg(dev));
                }
            }

            // ----------------------------------------------------------------
            // Converged?
            // ----------------------------------------------------------------
            if (std::abs(dev - devold) / (0.1 + std::abs(dev)) < epsilon) {
                // Deviance (dev) is very close to previous deviance (devold).
                // We have converged.
                conv = true;

                // Store the coefficients in our output variable.
                coef = start;

                // DONE; EXIT LOOP.
                break;
            }

            // Store current deviance as old deviance, for next iteration.
            devold = dev;

            // Copy current coefficients to "coef" and "coefold".
            coef = start;
            coefold = start;
        }

        // Report any problems
        if (!conv) {
            addError("algorithm did not converge");
        }
        if (boundary) {
            addError("algorithm stopped at boundary value");
        }

        // Special checks for mu (predicted y) values for particular
        // distributions.
        const double eps = 10 * std::numeric_limits<double>::epsilon();
        if (family.family_name == LINK_FAMILY_NAME_BINOMIAL) {
            if ((mu > 1 - eps).any() || (mu < eps).any()) {
                addError("warning: fitted probabilities numerically 0 or 1 occurred");
            }
        }
        if (family.family_name == LINK_FAMILY_NAME_POISSON) {
            if ((mu < eps).any()) {
                addError("warning: fitted rates numerically 0 occurred");
            }
        }

        // Rank deficiency: in R, the coefficients of pivoted-out (aliased)
        // columns are set to NA at this point.
        if (fit.rank < nvars) {
            // coef[fit$pivot][seq.int(fit$rank + 1, nvars)] <- NA
            addError("Not sure how to wipe out duff coefficients with full "
                     "pivoting; may be discrepancy with R");
        }

#if 0
        xxnames <- xnames[fit$pivot]
        residuals <- (y - mu)/mu.eta(eta)
        fit$qr <- as.matrix(fit$qr)
        nr <- min(sum(good), nvars)
        if (nr < nvars) {
            Rmat <- diag(nvars)
            Rmat[1L:nr, 1L:nvars] <- fit$qr[1L:nr, 1L:nvars]
        }
        else Rmat <- fit$qr[1L:nvars, 1L:nvars]
        Rmat <- as.matrix(Rmat)
        Rmat[row(Rmat) > col(Rmat)] <- 0
        names(coef) <- xnames
        colnames(fit$qr) <- xxnames
        dimnames(Rmat) <- list(xxnames, xxnames)
#endif

    }

#if 0
    ArrayXd wt = good.select(w.square(), 0);
    double wtdmu = intercept
            ? (weights * y).sum() / weights.sum()
            : linkinv(offset);
    double nulldev = dev_resids(y, wtdmu, weights);
    int n_ok = nobs - (weights == 0).cast<int>().sum();
    int nulldf = n_ok - static_cast<int>(intercept);
    int rank = empty ? 0 : fit.rank;
    int resdef = n_ok - rank;
    double aic_model = aic(y, n, mu, weights, dev) + 2 * rank;
    // skipped: return all the extra results
#endif

    m_fitted = true;
}