15.1.259. tablet_qt/maths/statsfunc.cpp

/*
    Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CamCOPS.

    CamCOPS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CamCOPS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.
*/

#include "statsfunc.h"
#include <cmath>
#include <limits>
#include "maths/eigenfunc.h"
using namespace Eigen;
using eigenfunc::ArrayXb;


// ============================================================================
// Static (file-local) definitions
// ============================================================================
// As per R's family.c:
// - https://github.com/wch/r-source/blob/trunk/src/library/stats/src/family.c

#ifdef STATSFUNC_OFFER_AIC
static const double PI = 3.141592653589793238462643383279502884197169399375;  // as per R's Constants.h
#endif

static const double DOUBLE_EPS = std::numeric_limits<double>::epsilon();
static const double THRESH = 30.;
static const double MTHRESH = -30.;
static const double INVEPS = 1/DOUBLE_EPS;


static inline
double x_d_opx(const double x)
{
    return x / (1 + x);
}


static inline
double x_d_omx(const double x)
{
    return x / (1 - x);
}


static inline
double y_log_y(const double y, const double mu)
{
    return (y != 0.) ? (y * std::log(y / mu)) : 0;
}



namespace statsfunc {

// ============================================================================
// Elementary functions
// ============================================================================

double identity(const double x)
{
    return x;
}


ArrayXXd identityArray(const ArrayXXd& x)
{
    return x;
}


double one(const double x)
{
    // Derivative of the identity function:
    //      y = x
    //      y' = 1
    Q_UNUSED(x)
    return 1.0;
}


ArrayXXd oneArray(const ArrayXXd& x)
{
    ArrayXXd result(x.rows(), x.cols());
    result.setConstant(1.0);
    return result;
}


Eigen::ArrayXXd logArray(const Eigen::ArrayXXd& x)
{
    return x.log();  // or Eigen::log(x)
}


Eigen::ArrayXXd expArray(const Eigen::ArrayXXd& x)
{
    return x.exp();  // or Eigen::exp(x);
}


double logistic(const double x)
{
    // = 1 / (1 + exp(-x))
    // = exp(x) / (1 + exp(x))
    // - The core logistic function, a sigmoid.
    // - Transforms logit -> probability; inverse of logit()
    // - In R's family.c, the equivalent is logit_linkinv().
    // - Curiously, that does exp(x)/(1 + exp(x)), which is mathematically
    //   equivalent but maybe performs better; I shall trust R.
    //   It also checks for numerical limits, as follows.
    const double tmp = x < MTHRESH ? DOUBLE_EPS
                                   : (x > THRESH ? INVEPS : exp(x));
    return x_d_opx(tmp);
}


ArrayXXd logisticArray(const ArrayXXd& x)
{
    return x.unaryExpr(&logistic);
}


double logisticInterceptSlope(const double x,
                              const double intercept, const double slope)
{
    return logistic(intercept + slope * x);
}


double logisticX0K(const double x, const double x0, const double k)
{
    // Generalized logistic function with k steepness, x0 midpoint
    // https://en.wikipedia.org/wiki/Logistic_function
    return logistic(k * (x - x0));

    // HOWEVER, note that there are other formulations of slope/intercept:
    // see e.g. mathfunc::LogisticDescriptives, and as above.
}


double derivativeOfLogistic(const double x)
{
    // = exp(x) / (1 + exp(x))^2
    // = f(x)(1 - f(x))      where f(x) = logistic(x) = 1 / (1 + exp(-x))
    // https://en.wikipedia.org/wiki/Logistic_function#Derivative
    // In R's family.c, logit_mu_eta
    // Let's follow R's method, but improve its sequencing (it calculates opexp
    // when it may ignore the result).
    // Notation: "opexp" = "one plus exp()".
    if (x > THRESH || x < MTHRESH) {
        return DOUBLE_EPS;
    }
    const double opexp = 1 + std::exp(x);
    return std::exp(x) / (opexp * opexp);
}


ArrayXXd derivativeOfLogisticArray(const ArrayXXd& x)
{
    return x.unaryExpr(&derivativeOfLogistic);
}


double logit(const double p)
{
    // = log(p / (1 - p))
    // - Transforms probability -> logit; inverse of logistic()
    // - https://en.wikipedia.org/wiki/Logit
    // - Uses natural logs (std::log).
    // - In R's family.c, the equivalent is logit_link().
    return std::log(x_d_omx(p));
}


ArrayXXd logitArray(const ArrayXXd& p)
{
    return p.unaryExpr(&logit);
}


bool alwaysTrue(const ArrayXd& x)
{
    Q_UNUSED(x)
    return true;
}


bool allInteger(const Eigen::ArrayXd& x, double threshold)
{
    return ((x - x.round()).abs() <= threshold).all();
    //       ^^^^^^^^^^^^^
    //       non-integer part
    //      ^^^^^^^^^^^^^^^^^^^^^
    //      absolute non-integer part
}


// ============================================================================
// Functions for specific GLM families
// ============================================================================

// ----------------------------------------------------------------------------
// binomial
// ----------------------------------------------------------------------------

ArrayXXd binomialVariance(const ArrayXXd& mu)
{
    // - R: binomial()$variance
    // - https://en.wikipedia.org/wiki/Variance_function#Example_.E2.80.93_Bernoulli
    return mu * (1.0 - mu);
}


ArrayXd binomialDevResids(const ArrayXd& y,
                          const ArrayXd& mu,
                          const ArrayXd& wt)
{
    // R: binomial_dev_resids() in src/library/stats/src/family.c
    // ... but it's much simpler in Eigen! At least if you assume conformable
    // arrays. That means "lmu > 1" and "lwt > 1", effectively.
    const Index n = y.size();
    ArrayXd yly_y(n);
    ArrayXd yly_omy(n);
    for (Index i = 0; i < n; ++i) {
        const double y_i = y(i);
        const double mu_i = mu(i);
        yly_y(i)   = y_log_y(    y_i,     mu_i);
        yly_omy(i) = y_log_y(1 - y_i, 1 - mu_i);
    }

    return 2 * wt * (yly_y + yly_omy);
}


bool binomialValidMu(const ArrayXd& mu)
{
    // R: binomial()$validmu
    return mu.isFinite().all() && (mu > 0 && mu < 1).all();
}


bool binomialInitialize(QStringList& errors,
                        const LinkFunctionFamily& family,
                        Eigen::ArrayXd& y,
                        Eigen::ArrayXd& n,
                        Eigen::ArrayXd& m,
                        Eigen::ArrayXd& weights,
                        Eigen::ArrayXd& start,
                        Eigen::ArrayXd& etastart,
                        Eigen::ArrayXd& mustart)
{
    // returns: OK?
    // R: binomial()$initialize
    Q_UNUSED(family)
    Q_UNUSED(start)
    Q_UNUSED(etastart)

    const Index ncol_y = y.cols();
    const Index nobs = y.size();

    if (ncol_y == 1) {
        // NOT HANDLED: factors
        n = ArrayXd::Ones(nobs);
        y = (weights == 0).select(0, y);
        if ((y < 0 || y > 1).any()) {
            errors.append("y values must be 0 <= y <= 1");
            return false;
        }
        mustart = (weights * y + 0.5) / (weights + 1);
        m = weights * y;
        if (!allInteger(m)) {
            errors.append("warning: non-integer #successes in a binomial glm!");
            // ... but continue
        }
    } else if (ncol_y == 2) {
        if (!allInteger(y)) {
            errors.append("warning: non-integer counts in a binomial glm!");
            // ... but continue
        }
        n = y.col(0) + y.col(1);
        y = (n == 0).select(0, y.col(0) / n);
        weights = weights * n;
        mustart = (n * y + 0.5) / (n + 1);
    } else {
        errors.append(
                    "for the 'binomial' family, y must be a vector of 0 and "
                    "1's or a 2 column matrix where col 1 is no. successes and "
                    "col 2 is no. failures [THOUGH probably not all of that "
                    "implemented!]");
        return false;
    }

    return true;
}


#ifdef STATSFUNC_OFFER_AIC
double dbinom(const double x,
              const int n, const double p, const bool log)
{
    // As per R's dbinom.c
}


ArrayXd dbinom(const ArrayXd& x, const ArrayXi& n, const ArrayXd& p,
               const bool log)
{
    // R recycles arguments, I think; we'll ignore that for now.
    Q_ASSERT(n.size() == x.size());
    Q_ASSERT(p.size() == x.size());
    const int len = x.size();
    ArrayXd d(len);
    for (int i = 0; i < len; ++i) {
        d[i] = dbinom(x[i], n[i], p[i], log);
    }
}


double binomialAIC(const ArrayXd& y,
                   const ArrayXd& n,
                   const ArrayXd& mu,
                   const ArrayXd& wt,
                   const double dev)
{
    // R: binomial()$aic
    const int nobs = y.size();

    ArrayXd m(nobs);
    if ((n > 1).any()) {
        m = n;
    } else {
        m = wt;
    }

    // -2 * sum(ifelse(m > 0,
    //                 (wt/m),
    //                 0)      * dbinom(round(m * y),   // x
    //                                  round(m),       // size
    //                                  mu,             // prob
    //                                  log = TRUE)   )

    ArrayXd wt_over_m = (m > 0).select(wt / m, 0);
    ArrayXd binom_dens = dbinom((m * y).round(), m.round(), mu, true);
    return -2 * (wt_over_m * binom_dens).sum();
}
#endif


// ----------------------------------------------------------------------------
// gaussian
// ----------------------------------------------------------------------------

ArrayXd gaussianDevResids(const ArrayXd& y,
                          const ArrayXd& mu,
                          const ArrayXd& wt)
{
    // R: gaussian()$dev.resids
    return wt * ((y - mu).square());
}


bool gaussianInitialize(QStringList& errors,
                        const LinkFunctionFamily& family,
                        Eigen::ArrayXd& y,
                        Eigen::ArrayXd& n,
                        Eigen::ArrayXd& m,
                        Eigen::ArrayXd& weights,
                        Eigen::ArrayXd& start,
                        Eigen::ArrayXd& etastart,
                        Eigen::ArrayXd& mustart)
{
    // returns: OK?
    // R: gaussian()$initialize

    Q_UNUSED(errors)
    Q_UNUSED(family)
    Q_UNUSED(m)
    Q_UNUSED(weights)
    Q_UNUSED(start)
    Q_UNUSED(etastart)

    // NOT IMPLEMENTED: some other options for inverse/log links; q.v.

    const Index nobs = y.size();
    n = ArrayXd::Ones(nobs);
    mustart = y;

    return true;
}


#ifdef STATSFUNC_OFFER_AIC

double gaussianAIC(const ArrayXd& y,
                   const ArrayXd& n,
                   const ArrayXd& mu,
                   const ArrayXd& wt,
                   const double dev)
{
    // R: gaussian()$aic
    Q_UNUSED(n)
    Q_UNUSED(mu)
    const Index nobs = y.size();
    return nobs * (std::log(dev / nobs * 2 * PI) + 1) + 2 - wt.log().sum();
}

#endif


// ----------------------------------------------------------------------------
// poisson
// ----------------------------------------------------------------------------

bool poissonValidMu(const Eigen::ArrayXd& mu)
{
    return mu.isFinite().all() && (mu > 0).all();
}


ArrayXd poissonDevResids(const ArrayXd& y,
                         const ArrayXd& mu,
                         const ArrayXd& wt)
{
    // R: poisson()$dev.resids
    // Original:
    //      r <- mu * wt
    //      p <- which(y > 0)
    //      r[p] <- (wt * (y * log(y/mu) - (y - mu)))[p]
    //      2 * r

    const ArrayXb p = y > 0;
    const ArrayXd r = p.select(
        // where p true (y > 0):
        wt * (y * log(y / mu) - (y - mu)),
        // where p false (y = 0):
        wt * mu
    );
    return 2 * r;
}


bool poissonInitialize(QStringList& errors,
                       const LinkFunctionFamily& family,
                       Eigen::ArrayXd& y,
                       Eigen::ArrayXd& n,
                       Eigen::ArrayXd& m,
                       Eigen::ArrayXd& weights,
                       Eigen::ArrayXd& start,
                       Eigen::ArrayXd& etastart,
                       Eigen::ArrayXd& mustart)
{
    // R: poisson()$initialize

    Q_UNUSED(errors)
    Q_UNUSED(family)
    Q_UNUSED(m)
    Q_UNUSED(weights)
    Q_UNUSED(start)
    Q_UNUSED(etastart)

    if ((y < 0).any()) {
        qWarning() << "negative values not allowed for the 'Poisson' family";
        return false;
    }

    const Index nobs = y.size();
    n = ArrayXd::Ones(nobs);
    mustart = y + 0.1;

    return true;
}


// ============================================================================
// Solving
// ============================================================================

VectorXd svdSolve(const MatrixXd& A, const VectorXd& b)
{
    // solves Ax = b [or b = Ax + e], for x, minimizing e (in a least-squares sense)
    // https://eigen.tuxfamily.org/dox/group__LeastSquares.html
    return A.jacobiSvd(ComputeThinU | ComputeThinV).solve(b);
}


}  // namespace statsfunc