Source code for pyglmnet.pyglmnet

"""Python implementation of elastic-net regularized GLMs."""

import warnings
from copy import deepcopy

import numpy as np
from scipy.special import expit
from scipy.stats import norm
from .utils import logger, set_log_level, _check_params
from .base import BaseEstimator, is_classifier, check_version


ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson',
                  'probit', 'gamma']


def _probit_g1(z, pdfz, cdfz, thresh=5):
    """log(cdf(z)), using asymptotic approximations when |z| > thresh."""
    res = np.zeros_like(z)
    res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh])
    res[np.abs(z) <= thresh] = np.log(cdfz[np.abs(z) <= thresh])
    res[z > thresh] = -pdfz[z > thresh] / z[z > thresh]
    return res


def _probit_g2(z, pdfz, cdfz, thresh=5):
    """log(1 - cdf(z)), using asymptotic approximations when |z| > thresh."""
    res = np.zeros_like(z)
    res[z < -thresh] = pdfz[z < -thresh] / z[z < -thresh]
    res[np.abs(z) <= thresh] = np.log(1 - cdfz[np.abs(z) <= thresh])
    res[z > thresh] = np.log(pdfz[z > thresh] / z[z > thresh])
    return res


def _probit_g3(z, pdfz, cdfz, thresh=5):
    """pdf(z) / cdf(z), evaluated stably when |z| > thresh."""
    res = np.zeros_like(z)
    res[z < -thresh] = -z[z < -thresh]
    res[np.abs(z) <= thresh] = \
        pdfz[np.abs(z) <= thresh] / cdfz[np.abs(z) <= thresh]
    res[z > thresh] = pdfz[z > thresh]
    return res


def _probit_g4(z, pdfz, cdfz, thresh=5):
    """pdf(z) / (1 - cdf(z)), evaluated stably when |z| > thresh."""
    res = np.zeros_like(z)
    res[z < -thresh] = pdfz[z < -thresh]
    res[np.abs(z) <= thresh] = \
        pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])
    res[z > thresh] = z[z > thresh]
    return res


def _probit_g5(z, pdfz, cdfz, thresh=5):
    """z * pdf / cdf + (pdf / cdf) ** 2, thresholded for numerical
    stability."""
    res = np.zeros_like(z)
    res[z < -thresh] = 0 * z[z < -thresh]
    res[np.abs(z) <= thresh] = \
        z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \
        cdfz[np.abs(z) <= thresh] + (pdfz[np.abs(z) <= thresh] /
                                     cdfz[np.abs(z) <= thresh]) ** 2
    res[z > thresh] = z[z > thresh] * pdfz[z > thresh] + pdfz[z > thresh] ** 2
    return res


def _probit_g6(z, pdfz, cdfz, thresh=5):
    """(pdf / (1 - cdf)) ** 2 - z * pdf / (1 - cdf), thresholded for
    numerical stability."""
    res = np.zeros_like(z)
    res[z < -thresh] = \
        pdfz[z < -thresh] ** 2 - z[z < -thresh] * pdfz[z < -thresh]
    res[np.abs(z) <= thresh] = \
        (pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])) ** 2 - \
        z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \
        (1 - cdfz[np.abs(z) <= thresh])
    res[z > thresh] = 0 * z[z > thresh]
    return res


def _z(beta0, beta, X, fit_intercept):
    """Compute z to be passed through non-linearity"""
    if fit_intercept:
        z = beta0 + np.dot(X, beta)
    else:
        z = np.dot(X, np.r_[beta0, beta])
    return z


def _lmb(distr, beta0, beta, X, eta, fit_intercept=True):
    """Conditional intensity function."""
    z = _z(beta0, beta, X, fit_intercept)
    return _mu(distr, z, eta, fit_intercept)


def _mu(distr, z, eta, fit_intercept):
    """The non-linearity (inverse link)."""
    if distr in ['softplus', 'gamma']:
        mu = np.log1p(np.exp(z))
    elif distr == 'poisson':
        mu = z.copy()
        # exp() is linearized above eta; this constant keeps mu continuous
        # at z = eta when an intercept is fit
        beta0 = (1 - eta) * np.exp(eta) if fit_intercept else 0.
        mu[z > eta] = z[z > eta] * np.exp(eta) + beta0
        mu[z <= eta] = np.exp(z[z <= eta])
    elif distr == 'gaussian':
        mu = z
    elif distr == 'binomial':
        mu = expit(z)
    elif distr == 'probit':
        mu = norm.cdf(z)
    return mu
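
# An illustrative check (not part of the library): for the Poisson case,
# _mu replaces exp(z) by a linear function above ``eta`` but stays
# continuous at z = eta.  For example, with eta = 2.0:
#
#     >>> import numpy as np
#     >>> z = np.array([1.5, 2.0, 2.5])
#     >>> _mu('poisson', z, eta=2.0, fit_intercept=True)
#
# returns exp(1.5) and exp(2.0) for the first two entries, and the
# linearized value 2.5 * exp(2.0) + (1 - 2.0) * exp(2.0) = 1.5 * exp(2.0)
# for the last entry.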


def _grad_mu(distr, z, eta):
    """Derivative of the non-linearity."""
    if distr in ['softplus', 'gamma']:
        grad_mu = expit(z)
    elif distr == 'poisson':
        grad_mu = z.copy()
        grad_mu[z > eta] = np.ones_like(z)[z > eta] * np.exp(eta)
        grad_mu[z <= eta] = np.exp(z[z <= eta])
    elif distr == 'gaussian':
        grad_mu = np.ones_like(z)
    elif distr == 'binomial':
        grad_mu = expit(z) * (1 - expit(z))
    elif distr == 'probit':
        grad_mu = norm.pdf(z)
    return grad_mu


def _logL(distr, y, y_hat, z=None):
    """The log likelihood."""
    if distr in ['softplus', 'poisson']:
        eps = np.spacing(1)
        logL = np.sum(y * np.log(y_hat + eps) - y_hat)
    elif distr == 'gaussian':
        logL = -0.5 * np.sum((y - y_hat)**2)
    elif distr == 'binomial':

        # prevents underflow
        if z is not None:
            logL = np.sum(y * z - np.log(1 + np.exp(z)))
        # for scoring
        else:
            logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    elif distr == 'probit':
        if z is not None:
            pdfz, cdfz = norm.pdf(z), norm.cdf(z)
            logL = np.sum(y * _probit_g1(z, pdfz, cdfz) +
                          (1 - y) * _probit_g2(z, pdfz, cdfz))
        else:
            logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    elif distr == 'gamma':
        # see
        # https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf
        nu = 1.  # shape parameter, exponential for now
        logL = np.sum(nu * (-y / y_hat - np.log(y_hat)))
    return logL
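
# Quick consistency check (illustrative, not part of the library): for the
# binomial case the two code paths above agree, since
# y * z - log(1 + exp(z)) equals y * log(p) + (1 - y) * log(1 - p) with
# p = expit(z):
#
#     >>> import numpy as np
#     >>> z = np.array([-1., 0., 2.])
#     >>> y = np.array([0., 1., 1.])
#     >>> np.allclose(_logL('binomial', y, expit(z)),
#     ...             _logL('binomial', y, expit(z), z))
#     True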


def _penalty(alpha, beta, Tau, group):
    """The penalty."""
    # Combine L1 and L2 penalty terms
    P = 0.5 * (1 - alpha) * _L2penalty(beta, Tau) + \
        alpha * _L1penalty(beta, group)
    return P


def _L2penalty(beta, Tau):
    """The L2 penalty."""
    # Compute the L2 penalty
    if Tau is None:
        # Ridge-like penalty
        L2penalty = np.linalg.norm(beta, 2) ** 2
    else:
        # Tikhonov penalty
        if (Tau.shape[0] != beta.shape[0] or
                Tau.shape[1] != beta.shape[0]):
            raise ValueError('Tau should be (n_features x n_features)')
        else:
            L2penalty = np.linalg.norm(np.dot(Tau, beta), 2) ** 2
    return L2penalty


def _L1penalty(beta, group=None):
    """The L1 penalty."""
    # Compute the L1 penalty
    if group is None:
        # Lasso-like penalty
        L1penalty = np.linalg.norm(beta, 1)
    else:
        # Group sparsity case: apply group sparsity operator
        group_ids = np.unique(group)
        L1penalty = 0.0
        for group_id in group_ids:
            if group_id != 0:
                L1penalty += \
                    np.linalg.norm(beta[group == group_id], 2)
        L1penalty += np.linalg.norm(beta[group == 0], 1)
    return L1penalty
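
# A small worked example (illustrative, not part of the library): with
# alpha = 0.5, no Tikhonov matrix and no groups, the elastic-net penalty
# reduces to 0.5 * (1 - alpha) * ||beta||_2^2 + alpha * ||beta||_1.
# For beta = [1., -2., 0.] that is 0.25 * 5 + 0.5 * 3 = 2.75:
#
#     >>> import numpy as np
#     >>> _penalty(0.5, np.array([1., -2., 0.]), None, None)  # -> 2.75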


def _loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta,
          fit_intercept=True):
    """Define the objective function for elastic net."""
    n_samples, n_features = X.shape
    z = _z(beta[0], beta[1:], X, fit_intercept)
    y_hat = _mu(distr, z, eta, fit_intercept)
    L = 1. / n_samples * _logL(distr, y, y_hat, z)
    if fit_intercept:
        P = _penalty(alpha, beta[1:], Tau, group)
    else:
        P = _penalty(alpha, beta, Tau, group)
    J = -L + reg_lambda * P
    return J


def _L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta,
            fit_intercept=True):
    """Define the objective function for elastic net."""
    n_samples, n_features = X.shape
    z = _z(beta[0], beta[1:], X, fit_intercept)
    y_hat = _mu(distr, z, eta, fit_intercept)
    L = 1. / n_samples * _logL(distr, y, y_hat, z)
    if fit_intercept:
        P = 0.5 * (1 - alpha) * _L2penalty(beta[1:], Tau)
    else:
        P = 0.5 * (1 - alpha) * _L2penalty(beta, Tau)
    J = -L + reg_lambda * P
    return J


def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta,
                 fit_intercept=True):
    """The gradient."""
    n_samples, n_features = X.shape
    n_samples = float(n_samples)

    if Tau is None:
        if fit_intercept:
            Tau = np.eye(beta[1:].shape[0])
        else:
            Tau = np.eye(beta.shape[0])
    InvCov = np.dot(Tau.T, Tau)

    z = _z(beta[0], beta[1:], X, fit_intercept)
    mu = _mu(distr, z, eta, fit_intercept)
    grad_mu = _grad_mu(distr, z, eta)

    grad_beta0 = 0.
    if distr in ['poisson', 'softplus']:
        if fit_intercept:
            grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu)
        grad_beta = ((np.dot(grad_mu.T, X) -
                      np.dot((y * grad_mu / mu).T, X)).T)

    elif distr == 'gaussian':
        if fit_intercept:
            grad_beta0 = np.sum((mu - y) * grad_mu)
        grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T

    elif distr == 'binomial':
        if fit_intercept:
            grad_beta0 = np.sum(mu - y)
        grad_beta = np.dot((mu - y).T, X).T

    elif distr == 'probit':
        # for probit, grad_mu is norm.pdf(z) and mu is norm.cdf(z)
        grad_logl = (y * _probit_g3(z, grad_mu, mu) -
                     (1 - y) * _probit_g4(z, grad_mu, mu))
        if fit_intercept:
            grad_beta0 = -np.sum(grad_logl)
        grad_beta = -np.dot(grad_logl.T, X).T

    elif distr == 'gamma':
        nu = 1.
        grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu
        if fit_intercept:
            grad_beta0 = -nu * np.sum(grad_logl)
        grad_beta = -nu * np.dot(grad_logl.T, X).T

    grad_beta0 *= 1. / n_samples
    grad_beta *= 1. / n_samples
    if fit_intercept:
        grad_beta += reg_lambda * (1 - alpha) * np.dot(InvCov, beta[1:])
        g = np.zeros((n_features + 1, ))
        g[0] = grad_beta0
        g[1:] = grad_beta
    else:
        grad_beta += reg_lambda * (1 - alpha) * np.dot(InvCov, beta)
        g = grad_beta

    return g
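
# One way to sanity-check this gradient (an illustrative sketch, not part
# of the library) is to compare it against a finite-difference estimate of
# _L2loss using scipy.optimize.check_grad; the returned discrepancy should
# be close to zero:
#
#     >>> import numpy as np
#     >>> from scipy.optimize import check_grad
#     >>> rng = np.random.RandomState(0)
#     >>> X = rng.normal(0., 1., (50, 3))
#     >>> y = rng.poisson(1., 50).astype(float)
#     >>> loss = lambda b: _L2loss('poisson', 0.5, None, 0.1,
#     ...                          X, y, 2.0, None, b)
#     >>> grad = lambda b: _grad_L2loss('poisson', 0.5, None, 0.1,
#     ...                               X, y, 2.0, b)
#     >>> check_grad(loss, grad, np.zeros(4))  # small, indicating agreement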


def _gradhess_logloss_1d(distr, xk, y, z, eta, fit_intercept=True):
    """
    Compute gradient (1st derivative)
    and Hessian (2nd derivative)
    of log likelihood for a single coordinate.

    Parameters
    ----------
    xk: array
        A single column of the design matrix (or a column of ones for the
        intercept), shape (n_samples,)
    y: array
        Target values, shape (n_samples,)
    z: array
        Current linear predictor, shape (n_samples,)

    Returns
    -------
    gk: float
        Gradient of the log likelihood with respect to coordinate k.
    hk: float
        Second derivative (Hessian) of the log likelihood with respect to
        coordinate k.
    """
    n_samples = xk.shape[0]

    if distr == 'softplus':
        mu = _mu(distr, z, eta, fit_intercept)
        s = expit(z)
        gk = np.sum(s * xk) - np.sum(y * s / mu * xk)

        grad_s = s * (1 - s)
        grad_s_by_mu = grad_s / mu - s / (mu ** 2)
        hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2)

    elif distr == 'poisson':
        mu = _mu(distr, z, eta, fit_intercept)
        s = expit(z)
        gk = np.sum((mu[z <= eta] - y[z <= eta]) *
                    xk[z <= eta]) + \
            np.exp(eta) * \
            np.sum((1 - y[z > eta] / mu[z > eta]) *
                   xk[z > eta])
        hk = np.sum(mu[z <= eta] * xk[z <= eta] ** 2) + \
            np.exp(eta) ** 2 * \
            np.sum(y[z > eta] / (mu[z > eta] ** 2) *
                   (xk[z > eta] ** 2))

    elif distr == 'gaussian':
        gk = np.sum((z - y) * xk)
        hk = np.sum(xk * xk)

    elif distr == 'binomial':
        mu = _mu(distr, z, eta, fit_intercept)
        gk = np.sum((mu - y) * xk)
        hk = np.sum(mu * (1.0 - mu) * xk * xk)

    elif distr == 'probit':
        pdfz = norm.pdf(z)
        cdfz = norm.cdf(z)
        gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) -
                      (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk)
        hk = np.sum((y * _probit_g5(z, pdfz, cdfz) +
                     (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk))

    elif distr == 'gamma':
        raise NotImplementedError('cdfast is not implemented for Gamma '
                                  'distribution')

    return 1. / n_samples * gk, 1. / n_samples * hk
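
# These per-coordinate derivatives drive the 'cdfast' solver: one Newton
# update for coordinate k is beta[k] <- beta[k] - gk / hk, with the
# regularization terms added in GLM._cdfast.  A minimal sketch of a single
# step (illustrative, not part of the library):
#
#     >>> import numpy as np
#     >>> rng = np.random.RandomState(0)
#     >>> X = rng.normal(0., 1., (100, 2))
#     >>> y = rng.binomial(1, 0.5, 100).astype(float)
#     >>> z = np.zeros(100)  # linear predictor at beta = 0
#     >>> gk, hk = _gradhess_logloss_1d('binomial', X[:, 0], y, z, 2.0)
#     >>> step = gk / hk  # unregularized Newton step for the first feature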


def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None,
                 sample=False):
    """Simulate target data under a generative model.

    Parameters
    ----------
    distr: str
        distribution
    beta0: float
        intercept coefficient
    beta: array
        coefficients of shape (n_features,)
    X: array
        design matrix of shape (n_samples, n_features)
    eta: float
        parameter for poisson non-linearity
    random_state: int | None
        seed for the random number generator
    sample: bool
        If True, sample from distribution. Otherwise, return
        conditional intensity function

    Returns
    -------
    y: array
        simulated target data of shape (n_samples,)
    """
    if distr not in ALLOWED_DISTRS:
        raise ValueError("'distr' must be in %s, got %s"
                         % (repr(ALLOWED_DISTRS), distr))

    if not isinstance(beta0, float):
        raise ValueError("'beta0' must be float, got %s" % type(beta0))

    if beta.ndim != 1:
        raise ValueError("'beta' must be 1D, got %dD" % beta.ndim)

    if not sample:
        return _lmb(distr, beta0, beta, X, eta)

    _random_state = np.random.RandomState(random_state)
    if distr == 'softplus' or distr == 'poisson':
        y = _random_state.poisson(_lmb(distr, beta0, beta, X, eta))
    if distr == 'gaussian':
        y = _random_state.normal(_lmb(distr, beta0, beta, X, eta))
    if distr == 'binomial' or distr == 'probit':
        y = _random_state.binomial(1, _lmb(distr, beta0, beta, X, eta))
    if distr == 'gamma':
        mu = _lmb(distr, beta0, beta, X, eta)
        y = np.exp(mu)
    return y
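
# Example usage (an illustrative sketch, not part of the library): simulate
# Poisson counts from known weights, then recover them with the GLM class
# defined below:
#
#     >>> import numpy as np
#     >>> rng = np.random.RandomState(0)
#     >>> X = rng.normal(0., 1., (1000, 5))
#     >>> beta0, beta = 0.1, rng.normal(0., 0.5, 5)
#     >>> y = simulate_glm('poisson', beta0, beta, X, sample=True,
#     ...                  random_state=42)
#     >>> glm = GLM(distr='poisson', reg_lambda=0.01).fit(X, y)
#     >>> glm.beta_  # should roughly match beta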


class GLM(BaseEstimator):
    """Class for estimating regularized generalized linear models (GLM).

    The regularized GLM minimizes the penalized negative log likelihood:

    .. math::

        \\min_{\\beta_0, \\beta} \\frac{1}{N}
        \\sum_{i = 1}^N \\mathcal{L} (y_i, \\beta_0 + \\beta^T x_i)
        + \\lambda [ \\frac{1}{2}(1 - \\alpha) \\mathcal{P}_2 +
                    \\alpha \\mathcal{P}_1 ]

    where :math:`\\mathcal{P}_2` and :math:`\\mathcal{P}_1` are the
    generalized L2 (Tikhonov) and generalized L1 (Group Lasso) penalties,
    given by:

    .. math::

        \\mathcal{P}_2 = \\|\\Gamma \\beta \\|_2^2 \\
        \\mathcal{P}_1 = \\sum_g \\|\\beta_{j,g}\\|_2

    where :math:`\\Gamma` is the Tikhonov matrix: a square factorization
    of the inverse covariance matrix and :math:`\\beta_{j,g}` is the
    :math:`j` th coefficient of group :math:`g`.

    The generalized L2 penalty defaults to the ridge penalty when
    :math:`\\Gamma` is identity.

    The generalized L1 penalty defaults to the lasso penalty when each
    :math:`\\beta` belongs to its own group.

    Parameters
    ----------
    distr: str
        distribution family can be one of the following
        'gaussian' | 'binomial' | 'poisson' | 'softplus' | 'probit' | 'gamma'
        default: 'poisson'.
    alpha: float
        the weighting between L1 penalty and L2 penalty term
        of the loss function.
        default: 0.5
    Tau: array | None
        the (n_features, n_features) Tikhonov matrix.
        default: None, in which case Tau is identity
        and the L2 penalty is ridge-like
    group: array | list | None
        the (n_features,) list or array of group identities for each
        parameter :math:`\\beta`.
        Each entry of the list/array should contain an int from 1 to
        n_groups that specifies group membership for each parameter
        (except :math:`\\beta_0`). If you do not want to specify a group
        for a specific parameter, set it to zero.
        default: None, in which case it defaults to L1 regularization
    reg_lambda: float
        regularization parameter :math:`\\lambda` of penalty term.
        default: 0.1
    solver: str
        optimization method, can be one of the following
        'batch-gradient' (vanilla batch gradient descent)
        'cdfast' (Newton coordinate gradient descent).
        default: 'batch-gradient'
    learning_rate: float
        learning rate for gradient descent.
        default: 2e-1
    max_iter: int
        maximum iterations for the model.
        default: 1000
    tol: float
        convergence threshold or stopping criteria.
        Optimization loop will stop when norm(gradient) is below
        the threshold.
        default: 1e-6
    eta: float
        a threshold parameter that linearizes the exp() function above eta.
        default: 2.0
    score_metric: str
        specifies the scoring metric.
        one of either 'deviance' or 'pseudo_R2'.
        default: 'deviance'
    fit_intercept: boolean
        specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
        default: True
    random_state: int
        seed of the random number generator used to initialize the solution.
        default: 0
    callback: callable | None
        a function of a single argument (the current beta), called after
        each iteration of the solver.
        default: None
    verbose: boolean or int
        default: False

    Attributes
    ----------
    beta0_: float
        The intercept
    beta_: array, (n_features)
        The learned betas
    n_iter_: int
        The number of iterations

    Examples
    --------
    >>> import numpy as np
    >>> random_state = 1
    >>> n_samples, n_features = 100, 4
    >>> rng = np.random.RandomState(random_state)
    >>> X = rng.normal(0, 1, (n_samples, n_features))
    >>> y = 2.2 * X[:, 0] - 1.0 * X[:, 1] + 0.3 * X[:, 3] + 1.0
    >>> glm = GLM(distr='gaussian', verbose=False, random_state=random_state)
    >>> glm = glm.fit(X, y)
    >>> glm.beta0_  # The intercept
    1.005380485553247
    >>> glm.beta_  # The coefficients
    array([ 1.90216711, -0.78782533, -0.        ,  0.03227455])
    >>> y_pred = glm.predict(X)

    Reference
    ---------
    Friedman, Hastie, Tibshirani (2010). Regularization Paths for
    Generalized Linear Models via Coordinate Descent, J Statistical Software.
    https://core.ac.uk/download/files/153/6287975.pdf
    """

    def __init__(self, distr='poisson', alpha=0.5,
                 Tau=None, group=None,
                 reg_lambda=0.1, solver='batch-gradient',
                 learning_rate=2e-1, max_iter=1000,
                 tol=1e-6, eta=2.0, score_metric='deviance',
                 fit_intercept=True, random_state=0,
                 callback=None, verbose=False):

        _check_params(distr=distr, max_iter=max_iter,
                      fit_intercept=fit_intercept)
        self.distr = distr
        self.alpha = alpha
        self.reg_lambda = reg_lambda
        self.Tau = Tau
        self.group = group
        self.solver = solver
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.beta0_ = None
        self.beta_ = None
        self.ynull_ = None
        self.n_iter_ = 0
        self.tol = tol
        self.eta = eta
        self.score_metric = score_metric
        self.fit_intercept = fit_intercept
        self.random_state = random_state
        self.rng = np.random.RandomState(self.random_state)
        self.callback = callback
        self.verbose = verbose
        set_log_level(verbose)

    def _set_cv(cv, estimator=None, X=None, y=None):
        """Set the default CV depending on whether clf
           is classifier/regressor."""
        # Detect whether classification or regression
        if estimator in ['classifier', 'regressor']:
            est_is_classifier = estimator == 'classifier'
        else:
            est_is_classifier = is_classifier(estimator)
        # Setup CV
        if check_version('sklearn', '0.18'):
            from sklearn import model_selection as models
            from sklearn.model_selection import (check_cv,
                                                 StratifiedKFold, KFold)
            if isinstance(cv, int):
                XFold = StratifiedKFold if est_is_classifier else KFold
                cv = XFold(n_splits=cv)
            elif isinstance(cv, str):
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                cv = cv()
            cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
        else:
            from sklearn import cross_validation as models
            from sklearn.cross_validation import (check_cv,
                                                  StratifiedKFold, KFold)
            if isinstance(cv, int):
                if est_is_classifier:
                    cv = StratifiedKFold(y=y, n_folds=cv)
                else:
                    cv = KFold(n=len(y), n_folds=cv)
            elif isinstance(cv, str):
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                    raise NotImplementedError('CV cannot be defined with str'
                                              ' for sklearn < .017.')
                cv = cv(len(y))
            cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

        # Extract train and test set to retrieve them at predict time
        if hasattr(cv, 'split'):
            cv_splits = [(train, test) for train, test in
                         cv.split(X=np.zeros_like(y), y=y)]
        else:
            # XXX support sklearn.cross_validation cv
            cv_splits = [(train, test) for train, test in cv]

        if not np.all([len(train) for train, _ in cv_splits]):
            raise ValueError('Some folds do not have any train epochs.')

        return cv, cv_splits

    def __repr__(self):
        """Description of the object."""
        reg_lambda = self.reg_lambda
        s = '<\nDistribution | %s' % self.distr
        s += '\nalpha | %0.2f' % self.alpha
        s += '\nmax_iter | %0.2f' % self.max_iter
        s += '\nlambda: %0.2f\n>' % reg_lambda
        return s

    def copy(self):
        """Return a copy of the object.

        Parameters
        ----------
        none

        Returns
        -------
        self: instance of GLM
            A copy of the GLM instance.
        """
        return deepcopy(self)

    def _prox(self, beta, thresh):
        """Proximal operator."""
        if self.group is None:
            # The default case: soft thresholding
            return np.sign(beta) * (np.abs(beta) - thresh) * \
                (np.abs(beta) > thresh)
        else:
            # Group sparsity case: apply group sparsity operator
            group_ids = np.unique(self.group)
            group_norms = np.abs(beta)

            for group_id in group_ids:
                if group_id != 0:
                    group_norms[self.group == group_id] = \
                        np.linalg.norm(beta[self.group == group_id], 2)

            nzero_norms = group_norms > 0.0
            over_thresh = group_norms > thresh
            idxs_to_update = nzero_norms & over_thresh

            result = beta
            result[idxs_to_update] = (beta[idxs_to_update] -
                                      thresh * beta[idxs_to_update] /
                                      group_norms[idxs_to_update])
            result[~idxs_to_update] = 0.0

            return result

    def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True):
        """
        Perform one cycle of Newton updates for all coordinates.

        Parameters
        ----------
        X: array
            n_samples x n_features
            The input data
        y: array
            Labels to the data
            n_samples x 1
        ActiveSet: array
            n_features + 1 x 1, or n_features
            Active set storing which betas are non-zero
        beta: array
            n_features + 1 x 1, or n_features
            Parameters to be updated
        rl: float
            Regularization lambda

        Returns
        -------
        beta: array
            (n_features + 1) x 1, or (n_features)
            Updated parameters
        """
        n_samples, n_features = X.shape
        reg_scale = rl * (1 - self.alpha)
        z = _z(beta[0], beta[1:], X, fit_intercept)

        for k in range(0, n_features + int(fit_intercept)):
            # Only update parameters in active set
            if ActiveSet[k] != 0:
                if fit_intercept:
                    if k == 0:
                        xk = np.ones((n_samples, ))
                    else:
                        xk = X[:, k - 1]
                else:
                    xk = X[:, k]

                # Calculate grad and hess of log likelihood term
                gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta,
                                              fit_intercept)

                # Add grad and hess of regularization term
                if self.Tau is None:
                    if k == 0 and fit_intercept:
                        gk_reg, hk_reg = 0.0, 0.0
                    else:
                        gk_reg, hk_reg = beta[k], 1.0
                else:
                    InvCov = np.dot(self.Tau.T, self.Tau)
                    if fit_intercept:
                        gk_reg = np.sum(InvCov[k - 1, :] * beta[1:])
                        hk_reg = InvCov[k - 1, k - 1]
                    else:
                        gk_reg = np.sum(InvCov[k, :] * beta)
                        hk_reg = InvCov[k, k]
                gk += reg_scale * gk_reg
                hk += reg_scale * hk_reg

                # Update parameters, z
                update = 1. / hk * gk
                beta[k], z = beta[k] - update, z - update * xk

        return beta

    def fit(self, X, y):
        """The fit function.

        Parameters
        ----------
        X: array
            The 2D input data of shape (n_samples, n_features)

        y: array
            The 1D target data of shape (n_samples,)

        Returns
        -------
        self: instance of GLM
            The fitted model.
        """
        # checks for group
        if self.group is not None:
            self.group = np.array(self.group)
            self.group = self.group.astype(np.int64)
            # shape check
            if self.group.shape[0] != X.shape[1]:
                raise ValueError('group should be (n_features,)')
            # int check
            if not np.all([isinstance(g, np.int64) for g in self.group]):
                raise ValueError('all entries of group should be integers')

        # type check for data
        if not (isinstance(X, np.ndarray) and isinstance(y, np.ndarray)):
            msg = ("Input must be ndarray. Got {} and {}"
                   .format(type(X), type(y)))
            raise ValueError(msg)

        if X.ndim != 2:
            raise ValueError("X must be a 2D array, got %sD" % X.ndim)

        if y.ndim != 1:
            raise ValueError("y must be 1D, got %sD" % y.ndim)

        if hasattr(self, '_allow_refit') and self._allow_refit is False:
            raise ValueError("This glm object has already been fit before. "
                             "A refit is not allowed")

        n_observations, n_features = X.shape

        if n_observations != len(y):
            raise ValueError('Shape mismatch. ' +
                             'X has {} observations, y has {}.'
                             .format(n_observations, len(y)))

        # Initialize parameters
        beta = np.zeros((n_features + int(self.fit_intercept),))
        if self.fit_intercept:
            if self.beta0_ is None and self.beta_ is None:
                beta[0] = 1 / (n_features + 1) * \
                    self.rng.normal(0.0, 1.0, 1)
                beta[1:] = 1 / (n_features + 1) * \
                    self.rng.normal(0.0, 1.0, (n_features, ))
            else:
                beta[0] = self.beta0_
                beta[1:] = self.beta_
        else:
            if self.beta0_ is None and self.beta_ is None:
                beta = 1 / (n_features + 1) * \
                    self.rng.normal(0.0, 1.0, (n_features, ))
            else:
                beta = self.beta_

        logger.info('Lambda: %6.4f' % self.reg_lambda)

        tol = self.tol
        alpha = self.alpha
        reg_lambda = self.reg_lambda

        if self.solver == 'cdfast':
            # init active set
            ActiveSet = np.ones_like(beta)

        # Iterative updates
        for t in range(0, self.max_iter):
            self.n_iter_ += 1
            beta_old = beta.copy()
            if self.solver == 'batch-gradient':
                grad = _grad_L2loss(self.distr, alpha, self.Tau,
                                    reg_lambda, X, y, self.eta, beta,
                                    self.fit_intercept)
                # Update
                beta = beta - self.learning_rate * grad
            elif self.solver == 'cdfast':
                beta = \
                    self._cdfast(X, y, ActiveSet, beta, reg_lambda,
                                 self.fit_intercept)
            else:
                raise ValueError("solver must be one of "
                                 "('batch-gradient', 'cdfast'), got %s."
                                 % (self.solver))

            # Apply proximal operator
            if self.fit_intercept:
                beta[1:] = self._prox(beta[1:], reg_lambda * alpha)
            else:
                beta = self._prox(beta, reg_lambda * alpha)

            # Update active set
            if self.solver == 'cdfast':
                ActiveSet[beta == 0] = 0
                if self.fit_intercept:
                    ActiveSet[0] = 1.

            # Convergence by relative parameter change tolerance
            norm_update = np.linalg.norm(beta - beta_old)
            if t > 1 and (norm_update / np.linalg.norm(beta)) < tol:
                msg = ('\tParameter update tolerance. ' +
                       'Converged in {0:d} iterations'.format(t))
                logger.info(msg)
                break

            # Compute and save loss if callbacks are requested
            if callable(self.callback):
                self.callback(beta)

        if self.n_iter_ == self.max_iter:
            warnings.warn(
                "Reached max number of iterations without convergence.")

        # Update the estimated variables
        if self.fit_intercept:
            self.beta0_ = beta[0]
            self.beta_ = beta[1:]
        else:
            self.beta0_ = 0
            self.beta_ = beta
        self.ynull_ = np.mean(y)
        self._allow_refit = False

        return self

    def predict(self, X):
        """Predict targets.

        Parameters
        ----------
        X: array
            Input data for prediction, of shape (n_samples, n_features)

        Returns
        -------
        yhat: array
            The predicted targets of shape (n_samples,)
        """
        if not isinstance(X, np.ndarray):
            raise ValueError('Input data should be of type ndarray (got %s).'
                             % type(X))

        yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta,
                    fit_intercept=True)

        if self.distr == 'binomial':
            yhat = (yhat > 0.5).astype(int)
        yhat = np.asarray(yhat)
        return yhat

    def predict_proba(self, X):
        """Predict class probability for binomial.

        Parameters
        ----------
        X: array
            Input data for prediction, of shape (n_samples, n_features)

        Returns
        -------
        yhat: array
            The predicted targets of shape (n_samples,).

        Raises
        ------
        Works only for the binomial distribution.
        Raises error otherwise.
        """
        if self.distr not in ['binomial', 'probit']:
            raise ValueError('This is only applicable for '
                             'the binomial distribution.')

        if not isinstance(X, np.ndarray):
            raise ValueError('Input data should be of type ndarray (got %s).'
                             % type(X))

        yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta,
                    fit_intercept=True)
        yhat = np.asarray(yhat)
        return yhat

    def fit_predict(self, X, y):
        """Fit the model and predict on the same data.

        Parameters
        ----------
        X: array
            The input data to fit and predict,
            of shape (n_samples, n_features)
        y: array
            The target data of shape (n_samples,)

        Returns
        -------
        yhat: array
            The predicted targets of shape (n_samples,).
        """
        return self.fit(X, y).predict(X)

    def score(self, X, y):
        """Score the model.

        Parameters
        ----------
        X: array
            The input data whose prediction will be scored,
            of shape (n_samples, n_features).
        y: array
            The true targets against which to score the predicted targets,
            of shape (n_samples,).

        Returns
        -------
        score: float
            The score metric
        """
        from . import metrics
        valid_metrics = ['deviance', 'pseudo_R2', 'accuracy']
        if self.score_metric not in valid_metrics:
            raise ValueError("score_metric has to be one of: " +
                             ", ".join(valid_metrics))

        # If the model has not been fit it cannot be scored
        if self.ynull_ is None:
            raise ValueError('Model must be fit before '
                             'prediction can be scored')

        # For f1 as well
        if self.score_metric in ['accuracy']:
            if self.distr not in ['binomial', 'multinomial']:
                raise ValueError(self.score_metric +
                                 ' is only defined for binomial '
                                 'or multinomial distributions')

        y = np.asarray(y).ravel()

        if self.distr in ['binomial', 'probit'] and \
           self.score_metric != 'accuracy':
            yhat = self.predict_proba(X)
        else:
            yhat = self.predict(X)

        # Check whether we have a list of estimators or a single estimator
        if self.score_metric == 'deviance':
            return metrics.deviance(y, yhat, self.distr)
        elif self.score_metric == 'pseudo_R2':
            return metrics.pseudo_R2(X, y, yhat, self.ynull_, self.distr)
        if self.score_metric == 'accuracy':
            return metrics.accuracy(y, yhat)


class GLMCV(object):
    """Class for estimating regularized generalized linear models (GLM)
    along a regularization path with warm restarts.

    The regularized GLM minimizes the penalized negative log likelihood:

    .. math::

        \\min_{\\beta_0, \\beta} \\frac{1}{N}
        \\sum_{i = 1}^N \\mathcal{L} (y_i, \\beta_0 + \\beta^T x_i)
        + \\lambda [ \\frac{1}{2}(1 - \\alpha) \\mathcal{P}_2 +
                    \\alpha \\mathcal{P}_1 ]

    where :math:`\\mathcal{P}_2` and :math:`\\mathcal{P}_1` are the
    generalized L2 (Tikhonov) and generalized L1 (Group Lasso) penalties,
    given by:

    .. math::

        \\mathcal{P}_2 = \\|\\Gamma \\beta \\|_2^2 \\
        \\mathcal{P}_1 = \\sum_g \\|\\beta_{j,g}\\|_2

    where :math:`\\Gamma` is the Tikhonov matrix: a square factorization
    of the inverse covariance matrix and :math:`\\beta_{j,g}` is the
    :math:`j` th coefficient of group :math:`g`.

    The generalized L2 penalty defaults to the ridge penalty when
    :math:`\\Gamma` is identity.

    The generalized L1 penalty defaults to the lasso penalty when each
    :math:`\\beta` belongs to its own group.

    Parameters
    ----------
    distr: str
        distribution family can be one of the following
        'gaussian' | 'binomial' | 'poisson' | 'softplus' | 'probit' | 'gamma'
        default: 'poisson'.
    alpha: float
        the weighting between L1 penalty and L2 penalty term
        of the loss function.
        default: 0.5
    Tau: array | None
        the (n_features, n_features) Tikhonov matrix.
        default: None, in which case Tau is identity
        and the L2 penalty is ridge-like
    group: array | list | None
        the (n_features,) list or array of group identities for each
        parameter :math:`\\beta`.
        Each entry of the list/array should contain an int from 1 to
        n_groups that specifies group membership for each parameter
        (except :math:`\\beta_0`). If you do not want to specify a group
        for a specific parameter, set it to zero.
        default: None, in which case it defaults to L1 regularization
    reg_lambda: array | list | None
        array of regularized parameters :math:`\\lambda` of penalty term.
        default: None, a list of 10 floats spaced logarithmically (base e)
        between 0.5 and 0.01.
    cv: cross validation object (default 10)
        Iterator for doing cross validation
    solver: str
        optimization method, can be one of the following
        'batch-gradient' (vanilla batch gradient descent)
        'cdfast' (Newton coordinate gradient descent).
        default: 'batch-gradient'
    learning_rate: float
        learning rate for gradient descent.
        default: 2e-1
    max_iter: int
        maximum iterations for the model.
        default: 1000
    tol: float
        convergence threshold or stopping criteria.
        Optimization loop will stop when norm(gradient) is below
        the threshold.
        default: 1e-6
    eta: float
        a threshold parameter that linearizes the exp() function above eta.
        default: 2.0
    score_metric: str
        specifies the scoring metric.
        one of either 'deviance' or 'pseudo_R2'.
        default: 'deviance'
    fit_intercept: boolean
        specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.
        default: True
    random_state: int
        seed of the random number generator used to initialize the solution.
        default: 0
    verbose: boolean or int
        default: False

    Attributes
    ----------
    beta0_: float
        The intercept
    beta_: array, (n_features)
        The learned betas
    glm_: instance of GLM
        The GLM object with best score
    reg_lambda_opt_: float
        The reg_lambda parameter for best GLM object
    n_iter_: int
        The number of iterations

    Reference
    ---------
    Friedman, Hastie, Tibshirani (2010). Regularization Paths for
    Generalized Linear Models via Coordinate Descent, J Statistical Software.
    https://core.ac.uk/download/files/153/6287975.pdf

    Notes
    -----
    To select subset of fitted glm models, you can simply do:

    glm = glm[1:3]
    glm[2].predict(X_test)
    """

    def __init__(self, distr='poisson', alpha=0.5,
                 Tau=None, group=None, reg_lambda=None,
                 cv=10, solver='batch-gradient',
                 learning_rate=2e-1, max_iter=1000,
                 tol=1e-6, eta=2.0, score_metric='deviance',
                 fit_intercept=True, random_state=0, verbose=False):

        if reg_lambda is None:
            reg_lambda = np.logspace(np.log(0.5), np.log(0.01), 10,
                                     base=np.exp(1))
        if not isinstance(reg_lambda, (list, np.ndarray)):
            reg_lambda = [reg_lambda]

        _check_params(distr=distr, max_iter=max_iter,
                      fit_intercept=fit_intercept)

        self.distr = distr
        self.alpha = alpha
        self.reg_lambda = reg_lambda
        self.cv = cv
        self.Tau = Tau
        self.group = group
        self.solver = solver
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.beta0_ = None
        self.beta_ = None
        self.reg_lambda_opt_ = None
        self.glm_ = None
        self.scores_ = None
        self.ynull_ = None
        self.tol = tol
        self.eta = eta
        self.score_metric = score_metric
        self.fit_intercept = fit_intercept
        self.random_state = random_state
        self.verbose = verbose
        set_log_level(verbose)

    def __repr__(self):
        """Description of the object."""
        reg_lambda = self.reg_lambda
        s = '<\nDistribution | %s' % self.distr
        s += '\nalpha | %0.2f' % self.alpha
        s += '\nmax_iter | %0.2f' % self.max_iter
        if len(reg_lambda) > 1:
            s += ('\nlambda: %0.2f to %0.2f\n>'
                  % (reg_lambda[0], reg_lambda[-1]))
        else:
            s += '\nlambda: %0.2f\n>' % reg_lambda[0]
        return s

    def copy(self):
        """Return a copy of the object.

        Parameters
        ----------
        none

        Returns
        -------
        self: instance of GLM
            A copy of the GLM instance.
        """
        return deepcopy(self)

    def fit(self, X, y):
        """The fit function.

        Parameters
        ----------
        X: array
            The input data of shape (n_samples, n_features)

        y: array
            The target data

        Returns
        -------
        self: instance of GLMCV
            The fitted model.
        """
        logger.info('Looping through the regularization path')
        glms, scores = list(), list()
        self.ynull_ = np.mean(y)

        if not isinstance(self.cv, int):
            raise ValueError('cv must be int. We do not support scikit-learn '
                             'cv objects at the moment')

        idxs = np.arange(y.shape[0])
        np.random.shuffle(idxs)
        cv_splits = np.array_split(idxs, self.cv)

        for idx, rl in enumerate(self.reg_lambda):
            glm = GLM(distr=self.distr,
                      alpha=self.alpha,
                      Tau=self.Tau,
                      group=self.group,
                      reg_lambda=0.1,
                      solver=self.solver,
                      learning_rate=self.learning_rate,
                      max_iter=self.max_iter,
                      tol=self.tol,
                      eta=self.eta,
                      score_metric=self.score_metric,
                      fit_intercept=self.fit_intercept,
                      random_state=self.random_state,
                      verbose=self.verbose)
            logger.info('Lambda: %6.4f' % rl)
            glm.reg_lambda = rl

            scores_fold = list()
            for fold in range(self.cv):
                val = cv_splits[fold]
                train = np.setdiff1d(idxs, val)
                if idx == 0:
                    glm.beta0_, glm.beta_ = self.beta0_, self.beta_
                else:
                    glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_
                glm.n_iter_ = 0
                glm.fit(X[train], y[train])
                glm._allow_refit = True
                scores_fold.append(glm.score(X[val], y[val]))
            scores.append(np.mean(scores_fold))

            if idx == 0:
                glm.beta0_, glm.beta_ = self.beta0_, self.beta_
            else:
                glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_
            glm.n_iter_ = 0
            glm.fit(X, y)
            glms.append(glm)

        for glm in glms:
            glm._allow_refit = False

        # Update the estimated variables
        if self.score_metric == 'deviance':
            opt = np.array(scores).argmin()
        elif self.score_metric in ['pseudo_R2', 'accuracy']:
            opt = np.array(scores).argmax()
        else:
            raise ValueError("Unknown score_metric: %s" % (self.score_metric))

        self.beta0_, self.beta_ = glms[opt].beta0_, glms[opt].beta_
        self.reg_lambda_opt_ = self.reg_lambda[opt]
        self.glm_ = glms[opt]
        self.scores_ = scores
        return self

    def predict(self, X):
        """Predict targets.

        Parameters
        ----------
        X: array
            Input data for prediction, of shape (n_samples, n_features)

        Returns
        -------
        yhat: array
            The predicted targets of shape (n_samples,), based on the model
            with optimal reg_lambda
        """
        return self.glm_.predict(X)

    def predict_proba(self, X):
        """Predict class probability for binomial.

        Parameters
        ----------
        X: array
            Input data for prediction, of shape (n_samples, n_features)

        Returns
        -------
        yhat: array
            The predicted targets of shape (n_samples,).

        Raises
        ------
        Works only for the binomial distribution.
        Raises error otherwise.
        """
        return self.glm_.predict_proba(X)

    def fit_predict(self, X, y):
        """Fit the model and predict on the same data.

        Parameters
        ----------
        X: array
            The input data to fit and predict,
            of shape (n_samples, n_features)
        y: array
            The target data of shape (n_samples,)

        Returns
        -------
        yhat: array
            The predicted targets of shape (n_samples,), based on the model
            with optimal reg_lambda
        """
        self.fit(X, y)
        return self.glm_.predict(X)

    def score(self, X, y):
        """Score the model.

        Parameters
        ----------
        X: array
            The input data whose prediction will be scored,
            of shape (n_samples, n_features).
        y: array
            The true targets against which to score the predicted targets,
            of shape (n_samples,).

        Returns
        -------
        score: float
            The score metric for the optimal reg_lambda
        """
        return self.glm_.score(X, y)