Cheatsheet
This is a simple cheatsheet with the gradients and Hessians of the penalized
negative log-likelihood loss \(J\) for each noise model, used as updates in the
Newton coordinate descent algorithm for GLMs. Throughout, \(n\) is the number of
samples, \(\lambda\) the regularization strength, and \(\alpha\) the elastic-net
mixing parameter; only the smooth L2 part of the penalty,
\(\lambda (1 - \alpha) \frac{1}{2} \sum_j \beta_j^2\), appears in these derivatives.
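For orientation, a Newton coordinate descent sweep updates one coefficient at a time by the ratio of these two derivatives. Below is a minimal, illustrative sketch, not pyglmnet's actual implementation; `grad` and `hess` are hypothetical callables that evaluate the family-specific formulas in this cheatsheet.

```python
import numpy as np

def newton_cd_sweep(beta0, beta, grad, hess):
    """One sweep of Newton coordinate descent over the intercept and weights.

    grad(k, beta0, beta) and hess(k, beta0, beta) are hypothetical callables
    returning dJ/dbeta_k and d^2J/dbeta_k^2 at the current parameters, with
    k = 0 for the intercept and k = 1..p for the weights, per the formulas
    below.
    """
    beta = np.array(beta, dtype=float)
    # Newton update for the (unpenalized) intercept
    beta0 = beta0 - grad(0, beta0, beta) / hess(0, beta0, beta)
    # Newton update for each weight in turn, using the freshest parameters
    for j in range(len(beta)):
        beta[j] = beta[j] - grad(j + 1, beta0, beta) / hess(j + 1, beta0, beta)
    return beta0, beta
```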
Poisson: softplus
Mean Function
\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\
\mu_i &= \log( 1 + \exp(z_i) )\end{split}\]
Log-likelihood function
\[\mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i\]
L2-penalized loss function
\[\begin{split}J = \frac{1}{n}\sum_i \left\{ \log( 1 + \exp( \beta_0 + \sum_j \beta_j x_{ij} ) ) \right\} \\
- \frac{1}{n}\sum_i \left\{ y_i \log( \log( 1 + \exp(\beta_0 + \sum_j \beta_j x_{ij} ) ) ) \right\} \\
+ \lambda (1-\alpha) \frac{1}{2} \sum_j \beta_j^2\end{split}\]
Gradient
\[\begin{split}\mu(z_i) &= \log(1 + \exp(z_i)) \\
\sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\
\frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_i \sigma(z_i) - \frac{1}{n}\sum_i y_i \frac{\sigma(z_i)}{\mu(z_i)} \\
\frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_i \sigma(z_i) x_{ij} - \frac{1}{n}\sum_i y_i \frac{\sigma(z_i)}{\mu(z_i)}x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]
Hessian
\[\begin{split}\mu(z_i) &= \log(1 + \exp(z_i)) \\
\sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\
\frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i))
- \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)^2}{\mu(z_i)^2} \right\} \\
\frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i)) x_{ij}^2
- \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)^2}{\mu(z_i)^2} \right\} x_{ij}^2
+ \lambda (1 - \alpha)\end{split}\]
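As a check on the algebra, here is a minimal NumPy transcription of these softplus-Poisson derivatives. It is a sketch for clarity, not a robust implementation (no guard against overflow in `exp` for large \(z\)); the names `reg_lambda` and `alpha` stand in for \(\lambda\) and \(\alpha\).

```python
import numpy as np

def softplus_poisson_grad_hess(beta0, beta, X, y, reg_lambda, alpha):
    """Gradient and diagonal Hessian of J for the Poisson (softplus) family."""
    n = X.shape[0]
    z = beta0 + X @ beta
    mu = np.log1p(np.exp(z))           # softplus mean mu(z)
    sig = 1.0 / (1.0 + np.exp(-z))     # sigma(z) = mu'(z)

    g = sig - y * sig / mu             # per-sample d/dz of the data term
    grad_beta0 = np.mean(g)
    grad_beta = X.T @ g / n + reg_lambda * (1 - alpha) * beta

    # per-sample d^2/dz^2: sigma'(z) - y * (sigma'(z)/mu - sigma(z)^2/mu^2)
    h = sig * (1 - sig) - y * (sig * (1 - sig) / mu - sig**2 / mu**2)
    hess_beta0 = np.mean(h)
    hess_beta = (X**2).T @ h / n + reg_lambda * (1 - alpha)
    return grad_beta0, grad_beta, hess_beta0, hess_beta
```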
Poisson (linearized): poisson
Mean Function
\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\
\mu_i &=
\begin{cases}
\exp(z_i), & z_i \leq \eta \\
\\
\exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta
\end{cases}\end{split}\]
Log-likelihood function
\[\mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i\]
L2-penalized loss function
\[J = -\frac{1}{n} \mathcal{L} + \lambda (1 - \alpha) \frac{1}{2} \sum_j \beta_j^2\]
Gradient
\[\begin{split}\mu_i &=
\begin{cases}
\exp(z_i), & z_i \leq \eta \\
\\
\exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta
\end{cases}
\\
\frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i)
+ \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) \\
\frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i) x_{ij}
+ \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) x_{ij}
+ \lambda (1 - \alpha) \beta_j\end{split}\]
Hessian
\[\begin{split}\mu_i &=
\begin{cases}
\exp(z_i), & z_i \leq \eta \\
\\
\exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta
\end{cases}
\\
\frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i
+ \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} \\
\frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i x_{ij}^2
+ \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} x_{ij}^2
+ \lambda (1 - \alpha)\end{split}\]
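A corresponding NumPy sketch for the linearized family, again illustrative rather than production code; `eta` is the linearization threshold \(\eta\).

```python
import numpy as np

def linearized_poisson_grad_hess(beta0, beta, X, y, eta, reg_lambda, alpha):
    """Gradient and diagonal Hessian of J for the linearized Poisson family."""
    n = X.shape[0]
    z = beta0 + X @ beta
    big = z > eta
    ez = np.exp(np.minimum(z, eta))    # exp(z) below eta, exp(eta) above
    mu = np.where(big, ez * z + (1 - eta) * ez, ez)
    dmu = ez                           # mu'(z) in both branches

    g = dmu * (1 - y / mu)             # reduces to (mu - y) when z <= eta
    grad_beta0 = np.mean(g)
    grad_beta = X.T @ g / n + reg_lambda * (1 - alpha) * beta

    # Hessian weight: mu below the threshold, exp(eta)^2 * y / mu^2 above it
    h = np.where(big, dmu**2 * y / mu**2, mu)
    hess_beta0 = np.mean(h)
    hess_beta = (X**2).T @ h / n + reg_lambda * (1 - alpha)
    return grad_beta0, grad_beta, hess_beta0, hess_beta
```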
Gaussian: gaussian
Mean Function
\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\
\mu_i &= z_i\end{split}\]
Log-likelihood function
\[\mathcal{L} = -\frac{1}{2} \sum_i (y_i - \mu_i)^2\]
L2-penalized loss function
\[\begin{split}J = \frac{1}{2n}\sum_i (y_i - (\beta_0 + \sum_j \beta_j x_{ij}))^2 +
\lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\end{split}\]
Gradient
\[\begin{split}\mu(z_i) &= z_i \\
\frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\
\frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij}
+ \lambda (1 - \alpha) \beta_j\end{split}\]
Hessian
\[\begin{split}\frac{\partial^2 J}{\partial \beta_0^2} &= 1 \\
\frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i x_{ij}^2
+ \lambda (1 - \alpha)\end{split}\]
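In NumPy, these Gaussian derivatives amount to the following minimal sketch, with the same hypothetical naming as above.

```python
import numpy as np

def gaussian_grad_hess(beta0, beta, X, y, reg_lambda, alpha):
    """Gradient and diagonal Hessian of J for the Gaussian family."""
    n = X.shape[0]
    r = y - (beta0 + X @ beta)         # residuals y_i - mu_i
    grad_beta0 = -np.mean(r)
    grad_beta = -X.T @ r / n + reg_lambda * (1 - alpha) * beta
    hess_beta0 = 1.0                   # (1/n) * sum of ones
    hess_beta = np.mean(X**2, axis=0) + reg_lambda * (1 - alpha)
    return grad_beta0, grad_beta, hess_beta0, hess_beta
```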
Logistic: binomial
Mean Function
\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\
\mu_i &= \frac{1}{1+\exp(-z_i)}\end{split}\]
Log-likelihood function
\[\mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\}\]
L2-penalized loss function
\[\begin{split}J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) +
(1-y_i) \log(1 - \mu_i) \right\}
+ \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\end{split}\]
Gradient
\[\begin{split}\mu(z_i) &= \frac{1}{1 + \exp(-z_i)} \\
\frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\
\frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij}
+ \lambda (1 - \alpha) \beta_j\end{split}\]
Hessian
\[\begin{split}\frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) \\
\frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) x_{ij}^2
+ \lambda (1 - \alpha)\end{split}\]
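A minimal NumPy sketch of the binomial derivatives, using the same hypothetical naming conventions.

```python
import numpy as np

def binomial_grad_hess(beta0, beta, X, y, reg_lambda, alpha):
    """Gradient and diagonal Hessian of J for the logistic (binomial) family."""
    n = X.shape[0]
    mu = 1.0 / (1.0 + np.exp(-(beta0 + X @ beta)))   # logistic mean
    grad_beta0 = -np.mean(y - mu)
    grad_beta = -X.T @ (y - mu) / n + reg_lambda * (1 - alpha) * beta
    w = mu * (1 - mu)                                # Hessian weights
    hess_beta0 = np.mean(w)
    hess_beta = (X**2).T @ w / n + reg_lambda * (1 - alpha)
    return grad_beta0, grad_beta, hess_beta0, hess_beta
```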
Probit: probit
Mean Function
\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\
\mu_i &= \Phi(z_i)\end{split}\]
where \(\Phi(z_i)\) is the standard normal cumulative distribution function.
Log-likelihood function
\[\mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\}\]
L2-penalized loss function
\[\begin{split}J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) +
(1-y_i) \log(1 - \mu_i) \right\}
+ \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\end{split}\]
Gradient
\[\begin{split}\mu(z_i) &= \Phi(z_i) \\
\mu'(z_i) &= \phi(z_i)\end{split}\]
where \(\Phi(z_i)\) and \(\phi(z_i)\) are the standard normal cdf and pdf. Since \(\phi'(z) = -z\,\phi(z)\), differentiating these ratios once more produces the \(z_i\) terms in the Hessian below.
\[\begin{split}\frac{\partial J}{\partial \beta_0} &=
-\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} \\
\frac{\partial J}{\partial \beta_j} &=
-\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} x_{ij}
+ \lambda (1 - \alpha) \beta_j\end{split}\]
Hessian
\[\begin{split}\frac{\partial^2 J}{\partial \beta_0^2} &=
\frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} +
(1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} \\
\frac{\partial^2 J}{\partial \beta_j^2} &=
\frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} +
(1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} x_{ij}^2
+ \lambda (1 - \alpha)\end{split}\]
In practice, the probit gradients suffer from numerical instability, primarily due to the finite precision of evaluating the normal cdf in its tails.
Thus, in pyglmnet we use approximate formulas for computing the loss, gradients, and Hessians from Demidenko et al. (2001);
for details, see Eqns. 17-20 in the paper.
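For reference, here is a direct NumPy/SciPy transcription of the exact probit formulas above. It inherits the instability just described once \(\Phi(z_i)\) saturates to 0 or 1, so treat it as a sketch for checking the algebra, not a substitute for the Demidenko approximations.

```python
import numpy as np
from scipy.stats import norm

def probit_grad_hess(beta0, beta, X, y, reg_lambda, alpha):
    """Gradient and diagonal Hessian of J for the probit family (exact forms)."""
    n = X.shape[0]
    z = beta0 + X @ beta
    mu = norm.cdf(z)                   # Phi(z)
    dmu = norm.pdf(z)                  # phi(z) = mu'(z)

    g = y * dmu / mu - (1 - y) * dmu / (1 - mu)
    grad_beta0 = -np.mean(g)
    grad_beta = -X.T @ g / n + reg_lambda * (1 - alpha) * beta

    h = dmu * (y * (z * mu + dmu) / mu**2
               + (1 - y) * (dmu - z * (1 - mu)) / (1 - mu)**2)
    hess_beta0 = np.mean(h)
    hess_beta = (X**2).T @ h / n + reg_lambda * (1 - alpha)
    return grad_beta0, grad_beta, hess_beta0, hess_beta
```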
Gamma: gamma
Mean Function
\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\
\mu_i &= \log(1 + \exp(z_i))\end{split}\]
Log-likelihood function
\[\mathcal{L} = \sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - \log(\mu_i)\Bigg\}\]
where \(\nu\) is the shape parameter. The distribution reduces to the
exponential for \(\nu = 1\) and approaches the normal as \(\nu \to \infty\).
L2-penalized loss function
\[\begin{split}J = -\frac{1}{n}\sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - \log(\mu_i)\Bigg\}
+ \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\end{split}\]
Gradient
\[\begin{split}\frac{\partial J}{\partial \beta_0} &= \frac{1}{n} \sum_{i} \nu\Bigg\{\frac{1}{\mu_i}
- \frac{y_i}{\mu_i^2}\Bigg\}{\mu_i'} \\
\frac{\partial J}{\partial \beta_j} &= \frac{1}{n} \sum_{i} \nu\Bigg\{\frac{1}{\mu_i}
- \frac{y_i}{\mu_i^2}\Bigg\}{\mu_i'}x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]
where \(\mu_i' = \sigma(z_i) = \frac{1}{1 + \exp(-z_i)}\).
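Finally, a minimal NumPy sketch of this gamma gradient with the softplus mean; `nu` stands in for the shape parameter \(\nu\), and the same caveats about overflow apply.

```python
import numpy as np

def gamma_grad(beta0, beta, X, y, nu, reg_lambda, alpha):
    """Gradient of J for the gamma family with the softplus mean."""
    n = X.shape[0]
    z = beta0 + X @ beta
    mu = np.log1p(np.exp(z))           # softplus mean
    dmu = 1.0 / (1.0 + np.exp(-z))     # mu'(z) = sigma(z)
    g = nu * (1.0 / mu - y / mu**2) * dmu
    grad_beta0 = np.mean(g)
    grad_beta = X.T @ g / n + reg_lambda * (1 - alpha) * beta
    return grad_beta0, grad_beta
```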