Cheatsheet

This is a simple cheatsheet with the gradients and Hessians of the penalized negative log-likelihood loss, which are used as updates in the Newton coordinate descent algorithm for GLMs.

Poisson: softplus

Mean Function

\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\ \mu_i &= \log( 1 + \exp(z_i) )\end{split}\]

Log-likelihood function

\[\mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i\]

L2-penalized loss function

\[\begin{split}J = \frac{1}{n}\sum_i \left\{ \log( 1 + \exp( \beta_0 + \sum_j \beta_j x_{ij} ) ) \right\} \\ - \frac{1}{n}\sum_i \left\{ y_i \log( \log( 1 + \exp(\beta_0 + \sum_j \beta_j x_{ij} ) ) ) \right\} \\ + \lambda (1-\alpha) \frac{1}{2} \sum_j \beta_j^2\end{split}\]

Gradient

\[\begin{split}\mu(z_i) &= \log(1 + \exp(z_i)) \\ \sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_i \sigma(z_i) - \frac{1}{n}\sum_i y_i \frac{\sigma(z_i)}{\mu(z_i)} \\ \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_i \sigma(z_i) x_{ij} - \frac{1}{n}\sum_i y_i \frac{\sigma(z_i)}{\mu(z_i)}x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]

Hessian

\[\begin{split}\mu(z_i) &= \log(1 + \exp(z_i)) \\ \sigma(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i)) - \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)^2}{\mu(z_i)^2} \right\} \\ \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \sigma(z_i) (1 - \sigma(z_i)) x_{ij}^2 - \frac{1}{n}\sum_i y_i \left\{ \frac{\sigma(z_i) (1 - \sigma(z_i))}{\mu(z_i)} - \frac{\sigma(z_i)^2}{\mu(z_i)^2} \right\} x_{ij}^2 + \lambda (1 - \alpha)\end{split}\]

Poisson (linearized): poisson

Mean Function

\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\ \mu_i &= \begin{cases} \exp(z_i), & z_i \leq \eta \\ \\ \exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta \end{cases}\end{split}\]

Log-likelihood function

\[\mathcal{L} = \sum_i y_i \log(\mu_i) - \sum_i \mu_i\]

L2-penalized loss function

\[J = -\frac{1}{n} \mathcal{L} + \lambda (1 - \alpha) \frac{1}{2} \sum_j \beta_j^2\]

Gradient

\[\begin{split}\mu_i &= \begin{cases} \exp(z_i), & z_i \leq \eta \\ \\ \exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta \end{cases} \\ \frac{\partial J}{\partial \beta_0} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i) + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) \\ \frac{\partial J}{\partial \beta_j} &= \frac{1}{n}\sum_{i; z_i \leq \eta} (\mu_i - y_i) x_{ij} + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta) (1 - y_i/\mu_i) x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]

Hessian

\[\begin{split}\mu_i &= \begin{cases} \exp(z_i), & z_i \leq \eta \\ \\ \exp(\eta)z_i + (1-\eta)\exp(\eta), & z_i > \eta \end{cases} \\ \frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} \\ \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_{i; z_i \leq \eta} \mu_i x_{ij}^2 + \frac{1}{n}\sum_{i; z_i > \eta} \exp(\eta)^2 \frac{y_i}{\mu_i^2} x_{ij}^2 + \lambda (1 - \alpha)\end{split}\]

Gaussian: gaussian

Mean Function

\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\ \mu_i &= z_i\end{split}\]

Log-likelihood function

\[\begin{split}\mathcal{L} = -\frac{1}{2} \sum_i (y_i - \mu_i)^2 \\\end{split}\]

L2-penalized loss function

\[\begin{split}J = \frac{1}{2n}\sum_i (y_i - (\beta_0 + \sum_j \beta_j x_{ij}))^2 + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\\end{split}\]

Gradient

\[\begin{split}\mu(z_i) &= z_i \\ \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\ \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]

Hessian

\[\begin{split}\frac{\partial^2 J}{\partial \beta_0^2} &= 1 \\ \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i x_{ij}^2 + \lambda (1 - \alpha)\end{split}\]

Logistic: binomial

Mean Function

\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\ \mu_i &= \frac{1}{1+\exp(-z_i)}\end{split}\]

Log-likelihood function

\[\begin{split}\mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\\end{split}\]

L2-penalized loss function

\[\begin{split}J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\\end{split}\]

Gradient

\[\begin{split}\mu(z_i) &= \frac{1}{1 + \exp(-z_i)} \\ \frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i (y_i - \mu_i) \\ \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i (y_i - \mu_i) x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]

Hessian

\[\begin{split}\frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) \\ \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \mu_i (1 - \mu_i) x_{ij}^2 + \lambda (1 - \alpha)\end{split}\]

Logistic: probit

Mean Function

\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\ \mu_i &= \Phi(z_i)\end{split}\]

where \(\Phi(z_i)\) is the standard normal cumulative distribution function.

Log-likelihood function

\[\begin{split}\mathcal{L} = \sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} \\\end{split}\]

L2-penalized loss function

\[\begin{split}J = -\frac{1}{n}\sum_i \left\{ y_i \log(\mu_i) + (1-y_i) \log(1 - \mu_i) \right\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\\end{split}\]

Gradient

\[\begin{split}\mu(z_i) &= \Phi(z_i) \\ \mu'(z_i) &= \phi(z_i)\end{split}\]

where \(\Phi(z_i)\) and \(\phi(z_i)\) are the standard normal cdf and pdf.

\[\begin{split}\frac{\partial J}{\partial \beta_0} &= -\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} \\ \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n}\sum_i \Bigg\{y_i \frac{\mu'(z_i)}{\mu(z_i)} - (1 - y_i)\frac{\mu'(z_i)}{1 - \mu(z_i)}\Bigg\} x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]

Hessian

\[\begin{split}\frac{\partial^2 J}{\partial \beta_0^2} &= \frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + (1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} \\ \frac{\partial^2 J}{\partial \beta_j^2} &= \frac{1}{n}\sum_i \mu'(z_i) \Bigg\{y_i \frac{z_i\mu(z_i) + \mu'(z_i)}{\mu^2(z_i)} + (1 - y_i)\frac{-z_i(1 - \mu(z_i)) + \mu'(z_i)}{(1 - \mu(z_i))^2} \Bigg\} x_{ij}^2 + \lambda (1 - \alpha)\end{split}\]

Gamma

Mean function

\[\begin{split}z_i &= \beta_0 + \sum_j \beta_j x_{ij} \\ \mu_i &= \log(1 + \exp(z_i))\end{split}\]

Log-likelihood function

\[\mathcal{L} = \sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - \log(\mu_i)\Bigg\}\]

where \(\nu\) is the shape parameter. The distribution reduces to the exponential distribution for \(\nu = 1\) and approaches the normal distribution as \(\nu \to \infty\).

L2-penalized loss function

\[\begin{split}J = -\frac{1}{n}\sum_{i} \nu\Bigg\{\frac{-y_i}{\mu_i} - \log(\mu_i)\Bigg\} + \lambda (1 - \alpha) \frac{1}{2}\sum_j \beta_j^2\\\end{split}\]

Gradient

\[\begin{split}\frac{\partial J}{\partial \beta_0} &= -\frac{1}{n} \sum_{i} \nu\Bigg\{\frac{y_i}{\mu_i^2} - \frac{1}{\mu_i}\Bigg\}{\mu_i'} \\ \frac{\partial J}{\partial \beta_j} &= -\frac{1}{n} \sum_{i} \nu\Bigg\{\frac{y_i}{\mu_i^2} - \frac{1}{\mu_i}\Bigg\}{\mu_i'}x_{ij} + \lambda (1 - \alpha) \beta_j\end{split}\]

where \(\mu_i' = \frac{1}{1 + \exp(-z_i)}\).