\[ \newcommand{\argmin}{\operatorname{argmin}} \newcommand{\sign}{\operatorname{sign}} \newcommand{\diag}[1]{\operatorname{diag}(#1)} \newcommand{\prox}[2]{\operatorname{prox}_{#1}\left(#2\right)} \]
We’re going to demonstrate an example from the BayesMAP paper 1 for a squared-error loss and a \(q\)-norm penalty with \(0 < q < 1\).
Given \(N\) observations \(b_i\) and an \(N \times M\) covariate matrix \(A\), we are interested in obtaining \[ \begin{gather*} \argmin_x \left\{ F(x) = f(x) + \lambda \sum_{j=1}^M \phi(x_j) \right\} \end{gather*} \] with \(f(x) = \|A x - b\|^2\) and \(\phi(x_j) = |x_j|^q\).
This approach uses a proximal operator in a coordinate descent algorithm (see 2 and 3). The \(\operatorname{prox}\) operator for the \(q\)-norm has an exact solution given, component-wise, by \[ \begin{align*} \prox{\lambda \phi_q}{x} &= \begin{cases} 0 & \text{ if } |x| < h_{\lambda,q} \\ \{0, \sign(x) b_{\lambda,q} \} & \text{ if } |x| = h_{\lambda,q} \\ \sign(x) \hat{\beta} & \text{ if } |x| > h_{\lambda,q} \\ \end{cases} \end{align*} \] where \[ \begin{align*} b_{\lambda,q} &= \left(2 \lambda (1-q)\right)^{\frac{1}{2-q}} \\ h_{\lambda,q} &= b_{\lambda,q} + \lambda q b_{\lambda,q}^{q-1} \end{align*} \] and \(\hat{\beta}\) is the solution of \[ \begin{align*} \hat{\beta} + \lambda q \hat{\beta}^{q-1} &= |x|, \quad \hat{\beta} \in (b_{\lambda, q},|x|). \end{align*} \]
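To make the prox formulas concrete, here is a minimal R sketch of the scalar operator (the name prox.lq, the tolerance, and the tie-breaking choice of returning 0 at \(|x| = h_{\lambda,q}\) are my own, not from the paper); it can double as a reference point for the C++ implementation below.
prox.lq = function(x, lambda, q, tol=1e-7) {
  ## thresholds from the component-wise solution
  b.lq = (2 * lambda * (1 - q))^(1/(2 - q))
  h.lq = b.lq + lambda * q * b.lq^(q - 1)
  if (abs(x) <= h.lq)
    return(0)
  ## fixed-point iteration for beta.hat + lambda*q*beta.hat^(q-1) = |x|,
  ## started from the midpoint of (b.lq, |x|)
  beta.hat = (abs(x) + b.lq)/2
  repeat {
    beta.last = beta.hat
    beta.hat = abs(x) - lambda * q * beta.hat^(q - 1)
    if (abs(beta.hat - beta.last) < tol) break
  }
  sign(x) * beta.hat
}
prox.lq(1, lambda=0.1, q=0.5)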
library(inline)
library(Rcpp)
library(RcppArmadillo)
includes.cpp = '
bool isZero(double x) {
  return std::fabs(x) < 1e-7;
}
/*
 * This function is the l_q (0 < q < 1) prox operator
 * plus the T map in step b) of Marjanovic and Solo.
 */
double tau(double beta, double z, double lambda, double q) {
  // Thresholds from the component-wise prox solution.
  double b_lq = std::pow(2.0 * lambda * (1.0 - q), 1.0/(2.0 - q));
  double h_lq = b_lq + lambda * q * std::pow(b_lq, q - 1.0);
  double absz = std::fabs(z);
  double abszh_diff = absz - h_lq;
  double beta_hat;
  if (isZero(abszh_diff)) {
    // |z| == h: keep 0 if the current coefficient is 0, otherwise jump to b_lq.
    beta_hat = isZero(beta) ? 0.0 : b_lq;
    beta_hat *= ((z > 0) - (z < 0));
  } else if (abszh_diff < 0) {
    beta_hat = 0.0;
  } else if (isZero(absz)) {
    beta_hat = 0.0;
  } else {
    // |z| > h: solve beta_hat + lambda*q*beta_hat^(q-1) = |z| by fixed-point
    // iteration, starting from the midpoint of (b_lq, |z|).
    beta_hat = (absz + b_lq)/2.0;
    //printf("beta_hat=%f, b_lq=%f, h_lq=%f, absz=%f, lam=%f, q=%f\\n", beta_hat, b_lq, h_lq, absz, lambda, q);
    double beta_last;
    do {
      beta_last = beta_hat;
      beta_hat = absz - lambda * q * std::pow(std::fabs(beta_hat), q - 1.0);
    } while(std::fabs(beta_hat - beta_last) > 1e-7);
    if (beta_hat < 0.0)
      throw std::runtime_error("beta_hat < 0");
    beta_hat *= ((z > 0) - (z < 0));
  }
  if (std::isnan(beta_hat))
    throw std::runtime_error("beta_hat is nan");
  return(beta_hat);
}
double obj_val(arma::vec &y, arma::mat &X, arma::vec &beta, double sigma, double lambda0, double q) {
  // Objective: Gaussian negative log-likelihood plus the scaled l_q penalty.
  return arma::sum(arma::square(y - X * beta))/(2.0*sigma*sigma)
    + lambda0 * arma::sum(arma::pow(arma::abs(beta), q))/(sigma*sigma);
}
'
main.cpp = '
arma::vec y = Rcpp::as<arma::vec>(r_y);
arma::mat X = Rcpp::as<arma::mat>(r_X);
double lambda = as<double>(r_lambda);
double sigma = as<double>(r_sigma);
double q = as<double>(r_q);
int max_iters = as<int>(r_max_iters);
int N = X.n_rows;
int M = X.n_cols;
printf("N=%i, M=%i, lambda=%f, q=%f, sigma=%f, max_iters=%i\\n", N, M, lambda, q, sigma, max_iters);
arma::vec beta = arma::zeros(M);
arma::vec r = y - X * beta;
int k = 0;
double L_prev = obj_val(y, X, beta, sigma, lambda, q);
double L_diff = 0.0;
do {
  arma::vec beta_last = beta;
  // Cyclic coordinate descent: update each coefficient via the prox/T map.
  for(int i = 0; i < M; i++) {
    double beta_now = beta(i);
    double z = arma::as_scalar(X.col(i).t() * r + beta_now);
    double beta_new = tau(beta_now, z, lambda, q);
    r = r - (beta_new - beta_now) * X.col(i);
    beta(i) = beta_new;
  }
  k++;
  double L = obj_val(y, X, beta, sigma, lambda, q);
  L_diff = std::fabs(L - L_prev);
  L_prev = L;
  //std::cout << "k=" << k << ", beta=" << beta.t();
  //std::cout << ", L=" << L << std::endl;
} while(k < max_iters && L_diff > 1e-7);
return Rcpp::List::create(Rcpp::Named("beta")=beta);
'
ms.lq.algo = cxxfunction(signature(r_X="numeric", r_y="numeric",
                                   r_lambda="numeric", r_q="numeric",
                                   r_sigma="numeric", r_max_iters="integer"),
                         plugin="RcppArmadillo",
                         body=main.cpp, includes=includes.cpp)
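Before running the full experiment, a quick sanity check on a tiny toy problem (my own addition, not part of the original analysis) confirms that the compiled function runs and recovers an obviously sparse signal:
## hypothetical toy problem: 20 observations, 5 unit-norm columns, one active coefficient
X.test = matrix(rnorm(20 * 5), nrow=20, ncol=5)
X.test = X.test %*% diag(1/sqrt(colSums(X.test^2)))
y.test = 2 * X.test[, 2] + rnorm(20, sd=0.1)
ms.lq.algo(X.test, y.test, 0.05, 0.5, 0.1, 1000L)$beta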
We set up the problem dimensions, signal sparsity, and the signal-to-noise ratio (SNR):
set.seed(0)
M = 256
N = 100
SNR = 16.5
n.sparse = ceiling(M * 0.05)
Let \(x^*\) be the true signal, consisting of 13 non-zero \(N(0,1)\) entries.
x.true = matrix(0, M, 1)
x.true[sample(M, size = n.sparse)] = rnorm(n.sparse)
The covariate matrix, \(A\), has dimensions \(100 \times 256\) (i.e. \(N \times M\)), consists of \(N(0,1)\) entries, and is column-normalized:
A = matrix(rnorm(M * N), nrow=N, ncol=M)
Apart = 1/sqrt(colSums(A^2))
A = A %*% diag(Apart)
sigma2_inv = exp(SNR/10 * log(10))/crossprod(A %*% x.true)*M
sigma = sqrt(1/sigma2_inv)
b = rnorm(N, A %*% x.true, sigma)
stopifnot(ncol(A) == M)
stopifnot(length(b) == N)
stopifnot(length(x.true) == M)
The observations are sampled as \(b_i \sim \operatorname{N}\left((A x^*)_i, \sigma^2\right)\) with \(\sigma \approx 0.0369\).
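As a quick check on the setup (my own addition, not in the original post), the columns of \(A\) should have unit norm, and inverting the formula for sigma2_inv above gives the implied SNR definition \(10 \log_{10}\left(\|A x^*\|^2 / (M \sigma^2)\right)\), which should evaluate back to 16.5 dB:
## sanity checks, not in the original post
stopifnot(all(abs(colSums(A^2) - 1) < 1e-8))
10 * log10(crossprod(A %*% x.true)/(M * sigma^2))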
Next, we produce a grid of \(q\) and \(\lambda\) values, over which we’ll evaluate each method:
q.lam.grid = expand.grid(q=seq(0, 1, length.out=20),
                         log10lambda=seq(log10(1e-3),
                                         log10(0.1), length.out=20))
sim.params = cbind(q.lam.grid, 'sigma'=sigma)
library(plyr)
library(gamlr)
ms.lq.results = adply(sim.params, 1,
                      function(params) {
                        lambda = 10^(params$log10lambda)
                        ms.lq.res = ms.lq.algo(A, b,
                                               lambda, params$q,
                                               params$sigma,
                                               10000)
                        MSEb = sqrt(crossprod(x.true - ms.lq.res$beta)/crossprod(x.true))
                        return(data.frame(params, 'MSEb'=MSEb))
                      })
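With the grid evaluated, it is convenient to pull out the best-performing \((q, \lambda)\) pair (a small addition of mine, not in the original post):
## grid point with the smallest relative reconstruction error
ms.lq.results[which.min(ms.lq.results$MSEb), c("q", "log10lambda", "MSEb")]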
library(ggplot2)
ggplot(ms.lq.results, aes(x=q, y=log10lambda, z=log10(MSEb))) +
  geom_tile(aes(fill=log10(MSEb))) +
  ylab(expression(log10(lambda))) +
  ggtitle(paste0("SNR=", SNR, " (sigma=", round(sigma, 3), "), non-zero coef's=", n.sparse))