\[ \newcommand{\argmin}{\operatorname{argmin}} \newcommand{\sign}{\operatorname{sign}} \newcommand{\diag}[1]{\operatorname{diag}(#1)} \newcommand{\prox}[2]{\operatorname{prox}_{#1}\left(#2\right)} \]

Introduction

We’re going to demonstrate an example from the BayesMAP paper 1: a squared-error loss with an \(\ell_q\) penalty for \(0 < q < 1\).

Given \(N\) observations \(b_i\) and an \(N \times M\) covariate matrix \(A\), we are interested in obtaining \[ \begin{gather*} \argmin_x \left\{ F(x) = f(x) + \lambda \sum_{j=1}^M \phi(x_j) \right\} \end{gather*} \] with \(f(x) = \|A x - b\|^2\) and \(\phi(x_j) = |x_j|^q\).
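As a point of reference, here is a direct R transcription of this objective (obj.F is a hypothetical helper for illustration only; the \(\sigma^2\) scaling used by the Rcpp obj_val function below is omitted here):

obj.F = function(x, A, b, lambda, q) {
  ## F(x) = ||A x - b||^2 + lambda * sum_j |x_j|^q
  sum((A %*% x - b)^2) + lambda * sum(abs(x)^q)
}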

Methods

Marjanovic and Solo

This approach uses the proximal operator within a cyclic coordinate descent algorithm (see 2 and 3). The \(\operatorname{prox}\) operator for the \(q\)-norm penalty has an exact solution given, component-wise, by \[ \begin{align*} \prox{\lambda \phi_q}{x} &= \begin{cases} 0 & \text{ if } |x| < h_{\lambda,q} \\ \{0, \sign(x)\, b_{\lambda,q} \} & \text{ if } |x| = h_{\lambda,q} \\ \sign(x)\, \hat{\beta} & \text{ if } |x| > h_{\lambda,q} \\ \end{cases} \end{align*} \] where \[ \begin{align*} b_{\lambda,q} &= \left(2 \lambda (1-q)\right)^{\frac{1}{2-q}} \\ h_{\lambda,q} &= b_{\lambda,q} + \lambda q\, b_{\lambda,q}^{q-1} \\ \hat{\beta} + \lambda q \hat{\beta}^{q-1} &= |x|, \quad \hat{\beta} \in (b_{\lambda, q},|x|) \end{align*} \]
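Before moving to the Rcpp version, here is a minimal R sketch of this scalar prox operator (the name prox.lq is ours, not from the paper). It solves the \(\hat{\beta}\) equation by the fixed-point iteration \(\beta \leftarrow |x| - \lambda q \beta^{q-1}\), started inside \((b_{\lambda,q}, |x|)\), and at the tie \(|x| = h_{\lambda,q}\) it simply returns the non-zero solution:

prox.lq = function(x, lambda, q, tol=1e-7) {
  b.lq = (2 * lambda * (1 - q))^(1/(2 - q))
  h.lq = b.lq + lambda * q * b.lq^(q - 1)
  if (abs(x) < h.lq) return(0)
  ## fixed-point iteration for beta + lambda*q*beta^(q-1) = |x|
  beta = (abs(x) + b.lq)/2
  repeat {
    beta.new = abs(x) - lambda * q * beta^(q - 1)
    if (abs(beta.new - beta) < tol) break
    beta = beta.new
  }
  sign(x) * beta.new
}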

Rcpp Implementation

library(inline)
library(Rcpp)
library(RcppArmadillo)
includes.cpp = '
bool isZero(double x) {
  return std::fabs(x) < 1e-7;
}

/*
 * This function is the l_q (0 < q < 1) prox operator
 * plus the T map in step b) of Marjanovic and Solo.
 *
 */
double tau(double beta, double z, double lambda, double q) {
  // Thresholds from the prox solution: b_lq = (2*lambda*(1-q))^(1/(2-q)) and
  // h_lq = b_lq + lambda*q*b_lq^(q-1).
  double b_lq = std::pow(2.0 * lambda * (1.0 - q), 1.0/(2.0-q));
  double h_lq = b_lq + lambda * q * std::pow(b_lq, q-1);
  double absz = std::fabs(z);
  double abszh_diff = absz - h_lq;
  double beta_hat;
  if (isZero(abszh_diff)) {
    beta_hat = isZero(beta) ? 0.0 : b_lq;
    beta_hat *= ((z > 0) - (z < 0));
  } else if (abszh_diff < 0) {
    beta_hat = 0.0;
  } else if (isZero(absz)) {
    beta_hat = 0.0;
  } else {
    beta_hat = (absz - b_lq)/2.0;
    //printf("beta_hat=%f, b_lq=%f, h_lq=%f, absz=%f, lam=%f, q=%f\\n", beta_hat, b_lq, h_lq, absz, lambda, q);
    double beta_last;
    do {
      beta_last = beta_hat;
      beta_hat = absz - lambda * q * std::pow(std::fabs(beta_hat), q-1);
    } while(std::fabs(beta_hat - beta_last) > 1e-7);

    if (beta_hat < 0.0)
      throw new std::runtime_error("beta_hat < 0");

    beta_hat *= ((z > 0) - (z < 0));
  }
  if (std::isnan(beta_hat))
    throw new std::runtime_error("beta_hat is nan");
  return(beta_hat);
}

double obj_val(arma::vec &y, arma::mat &X, arma::vec &beta, double sigma, double lambda0, double q) {
  return arma::sum(arma::square(y - X * beta))/(2.0*sigma*sigma) 
    + lambda0 * arma::sum(arma::pow(arma::abs(beta), q))/(sigma*sigma);
}
' 

main.cpp = '
arma::vec y = Rcpp::as<arma::vec>(r_y);
arma::mat X = Rcpp::as<arma::mat>(r_X);
double lambda = as<double>(r_lambda);
double sigma = as<double>(r_sigma);
double q = as<double>(r_q);
int max_iters = as<int>(r_max_iters);

int N = X.n_rows;
int M = X.n_cols;

printf("N=%i, M=%i, lambda=%f, q=%f, sigma=%f, max_iters=%i\\n", N, M, lambda, q, sigma, max_iters);

arma::vec beta = arma::zeros(M);
arma::vec r = y - X * beta;

int k = 0;
double L_prev = obj_val(y, X, beta, sigma, lambda, q);
double L_diff = 0.0; 
do {
  arma::vec beta_last = beta;
  for(int i=0; i<M; i++){
    double beta_now = beta(i);
    double z = arma::as_scalar(X.col(i).t() * r + beta_now);  
    double beta_new = tau(beta_now, z, lambda, q);
    r = r - (beta_new - beta_now) * X.col(i);
    beta(i) = beta_new;
  }
  k++;

  double L = obj_val(y, X, beta, sigma, lambda, q);
  L_diff = std::fabs(L - L_prev);
  L_prev = L;

  //std::cout << "k=" << k << ", beta=" << beta.t();
  //std::cout << ", L=" << L << std::endl;

} while(k < max_iters && L_diff > 1e-7);

return Rcpp::List::create(Rcpp::Named("beta")=beta);
'
ms.lq.algo = cxxfunction(signature(r_X="numeric", r_y="numeric", 
                                   r_lambda="numeric", r_q="numeric",
                                   r_sigma="numeric", r_max_iters="integer"), 
                         plugin="RcppArmadillo", 
                         body=main.cpp, includes=includes.cpp)    

Simulation

We set up the problem dimensions, signal sparsity, and the signal-to-noise ratio (SNR):

set.seed(0)
M = 256
N = 100
SNR = 16.5
n.sparse = ceiling(M * 0.05)

Let \(x^*\) be the true signal, consisting of 13 non-zero \(N(0,1)\) entries:

x.true = matrix(0, M, 1)
x.true[sample(M, size = n.sparse)] = rnorm(n.sparse)

The covariate matrix, \(A\), has dimensions \(100 \times 256\) (\(N \times M\)), has \(N(0,1)\) entries, and is column-normalized:

A = matrix(rnorm(M * N), nrow=N, ncol=M)
Apart = 1/sqrt(colSums(A^2))
A = A %*% diag(Apart)

sigma2_inv = exp(SNR/10 * log(10))/crossprod(A %*% x.true)*M
sigma = sqrt(1/sigma2_inv)

b = rnorm(N, A %*% x.true, sigma)

stopifnot(ncol(A) == M)
stopifnot(length(b) == N)
stopifnot(length(x.true) == M)

The observations are sampled as \(b_i \sim \operatorname{N}\left((A x^*)_i, \sigma^2\right)\) with \(\sigma \approx 0.0369\).
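Reading off the code above, \(\sigma\) is determined from the SNR (treated as a value in decibels) by \[ \sigma^2 = \frac{\|A x^*\|^2}{M \, 10^{\mathrm{SNR}/10}}. \]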

Next, we produce a grid of \(q\) and \(\lambda\) values, over which we’ll evaluate each method:

q.lam.grid = expand.grid(q=seq(0,1,length.out=20), 
                         log10lambda=seq(log10(1e-3), 
                                         log10(0.1), length.out=20)) 
sim.params = cbind(q.lam.grid, 'sigma'=sigma)

Results

library(plyr)
library(gamlr)
ms.lq.results = adply(sim.params, 1,
                      function(params) {
                        lambda = 10^(params$log10lambda)
                        ms.lq.res = ms.lq.algo(A, b, 
                                               lambda, params$q, 
                                               params$sigma, 
                                               10000)
                        MSEb = sqrt(crossprod(x.true - ms.lq.res$beta)/crossprod(x.true))
                        return(data.frame(params, 'MSEb'=MSEb))
                      })
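The column labelled MSEb above is the relative \(\ell_2\) error of the estimate, \[ \mathrm{MSEb} = \frac{\|x^* - \hat{\beta}\|_2}{\|x^*\|_2}, \] which we plot on a \(\log_{10}\) scale over the \((q, \log_{10}\lambda)\) grid: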
library(ggplot2)

ggplot(ms.lq.results, aes(x=q, y=log10lambda, z=log10(MSEb))) +
geom_tile(aes(fill=log10(MSEb))) + 
ylab(expression(log10(lambda))) +
ggtitle(paste0("SNR=",SNR, " (sigma=", round(sigma,3), "), non-zero coef's=", n.sparse))

Figure: heat map of \(\log_{10}(\mathrm{MSEb})\) over the \((q, \log_{10}\lambda)\) grid (chunk plot_ms_results).


  1. “BayesMAP”, Walter Dempsey, Nicholas Polson, and Brandon Willard.

  2. “On \(\ell_q\) Estimation of Sparse Inverse Covariance”, Marjanovic and Solo (link).

  3. “\(\ell_q\) Sparsity Penalized Linear Regression With Cyclic Descent”, Marjanovic and Solo (link).