Setup code

We benchmark RmlxStats against base R and against specialized fast-fitting packages, across varying numbers of cases (n) and predictors (p). We also check accuracy.

Benchmarks were run on an M2 MacBook Air.
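The setup chunk is collapsed above; the sketch below shows what it minimally needs to contain, inferred from the code in this article (the develop flag comes from the vignette's YAML params):

library(bench)      # mark()
library(fixest)     # feols()
library(RmlxStats)  # mlxs_lm(), mlxs_glm(), mlxs_cv_glmnet(), mlxs_prcomp(), mlxs_binomial()

# Packages called with :: below (Rmlx, RcppEigen, speedglm, glmnet, irlba,
# rsvd, bigstatsr, boot, lmboot) only need to be installed.

# params$develop is set in the vignette's YAML params; FALSE for a full run.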

Metadata
Generated on 2026-05-03
Commit 773a23d
Branch master
Rmlx version 0.2.3

Benchmarking Code

Data Generation

Code
set.seed(20251111)

n_sizes <- c(10000, 50000, 250000) 
p_sizes <- c(50, 100, 200, 400, 800)
n_max <- max(n_sizes)
p_max <- max(p_sizes)

X <- matrix(rnorm(n_max * p_max), nrow = n_max, ncol = p_max)
colnames(X) <- paste0("x", seq_len(p_max))

X_glmnet <- matrix(rnorm(n_max * p_max), nrow = n_max, ncol = p_max)
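# induce an AR(1)-style correlation (rho = 0.6) between neighboring predictors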
for (j in 2:p_max) {
  X_glmnet[, j] <- 0.6 * X_glmnet[, j - 1] + sqrt(1 - 0.6^2) * X_glmnet[, j]
}
colnames(X_glmnet) <- paste0("x", seq_len(p_max))

beta_true <- rnorm(p_max, mean = 0, sd = 0.5)
y_continuous <- drop(X %*% beta_true + rnorm(n_max, sd = 5))

# only 1 in 10 predictors matter:
beta_glmnet_true <- rep(0, p_max)
beta_glmnet_true[seq(10, p_max, 10)] <- rnorm(length(seq(10, p_max, 10)), sd = 0.35)
y_sparse <- drop(X_glmnet %*% beta_glmnet_true + rnorm(n_max, sd = 5))

linpred <- drop(X %*% beta_true) / 10
prob <- 1 / (1 + exp(-linpred))
y_binary <- rbinom(n_max, size = 1, prob = prob)

full_data <- data.frame(
  y_cont = y_continuous,
  y_bin = y_binary,
  y_sparse = y_sparse,
  X
)

# for fast debugging
if (params$develop) {
  n_sizes <- n_sizes/10
  p_sizes <- p_sizes/5
}

bench_grid <- expand.grid(
  n = n_sizes,
  p = p_sizes,
  stringsAsFactors = FALSE
)

bench_grid <- bench_grid[bench_grid$n > bench_grid$p, ]

all_results <- data.frame()
all_accuracy <- data.frame()

Helper functions

relative_rmse <- function(estimate, truth) {
  scale <- sqrt(mean(truth^2))
  sqrt(mean((estimate - truth)^2)) / max(scale, 1e-12)
}

safe_relative_rmse <- function(estimate, truth) {
  estimate <- as.numeric(estimate)
  truth <- as.numeric(truth)

  if (anyNA(estimate) || anyNA(truth) ||
      any(!is.finite(estimate)) || any(!is.finite(truth))) {
    return(NA_real_)
  }

  relative_rmse(estimate, truth)
}

make_pca_fixture <- function(n, p, rank_k, noise_sd = 2) {
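  # Build a low-rank signal: orthonormal score and loading bases with slowly
  # decaying singular values, then add isotropic Gaussian noise of sd noise_sd.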
  scores_raw <- qr.Q(qr(matrix(rnorm(n * rank_k), nrow = n, ncol = rank_k)))
  scores_raw <- scale(scores_raw, center = TRUE, scale = FALSE)
  scores_basis <- qr.Q(qr(scores_raw))
  rotation <- qr.Q(qr(matrix(rnorm(p * rank_k), nrow = p, ncol = rank_k)))

  sdev_true <- seq(from = 2.5, to = 1.5, length.out = rank_k)
  singular_values <- sdev_true * sqrt(n - 1)
  x_signal <- scores_basis %*% diag(singular_values, nrow = rank_k) %*% 
    t(rotation)
  x <- x_signal + matrix(rnorm(n * p, sd = noise_sd), nrow = n, ncol = p)

  colnames(x) <- paste0("pcx", seq_len(p))

  list(
    x = x,
    rotation = rotation,
    sdev = sdev_true,
    rank = rank_k
  )
}

extract_pca_rotation <- function(fit) {
  if (inherits(fit, "big_SVD")) {
    fit$v
  } else {
    fit$rotation
  }
}

extract_pca_sdev <- function(fit) {
  if (inherits(fit, "big_SVD")) {
    fit$d / sqrt(nrow(fit$u) - 1)
  } else {
    fit$sdev
  }
}

projector_error <- function(estimate, truth) {
  estimate <- as.matrix(estimate)
  truth <- as.matrix(truth)
  proj_estimate <- estimate %*% t(estimate)
  proj_truth <- truth %*% t(truth)
  sqrt(sum((proj_estimate - proj_truth)^2)) / sqrt(sum(proj_truth^2))
}

pca_accuracy_score <- function(fit, truth_rotation, truth_sdev) {
  projector_error(extract_pca_rotation(fit), truth_rotation) +
    relative_rmse(
      as.numeric(extract_pca_sdev(fit)[seq_along(truth_sdev)]),
      truth_sdev
    )
}

make_bootstrap_data <- function(n, p) {
  x <- X[1:n, 1:p, drop = FALSE]
  colnames(x) <- paste0("x", seq_len(p))
  beta <- beta_true[seq_len(p)]
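  # heteroskedastic errors: the noise sd grows with |x1|, so classical
  # i.i.d.-based standard errors would be off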
  sigma <- 1 + 2 * abs(x[, 1])
  y <- drop(x %*% beta + rnorm(n, sd = sigma))

  data.frame(y_boot = y, x)
}

bootstrap_oracle_se <- function(x, beta, formula, reps) {
  sigma <- 1 + 2 * abs(x[, 1])
  estimates <- replicate(reps, {
    y <- drop(x %*% beta + rnorm(nrow(x), sd = sigma))
    coef(lm(formula, data = data.frame(y_boot = y, x)))
  })
  apply(t(estimates), 2, sd)
}

extract_bootstrap_se <- function(method, fit) {
  if (identical(method, "stats::lm")) {
    return(as.numeric(fit))
  }

  if (identical(method, "boot::boot")) {
    return(apply(fit$t, 2, sd))
  }

  if (grepl("^lmboot::", method)) {
    return(apply(fit$bootEstParam, 2, sd))
  }

  as.numeric(fit$std.error)
}

mlxs_lm

Code
lm_results <- list()
lm_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[3], ]
lm_accuracy <- list()

for (i in seq_len(nrow(lm_grid))) {
  n <- lm_grid$n[i]
  p <- lm_grid$p[i]

  subset_data <- full_data[1:n, c("y_cont", paste0("x", 1:p))]
  lm_formula <- reformulate(paste0("x", 1:p), response = "y_cont")
  beta_target <- beta_true[seq_len(p)]
  fitters <- list(
    "stats::lm" = function() lm(lm_formula, data = subset_data),
    "RmlxStats::mlxs_lm" = function() {
      fit <- mlxs_lm(lm_formula, data = subset_data)
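      # force Rmlx's lazy evaluation so the fit is fully computed inside
      # the timed expression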
      Rmlx::mlx_eval(fit$coefficients)
      fit
    },
    "fixest::feols" = function() feols(lm_formula, data = subset_data),
    "RcppEigen::fastLm" = function() RcppEigen::fastLm(
      lm_formula,
      data = subset_data
    ),
    "speedglm::speedlm" = function() speedglm::speedlm(
      lm_formula,
      data = subset_data
    )
  )

  bm <- mark(
    "stats::lm" = fitters[["stats::lm"]](),
    "RmlxStats::mlxs_lm" = fitters[["RmlxStats::mlxs_lm"]](),
    "fixest::feols" = fitters[["fixest::feols"]](),
    "RcppEigen::fastLm" = fitters[["RcppEigen::fastLm"]](),
    "speedglm::speedlm" = fitters[["speedglm::speedlm"]](),
    iterations = 3,
    check = function (r1, r2) {
      all.equal(as.vector(coef(r1)), as.vector(coef(r2)), 
                tolerance = 1e-6)
    },
    filter_gc = FALSE
  )

  bm$n <- n
  bm$p <- p
  bm$model_type <- "lm"
  lm_results[[i]] <- bm

  fits <- lapply(fitters, function(fit_method) fit_method())
  scores <- lapply(names(fits), function(method) {
    beta_hat <- as.numeric(coef(fits[[method]]))[-1]
    data.frame(
      model_type = "lm",
      n = n,
      p = p,
      method = method,
      accuracy = relative_rmse(beta_hat, beta_target),
      stringsAsFactors = FALSE
    )
  })
  lm_accuracy[[i]] <- do.call(rbind, scores)
}

lm_df <- do.call(rbind, lm_results)
all_results <- rbind(all_results, lm_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, lm_accuracy))

mlxs_glm

Code
glm_results <- list()
glm_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[3], ]
glm_accuracy <- list()

for (i in seq_len(nrow(glm_grid))) {
  n <- glm_grid$n[i]
  p <- glm_grid$p[i]

  subset_data <- full_data[1:n, c("y_bin", paste0("x", 1:p))]
  glm_formula <- reformulate(paste0("x", 1:p), response = "y_bin")
  beta_target <- beta_true[seq_len(p)] / 5
  fitters <- list(
    "stats::glm" = function() glm(
      glm_formula,
      family = binomial(),
      data = subset_data,
      control = list(maxit = 50)
    ),
    "RmlxStats::mlxs_glm" = function() {
      fit <- mlxs_glm(
        glm_formula,
        family = mlxs_binomial(),
        data = subset_data,
        control = list(maxit = 50, epsilon = 1e-5)
      )
      Rmlx::mlx_eval(fit$coefficients)
      fit
    },
    "speedglm::speedglm" = function() speedglm::speedglm(
      glm_formula,
      family = binomial(),
      data = subset_data
    )
  )

  bm <- mark(
    "stats::glm" = fitters[["stats::glm"]](),
    "RmlxStats::mlxs_glm" = fitters[["RmlxStats::mlxs_glm"]](),
    "speedglm::speedglm" = fitters[["speedglm::speedglm"]](),
    iterations = 3,
    check = function (r1, r2) {
      all.equal(as.vector(coef(r1)), as.vector(coef(r2)), 
                tolerance = 1e-4)
    },
    filter_gc = FALSE
  )

  bm$n <- n
  bm$p <- p
  bm$model_type <- "glm"
  glm_results[[i]] <- bm

  fits <- lapply(fitters, function(fit_method) fit_method())
  scores <- lapply(names(fits), function(method) {
    beta_hat <- as.numeric(coef(fits[[method]]))[-1]
    data.frame(
      model_type = "glm",
      n = n,
      p = p,
      method = method,
      accuracy = relative_rmse(beta_hat, beta_target),
      stringsAsFactors = FALSE
    )
  })
  glm_accuracy[[i]] <- do.call(rbind, scores)
}

glm_df <- do.call(rbind, glm_results)
all_results <- rbind(all_results, glm_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, glm_accuracy))

mlxs_cv_glmnet

Code
glmnet_results <- list()
glmnet_grid <- bench_grid
glmnet_accuracy <- list()
glmnet_nfolds <- 3L

for (i in seq_len(nrow(glmnet_grid))) {
  n <- glmnet_grid$n[i]
  p <- glmnet_grid$p[i]

  xvars <- paste0("x", 1:p)
  x <- X_glmnet[1:n, xvars, drop = FALSE]
  y_sparse_subset <- y_sparse[1:n]
  beta_target <- beta_glmnet_true[seq_len(p)]
  stats_time <- system.time({
    fit_stats <- glmnet::cv.glmnet(
      x,
      y_sparse_subset,
      nfolds = glmnet_nfolds
    )
    as.numeric(fit_stats$lambda.min)
  })[["elapsed"]]
  mlx_time <- system.time({
    fit_mlx <- mlxs_cv_glmnet(
      x,
      y_sparse_subset,
      nfolds = glmnet_nfolds
    )
    Rmlx::mlx_eval(fit_mlx$glmnet.fit$x_center)
    Rmlx::mlx_eval(fit_mlx$glmnet.fit$x_scale)
    as.numeric(fit_mlx$lambda.min)
  })[["elapsed"]]

  bm <- data.frame(
    expression = c("glmnet::cv.glmnet", "RmlxStats::mlxs_cv_glmnet"),
    median = bench::as_bench_time(c(stats_time, mlx_time)),
    stringsAsFactors = FALSE
  )

  bm$n <- n
  bm$p <- p
  bm$model_type <- "glmnet"
  for (name in setdiff(names(all_results), names(bm))) {
    bm[[name]] <- NA
  }
  bm <- bm[, names(all_results), drop = FALSE]
  glmnet_results[[i]] <- bm

  fits <- list(
    "glmnet::cv.glmnet" = fit_stats,
    "RmlxStats::mlxs_cv_glmnet" = fit_mlx
  )
  scores <- lapply(names(fits), function(method) {
    beta_hat <- as.numeric(coef(fits[[method]], s = "lambda.min"))[-1]
    data.frame(
      model_type = "glmnet",
      n = n,
      p = p,
      method = method,
      accuracy = safe_relative_rmse(beta_hat, beta_target),
      stringsAsFactors = FALSE
    )
  })
  glmnet_accuracy[[i]] <- do.call(rbind, scores)
}

glmnet_df <- do.call(rbind, glmnet_results)
all_results <- rbind(all_results, glmnet_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, glmnet_accuracy))

mlxs_prcomp

Code
pca_results <- list()
pca_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[4], ]
pca_accuracy <- list()

for (i in seq_len(nrow(pca_grid))) {
  n <- pca_grid$n[i]
  p <- pca_grid$p[i]

  rank_k <- min(8L, max(4L, floor(min(n, p) / 2)))
  pca_fixture <- make_pca_fixture(n, p, rank_k)
  x <- pca_fixture$x
  x_big <- bigstatsr::FBM(nrow = n, ncol = p, init = as.matrix(x))
  rotation_true <- pca_fixture$rotation
  fitters <- list(
    "stats::prcomp" = function() prcomp(
      x,
      center = TRUE,
      scale. = FALSE,
      rank. = rank_k
    ),
    "irlba::prcomp_irlba" = function() irlba::prcomp_irlba(
      x,
      n = rank_k,
      center = TRUE,
      scale. = FALSE
    ),
    "rsvd::rpca" = function() rsvd::rpca(
      x,
      k = rank_k,
      center = TRUE,
      scale = FALSE
    ),
    "bigstatsr::big_randomSVD" = function() bigstatsr::big_randomSVD(
      x_big,
      fun.scaling = bigstatsr::big_scale(center = TRUE, scale = FALSE),
      k = rank_k
    ),
    "RmlxStats::mlxs_prcomp" = function() {
      fit <- mlxs_prcomp(
        x,
        center = TRUE,
        scale. = FALSE,
        rank. = rank_k,
        n_iter = 2,
        seed = 1
      )
      Rmlx::mlx_eval(fit$rotation)
      if (!is.null(fit$x)) {
        Rmlx::mlx_eval(fit$x)
      }
      fit
    }
  )

  bm <- mark(
    "stats::prcomp" = fitters[["stats::prcomp"]](),
    "irlba::prcomp_irlba" = fitters[["irlba::prcomp_irlba"]](),
    "rsvd::rpca" = fitters[["rsvd::rpca"]](),
    "bigstatsr::big_randomSVD" = fitters[["bigstatsr::big_randomSVD"]](),
    "RmlxStats::mlxs_prcomp" = fitters[["RmlxStats::mlxs_prcomp"]](),
    iterations = 3,
    check = FALSE,
    filter_gc = FALSE
  )

  bm$n <- n
  bm$p <- p
  bm$model_type <- "pca"
  pca_results[[i]] <- bm

  fits <- lapply(fitters, function(fit_method) fit_method())
  scores <- lapply(names(fits), function(method) {
    data.frame(
      model_type = "pca",
      n = n,
      p = p,
      method = method,
      accuracy = pca_accuracy_score(
        fits[[method]],
        rotation_true,
        pca_fixture$sdev
      ),
      stringsAsFactors = FALSE
    )
  })
  pca_accuracy[[i]] <- do.call(rbind, scores)
}

pca_df <- do.call(rbind, pca_results)
all_results <- rbind(all_results, pca_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, pca_accuracy))

Bootstrap

For the bootstrap benchmarks we use only smaller datasets, due to the computational cost.

Code
boot_results <- list()
boot_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[2], ]
boot_accuracy <- list()
bootstrap_reps <- if (params$develop) 50L else 100L
oracle_reps <- if (params$develop) 50L else 200L

for (i in seq_len(nrow(boot_grid))) {
  n <- boot_grid$n[i]
  p <- boot_grid$p[i]

  subset_data <- make_bootstrap_data(n, p)
  x <- as.matrix(subset_data[paste0("x", seq_len(p))])
  beta_target <- beta_true[seq_len(p)]
  formula_str <- paste("y_boot ~", paste(paste0("x", 1:p), collapse = " + "))
  boot_formula <- as.formula(formula_str)
  fit_mlxs <- mlxs_lm(boot_formula, data = subset_data)
  oracle_se <- bootstrap_oracle_se(x, beta_target, boot_formula, oracle_reps)
  oracle_se <- oracle_se[-1]

  boot_stat <- function(dat, idx) {
    coef(lm(boot_formula, data = dat[idx, , drop = FALSE]))
  }
  fitters <- list(
    "boot::boot" = function() boot::boot(
      subset_data,
      statistic = boot_stat,
      R = bootstrap_reps,
      parallel = "no"
    ),
    "lmboot::paired.boot" = function() lmboot::paired.boot(
      boot_formula,
      data = subset_data,
      B = bootstrap_reps
    ),
    "lmboot::residual.boot" = function() lmboot::residual.boot(
      boot_formula,
      data = subset_data,
      B = bootstrap_reps
    ),
    "mlxs_case" = function() {
      summary(
        fit_mlxs,
        bootstrap = TRUE,
        bootstrap_args = list(
          B = bootstrap_reps,
          seed = 42,
          bootstrap_type = "case",
          progress = FALSE
        )
      )
    },
    "mlxs_resid" = function() {
      summary(
        fit_mlxs,
        bootstrap = TRUE,
        bootstrap_args = list(
          B = bootstrap_reps,
          seed = 42,
          bootstrap_type = "resid",
          progress = FALSE
        )
      )
    }
  )

  bm <- mark(
    "boot::boot" = fitters[["boot::boot"]](),
    "lmboot::paired.boot" = fitters[["lmboot::paired.boot"]](),
    "lmboot::residual.boot" = fitters[["lmboot::residual.boot"]](),
    "mlxs_case" = {
      fit <- fitters[["mlxs_case"]]()
      Rmlx::mlx_eval(fit$std.error)
      fit
    },
    "mlxs_resid" = {
      fit <- fitters[["mlxs_resid"]]()
      Rmlx::mlx_eval(fit$std.error)
      fit
    },
    iterations = 3,
    check = FALSE,
    filter_gc = FALSE,
    memory = FALSE
  )

  bm$n <- n
  bm$p <- p
  bm$model_type <- "Bootstrap"
  boot_results[[i]] <- bm

  fits <- lapply(fitters, function(fit_method) fit_method())
  scores <- lapply(names(fits), function(method) {
    se_hat <- extract_bootstrap_se(method, fits[[method]])[-1]
    data.frame(
      model_type = "Bootstrap",
      n = n,
      p = p,
      method = method,
      accuracy = relative_rmse(se_hat, oracle_se),
      stringsAsFactors = FALSE
    )
  })
  boot_accuracy[[i]] <- do.call(rbind, scores)
}

boot_df <- do.call(rbind, boot_results)
all_results <- rbind(all_results, boot_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, boot_accuracy))

Results: Speed

We compare each function both against the base R implementation and against the fastest alternative tested.

Display code
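The display code itself is collapsed; the sketch below is illustrative only (it is not the vignette's display code) and assumes a data frame timings, i.e. all_results flattened to plain columns model_type, n, p, method and median_s (median seconds).

base_methods <- c(lm = "stats::lm", glm = "stats::glm",
                  pca = "stats::prcomp", Bootstrap = "boot::boot")

rel_timings <- do.call(rbind, lapply(
  split(timings, timings[c("model_type", "n", "p")], drop = TRUE),
  function(cell) {
    base_time <- cell$median_s[match(base_methods[cell$model_type[1]],
                                     cell$method)]
    cell$rel_to_best <- 100 * cell$median_s / min(cell$median_s)
    cell$rel_to_base <- 100 * cell$median_s / base_time  # NA where no base exists
    cell
  }
))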

mlxs_lm

Data table
Method Median seconds Rel. to base Rel. to best
n = 10000, p = 50
fixest::feols 0.011   44.8% 108.0%
RcppEigen::fastLm 0.013   53.6% 129.2%
speedglm::speedlm 0.021   86.1% 207.7%
stats::lm 0.024   100.0% 241.3%
RmlxStats::mlxs_lm 0.010   41.5% 100.0%
n = 50000, p = 50
fixest::feols 0.034   29.7% 100.0%
RcppEigen::fastLm 0.065   57.3% 193.1%
speedglm::speedlm 0.093   81.6% 275.0%
stats::lm 0.11    100.0% 337.1%
RmlxStats::mlxs_lm 0.038   33.4% 112.5%
n = 10000, p = 100
fixest::feols 0.027   33.0% 132.1%
RcppEigen::fastLm 0.035   42.8% 171.5%
speedglm::speedlm 0.069   84.9% 339.7%
stats::lm 0.081   100.0% 400.2%
RmlxStats::mlxs_lm 0.020   25.0% 100.0%
n = 50000, p = 100
fixest::feols 0.10    26.4% 130.6%
RcppEigen::fastLm 0.18    46.6% 230.1%
speedglm::speedlm 0.43    109.5% 540.9%
stats::lm 0.39    100.0% 494.2%
RmlxStats::mlxs_lm 0.080   20.2% 100.0%
n = 10000, p = 200
fixest::feols 0.086   28.8% 217.2%
RcppEigen::fastLm 0.12    39.5% 298.2%
speedglm::speedlm 0.26    85.6% 646.2%
stats::lm 0.30    100.0% 754.6%
RmlxStats::mlxs_lm 0.040   13.3% 100.0%
n = 50000, p = 200
fixest::feols 0.38    23.3% 192.9%
RcppEigen::fastLm 0.65    39.8% 329.2%
speedglm::speedlm 1.3     79.2% 655.5%
stats::lm 1.6     100.0% 827.5%
RmlxStats::mlxs_lm 0.20    12.1% 100.0%
Base is the base R implementation in the 'stats' or 'boot' packages.

mlxs_glm

Data table
Method Median seconds Rel. to base Rel. to best
n = 10000, p = 50
speedglm::speedglm 0.079   74.4% 223.2%
stats::glm 0.11    100.0% 300.1%
RmlxStats::mlxs_glm 0.035   33.3% 100.0%
n = 50000, p = 50
speedglm::speedglm 0.39    76.5% 365.5%
stats::glm 0.50    100.0% 477.8%
RmlxStats::mlxs_glm 0.11    20.9% 100.0%
n = 10000, p = 100
speedglm::speedglm 0.28    84.3% 440.3%
stats::glm 0.33    100.0% 522.5%
RmlxStats::mlxs_glm 0.063   19.1% 100.0%
n = 50000, p = 100
speedglm::speedglm 1.3     78.5% 507.9%
stats::glm 1.7     100.0% 647.2%
RmlxStats::mlxs_glm 0.26    15.5% 100.0%
n = 10000, p = 200
speedglm::speedglm 0.98    80.3% 861.3%
stats::glm 1.2     100.0% 1072.2%
RmlxStats::mlxs_glm 0.11    9.3% 100.0%
n = 50000, p = 200
speedglm::speedglm 5.0     83.6% 1065.0%
stats::glm 6.0     100.0% 1274.6%
RmlxStats::mlxs_glm 0.47    7.8% 100.0%
Base is the base R implementation in the 'stats' or 'boot' packages.

mlxs_cv_glmnet

Data table
Method Median seconds Rel. to base Rel. to best
n = 10000, p = 50
glmnet::cv.glmnet 0.13        100.0%
RmlxStats::mlxs_cv_glmnet 2.0         1494.7%
n = 50000, p = 50
glmnet::cv.glmnet 0.42        100.0%
RmlxStats::mlxs_cv_glmnet 1.8         420.8%
n = 250000, p = 50
glmnet::cv.glmnet 1.7         100.0%
RmlxStats::mlxs_cv_glmnet 2.3         134.1%
n = 10000, p = 100
glmnet::cv.glmnet 0.11        100.0%
RmlxStats::mlxs_cv_glmnet 2.2         2112.3%
n = 50000, p = 100
glmnet::cv.glmnet 0.46        100.0%
RmlxStats::mlxs_cv_glmnet 1.9         423.4%
n = 250000, p = 100
glmnet::cv.glmnet 2.8         102.5%
RmlxStats::mlxs_cv_glmnet 2.8         100.0%
n = 10000, p = 200
glmnet::cv.glmnet 0.36        100.0%
RmlxStats::mlxs_cv_glmnet 7.9         2157.4%
n = 50000, p = 200
glmnet::cv.glmnet 1.1         100.0%
RmlxStats::mlxs_cv_glmnet 2.4         215.6%
n = 250000, p = 200
glmnet::cv.glmnet 7.1         157.0%
RmlxStats::mlxs_cv_glmnet 4.5         100.0%
n = 10000, p = 400
glmnet::cv.glmnet 0.76        100.0%
RmlxStats::mlxs_cv_glmnet 20.          2600.0%
n = 50000, p = 400
glmnet::cv.glmnet 3.9         112.8%
RmlxStats::mlxs_cv_glmnet 3.4         100.0%
n = 250000, p = 400
glmnet::cv.glmnet 23.          263.7%
RmlxStats::mlxs_cv_glmnet 8.6         100.0%
n = 10000, p = 800
glmnet::cv.glmnet 2.3         100.0%
RmlxStats::mlxs_cv_glmnet 48.          2044.5%
n = 50000, p = 800
glmnet::cv.glmnet 9.3         100.0%
RmlxStats::mlxs_cv_glmnet 81.          872.1%
n = 250000, p = 800
glmnet::cv.glmnet 44.          191.8%
RmlxStats::mlxs_cv_glmnet 23.          100.0%
There is no base R equivalent for penalized regression, so only the comparison with the fastest method ('Rel. to best') is shown.

mlxs_prcomp

Data table
Method Median seconds Rel. to base Rel. to best
n = 10000, p = 50
bigstatsr::big_randomSVD 0.033   60.0% 324.7%
irlba::prcomp_irlba 0.028   52.1% 282.3%
rsvd::rpca 0.083   152.0% 823.2%
stats::prcomp 0.054   100.0% 541.5%
RmlxStats::mlxs_prcomp 0.010   18.5% 100.0%
n = 50000, p = 50
bigstatsr::big_randomSVD 0.12    42.7% 407.4%
irlba::prcomp_irlba 0.12    42.9% 410.0%
rsvd::rpca 0.42    147.0% 1404.0%
stats::prcomp 0.28    100.0% 954.9%
RmlxStats::mlxs_prcomp 0.030   10.5% 100.0%
n = 10000, p = 100
bigstatsr::big_randomSVD 0.064   30.6% 436.7%
irlba::prcomp_irlba 0.057   27.2% 387.9%
rsvd::rpca 0.14    67.0% 956.7%
stats::prcomp 0.21    100.0% 1428.5%
RmlxStats::mlxs_prcomp 0.015   7.0% 100.0%
n = 50000, p = 100
bigstatsr::big_randomSVD 0.22    20.6% 480.7%
irlba::prcomp_irlba 0.23    21.2% 494.2%
rsvd::rpca 0.70    65.4% 1525.1%
stats::prcomp 1.1     100.0% 2331.1%
RmlxStats::mlxs_prcomp 0.046   4.3% 100.0%
n = 10000, p = 200
bigstatsr::big_randomSVD 0.10    11.5% 684.8%
irlba::prcomp_irlba 0.10    11.4% 681.1%
rsvd::rpca 0.25    28.7% 1712.5%
stats::prcomp 0.89    100.0% 5966.6%
RmlxStats::mlxs_prcomp 0.015   1.7% 100.0%
n = 50000, p = 200
bigstatsr::big_randomSVD 0.43    8.8% 862.9%
irlba::prcomp_irlba 0.45    9.1% 897.1%
rsvd::rpca 1.4     27.9% 2746.0%
stats::prcomp 4.9     100.0% 9859.2%
RmlxStats::mlxs_prcomp 0.050   1.0% 100.0%
n = 10000, p = 400
bigstatsr::big_randomSVD 0.22    6.2% 688.8%
irlba::prcomp_irlba 0.19    5.5% 612.5%
rsvd::rpca 0.48    13.9% 1539.3%
stats::prcomp 3.5     100.0% 11045.3%
RmlxStats::mlxs_prcomp 0.031   0.9% 100.0%
n = 50000, p = 400
bigstatsr::big_randomSVD 0.86    5.2% 965.3%
irlba::prcomp_irlba 0.99    6.0% 1110.8%
rsvd::rpca 2.6     15.6% 2875.9%
stats::prcomp 16.      100.0% 18443.5%
RmlxStats::mlxs_prcomp 0.089   0.5% 100.0%
Base is the base R implementation in the 'stats' or 'boot' packages.

Bootstrap

Data table
Method Median seconds Rel. to base Rel. to best
n = 10000, p = 50
boot::boot 2.8     100.0% 2071.6%
lmboot::paired.boot 4.3     154.7% 3205.6%
lmboot::residual.boot 0.13    4.8% 100.0%
mlxs_case 0.90    32.1% 665.7%
mlxs_resid 0.16    5.7% 118.8%
n = 50000, p = 50
boot::boot 16.      100.0% 4875.8%
lmboot::paired.boot 21.      135.1% 6588.1%
lmboot::residual.boot 0.63    4.0% 196.5%
mlxs_case 3.1     20.0% 973.3%
mlxs_resid 0.32    2.1% 100.0%
n = 10000, p = 100
boot::boot 9.0     100.0% 5629.1%
lmboot::paired.boot 15.      170.8% 9614.5%
lmboot::residual.boot 0.34    3.8% 214.3%
mlxs_case 1.8     19.9% 1118.5%
mlxs_resid 0.16    1.8% 100.0%
n = 50000, p = 100
boot::boot 47.      100.0% 14243.1%
lmboot::paired.boot 77.      162.6% 23154.4%
lmboot::residual.boot 1.8     3.7% 529.9%
mlxs_case 7.6     16.1% 2286.9%
mlxs_resid 0.33    0.7% 100.0%
Base is the base R implementation in the 'stats' or 'boot' packages.

Results: Accuracy

mlxs_lm

Errors are calculated as the relative root mean squared error of the betas, i.e. the root mean squared error of the estimated betas against the true betas, divided by the root mean square of the true betas.
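For concreteness, a small illustrative example (not part of the benchmark) using the relative_rmse() helper defined above:

beta_true_demo <- c(1, -2, 0.5)
beta_hat_demo  <- c(1.1, -1.8, 0.4)
relative_rmse(beta_hat_demo, beta_true_demo)
# ~0.11: the estimation error is about 11% of the betas' typical magnitude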

Data table
Method Error Rel. to best
n = 10000, p = 50
stats::lm 0.290 100.0%
RmlxStats::mlxs_lm 0.290 100.0%
fixest::feols 0.290 100.0%
RcppEigen::fastLm 0.290 100.0%
speedglm::speedlm 0.290 100.0%
n = 50000, p = 50
stats::lm 0.137 100.0%
RmlxStats::mlxs_lm 0.137 100.0%
fixest::feols 0.137 100.0%
RcppEigen::fastLm 0.137 100.0%
speedglm::speedlm 0.137 100.0%
n = 10000, p = 100
stats::lm 0.343 100.0%
RmlxStats::mlxs_lm 0.343 100.0%
fixest::feols 0.343 100.0%
RcppEigen::fastLm 0.343 100.0%
speedglm::speedlm 0.343 100.0%
n = 50000, p = 100
stats::lm 0.141 100.0%
RmlxStats::mlxs_lm 0.141 100.0%
fixest::feols 0.141 100.0%
RcppEigen::fastLm 0.141 100.0%
speedglm::speedlm 0.141 100.0%
n = 10000, p = 200
stats::lm 0.286 100.0%
RmlxStats::mlxs_lm 0.286 100.0%
fixest::feols 0.286 100.0%
RcppEigen::fastLm 0.286 100.0%
speedglm::speedlm 0.286 100.0%
n = 50000, p = 200
stats::lm 0.116 100.0%
RmlxStats::mlxs_lm 0.116 100.0%
fixest::feols 0.116 100.0%
RcppEigen::fastLm 0.116 100.0%
speedglm::speedlm 0.116 100.0%

mlxs_glm

Errors are calculated as the relative root mean squared error of the estimated betas.

Data table
Method Error Rel. to best
n = 10000, p = 50
stats::glm 0.632 100.0%
RmlxStats::mlxs_glm 0.632 100.0%
speedglm::speedglm 0.632 100.0%
n = 50000, p = 50
stats::glm 0.630 100.0%
RmlxStats::mlxs_glm 0.630 100.0%
speedglm::speedglm 0.630 100.0%
n = 10000, p = 100
stats::glm 0.645 100.0%
RmlxStats::mlxs_glm 0.645 100.0%
speedglm::speedglm 0.645 100.0%
n = 50000, p = 100
stats::glm 0.625 100.0%
RmlxStats::mlxs_glm 0.625 100.0%
speedglm::speedglm 0.625 100.0%
n = 10000, p = 200
stats::glm 0.641 100.0%
RmlxStats::mlxs_glm 0.641 100.0%
speedglm::speedglm 0.641 100.0%
n = 50000, p = 200
stats::glm 0.617 100.0%
RmlxStats::mlxs_glm 0.617 100.0%
speedglm::speedglm 0.617 100.0%

mlxs_cv_glmnet

Errors are calculated as the relative root mean squared error of the estimated betas, for lambda = lambda.min.

Data table
Method Error Rel. to best
n = 10000, p = 50
glmnet::cv.glmnet 0.152  100.0%
RmlxStats::mlxs_cv_glmnet 0.153  100.4%
n = 50000, p = 50
glmnet::cv.glmnet 0.108  101.3%
RmlxStats::mlxs_cv_glmnet 0.106  100.0%
n = 250000, p = 50
glmnet::cv.glmnet 0.0375 105.2%
RmlxStats::mlxs_cv_glmnet 0.0357 100.0%
n = 10000, p = 100
glmnet::cv.glmnet 0.229  100.0%
RmlxStats::mlxs_cv_glmnet 0.230  100.6%
n = 50000, p = 100
glmnet::cv.glmnet 0.124  100.0%
RmlxStats::mlxs_cv_glmnet 0.125  100.9%
n = 250000, p = 100
glmnet::cv.glmnet 0.0618 100.0%
RmlxStats::mlxs_cv_glmnet 0.0634 102.6%
n = 10000, p = 200
glmnet::cv.glmnet 0.276  101.2%
RmlxStats::mlxs_cv_glmnet 0.272  100.0%
n = 50000, p = 200
glmnet::cv.glmnet 0.164  100.0%
RmlxStats::mlxs_cv_glmnet 0.168  102.4%
n = 250000, p = 200
glmnet::cv.glmnet 0.0832 101.1%
RmlxStats::mlxs_cv_glmnet 0.0823 100.0%
n = 10000, p = 400
glmnet::cv.glmnet 0.291  100.0%
RmlxStats::mlxs_cv_glmnet 0.296  101.6%
n = 50000, p = 400
glmnet::cv.glmnet 0.147  100.0%
RmlxStats::mlxs_cv_glmnet 0.147  100.0%
n = 250000, p = 400
glmnet::cv.glmnet 0.0713 100.0%
RmlxStats::mlxs_cv_glmnet 0.0731 102.6%
n = 10000, p = 800
glmnet::cv.glmnet 0.280  100.0%
RmlxStats::mlxs_cv_glmnet 0.284  101.8%
n = 50000, p = 800
glmnet::cv.glmnet 0.127  100.0%
RmlxStats::mlxs_cv_glmnet 0.127  100.0%
n = 250000, p = 800
glmnet::cv.glmnet 0.0614 100.0%
RmlxStats::mlxs_cv_glmnet 0.0622 101.3%

mlxs_prcomp

PCA errors are the projector error of the rotation matrix compared to the true data-generating rotation matrix, plus the relative root mean squared error of the standard deviations. Projector error is equivalent to measuring the principal angles between the estimated and true PCA subspaces.
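As a small illustrative check of that equivalence (not part of the benchmark), using the projector_error() helper defined above: for two rank-k orthonormal bases the projector error equals sqrt(2 * sum(sin(theta)^2) / k), where the principal angles theta are recovered from the singular values of the bases' cross-product.

set.seed(1)
k <- 5; p <- 50
U1 <- qr.Q(qr(matrix(rnorm(p * k), p, k)))  # "estimated" subspace basis
U2 <- qr.Q(qr(matrix(rnorm(p * k), p, k)))  # "true" subspace basis

# principal angles between the two subspaces
cosines <- pmin(svd(crossprod(U1, U2))$d, 1)
theta <- acos(cosines)

projector_error(U1, U2)          # as used in pca_accuracy_score()
sqrt(2 * sum(sin(theta)^2) / k)  # same value, up to floating point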

Data table
Method Error Rel. to best
n = 10000, p = 50
stats::prcomp 0.559 100.0%
irlba::prcomp_irlba 0.559 100.0%
rsvd::rpca 0.761 136.2%
bigstatsr::big_randomSVD 0.559 100.0%
RmlxStats::mlxs_prcomp 0.900 161.0%
n = 50000, p = 50
stats::prcomp 0.483 100.0%
irlba::prcomp_irlba 0.483 100.0%
rsvd::rpca 0.712 147.5%
bigstatsr::big_randomSVD 0.483 100.0%
RmlxStats::mlxs_prcomp 0.957 198.4%
n = 10000, p = 100
stats::prcomp 0.642 100.0%
irlba::prcomp_irlba 0.642 100.0%
rsvd::rpca 0.924 144.0%
bigstatsr::big_randomSVD 0.642 100.0%
RmlxStats::mlxs_prcomp 1.16  181.3%
n = 50000, p = 100
stats::prcomp 0.516 100.0%
irlba::prcomp_irlba 0.516 100.0%
rsvd::rpca 0.875 169.6%
bigstatsr::big_randomSVD 0.516 100.0%
RmlxStats::mlxs_prcomp 1.11  215.7%
n = 10000, p = 200
stats::prcomp 0.740 100.0%
irlba::prcomp_irlba 0.740 100.0%
rsvd::rpca 1.10  148.8%
bigstatsr::big_randomSVD 0.740 100.0%
RmlxStats::mlxs_prcomp 1.30  175.5%
n = 50000, p = 200
stats::prcomp 0.552 100.0%
irlba::prcomp_irlba 0.552 100.0%
rsvd::rpca 1.01  183.4%
bigstatsr::big_randomSVD 0.552 100.0%
RmlxStats::mlxs_prcomp 1.23  222.3%
n = 10000, p = 400
stats::prcomp 0.870 100.0%
irlba::prcomp_irlba 0.870 100.0%
rsvd::rpca 1.19  136.7%
bigstatsr::big_randomSVD 0.870 100.0%
RmlxStats::mlxs_prcomp 1.40  161.1%
n = 50000, p = 400
stats::prcomp 0.611 100.0%
irlba::prcomp_irlba 0.611 100.0%
rsvd::rpca 1.14  186.0%
bigstatsr::big_randomSVD 0.611 100.0%
RmlxStats::mlxs_prcomp 1.33  217.6%

Bootstrap

Errors are calculated as the relative root mean squared error of the bootstrap-estimated standard errors against oracle standard errors. We obtain the oracle standard errors by repeatedly drawing a new dependent variable from the (known) data-generating process, refitting the linear model, and taking the standard deviation of the coefficient estimates.

Data table
Method Error Rel. to best
n = 10000, p = 50
boot::boot 0.0937 123.0%
lmboot::paired.boot 0.0761 100.0%
lmboot::residual.boot 0.110  144.6%
mlxs_case 0.0945 124.1%
mlxs_resid 0.0997 130.9%
n = 50000, p = 50
boot::boot 0.0803 100.0%
lmboot::paired.boot 0.0912 113.6%
lmboot::residual.boot 0.114  141.9%
mlxs_case 0.0879 109.4%
mlxs_resid 0.120  149.7%
n = 10000, p = 100
boot::boot 0.101  115.7%
lmboot::paired.boot 0.0897 103.0%
lmboot::residual.boot 0.106  122.3%
mlxs_case 0.0870 100.0%
mlxs_resid 0.116  133.1%
n = 50000, p = 100
boot::boot 0.0940 116.1%
lmboot::paired.boot 0.0810 100.0%
lmboot::residual.boot 0.107  132.8%
mlxs_case 0.0854 105.5%
mlxs_resid 0.0940 116.1%