Setup code
We benchmark RmlxStats against base R and specialized fast-fitting packages across varying numbers of cases (n) and predictors (p), and we also check the accuracy of each method's estimates.
Benchmarks were run on an M2 MacBook Air.
| Field | Value |
|---|---|
| Generated on | 2026-05-03 |
| Commit | 773a23d |
| Branch | master |
| Rmlx version | 0.2.3 |
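The benchmarking chunks below call mark(), feols(), and the RmlxStats fitters without namespace prefixes, so they assume those packages are attached; everything else is called with explicit namespace prefixes and only needs to be installed. A minimal setup sketch (the actual setup chunk, folded as "Setup code" above, may differ):

```r
# Minimal setup assumed by the benchmarking code below (a sketch).
# RcppEigen, speedglm, glmnet, irlba, rsvd, bigstatsr, boot, and lmboot
# are called with explicit namespace prefixes and only need to be installed.
library(bench)      # mark()
library(fixest)     # feols()
library(RmlxStats)  # mlxs_lm(), mlxs_glm(), mlxs_cv_glmnet(), mlxs_prcomp()
```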
Benchmarking Code
Data Generation
Code
set.seed(20251111)
n_sizes <- c(10000, 50000, 250000)
p_sizes <- c(50, 100, 200, 400, 800)
n_max <- max(n_sizes)
p_max <- max(p_sizes)
X <- matrix(rnorm(n_max * p_max), nrow = n_max, ncol = p_max)
colnames(X) <- paste0("x", seq_len(p_max))
X_glmnet <- matrix(rnorm(n_max * p_max), nrow = n_max, ncol = p_max)
for (j in 2:p_max) {
X_glmnet[, j] <- 0.6 * X_glmnet[, j - 1] + sqrt(1 - 0.6^2) * X_glmnet[, j]
}
colnames(X_glmnet) <- paste0("x", seq_len(p_max))
beta_true <- rnorm(p_max, mean = 0, sd = 0.5)
y_continuous <- drop(X %*% beta_true + rnorm(n_max, sd = 5))
# only 1 in 10 predictors matter:
beta_glmnet_true <- rep(0, p_max)
beta_glmnet_true[seq(10, p_max, 10)] <- rnorm(length(seq(10, p_max, 10)), sd = 0.35)
y_sparse <- drop(X_glmnet %*% beta_glmnet_true + rnorm(n_max, sd = 5))
linpred <- drop(X %*% beta_true) / 10
prob <- 1 / (1 + exp(-linpred))
y_binary <- rbinom(n_max, size = 1, prob = prob)
full_data <- data.frame(
y_cont = y_continuous,
y_bin = y_binary,
y_sparse = y_sparse,
X
)
# for fast debugging
if (params$develop) {
n_sizes <- n_sizes/10
p_sizes <- p_sizes/5
}
bench_grid <- expand.grid(
n = n_sizes,
p = p_sizes,
stringsAsFactors = FALSE
)
bench_grid <- bench_grid[bench_grid$n > bench_grid$p, ]
all_results <- data.frame()
all_accuracy <- data.frame()
Helper functions
relative_rmse <- function(estimate, truth) {
scale <- sqrt(mean(truth^2))
sqrt(mean((estimate - truth)^2)) / max(scale, 1e-12)
}
safe_relative_rmse <- function(estimate, truth) {
estimate <- as.numeric(estimate)
truth <- as.numeric(truth)
if (anyNA(estimate) || anyNA(truth) ||
any(!is.finite(estimate)) || any(!is.finite(truth))) {
return(NA_real_)
}
relative_rmse(estimate, truth)
}
make_pca_fixture <- function(n, p, rank_k, noise_sd = 2) {
scores_raw <- qr.Q(qr(matrix(rnorm(n * rank_k), nrow = n, ncol = rank_k)))
scores_raw <- scale(scores_raw, center = TRUE, scale = FALSE)
scores_basis <- qr.Q(qr(scores_raw))
rotation <- qr.Q(qr(matrix(rnorm(p * rank_k), nrow = p, ncol = rank_k)))
sdev_true <- seq(from = 2.5, to = 1.5, length.out = rank_k)
singular_values <- sdev_true * sqrt(n - 1)
x_signal <- scores_basis %*% diag(singular_values, nrow = rank_k) %*%
t(rotation)
x <- x_signal + matrix(rnorm(n * p, sd = noise_sd), nrow = n, ncol = p)
colnames(x) <- paste0("pcx", seq_len(p))
list(
x = x,
rotation = rotation,
sdev = sdev_true,
rank = rank_k
)
}
extract_pca_rotation <- function(fit) {
if (inherits(fit, "big_SVD")) {
fit$v
} else {
fit$rotation
}
}
extract_pca_sdev <- function(fit) {
if (inherits(fit, "big_SVD")) {
fit$d / sqrt(nrow(fit$u) - 1)
} else {
fit$sdev
}
}
projector_error <- function(estimate, truth) {
estimate <- as.matrix(estimate)
truth <- as.matrix(truth)
proj_estimate <- estimate %*% t(estimate)
proj_truth <- truth %*% t(truth)
sqrt(sum((proj_estimate - proj_truth)^2)) / sqrt(sum(proj_truth^2))
}
pca_accuracy_score <- function(fit, truth_rotation, truth_sdev) {
projector_error(extract_pca_rotation(fit), truth_rotation) +
relative_rmse(
as.numeric(extract_pca_sdev(fit)[seq_along(truth_sdev)]),
truth_sdev
)
}
make_bootstrap_data <- function(n, p) {
x <- X[1:n, 1:p, drop = FALSE]
colnames(x) <- paste0("x", seq_len(p))
beta <- beta_true[seq_len(p)]
sigma <- 1 + 2 * abs(x[, 1])
y <- drop(x %*% beta + rnorm(n, sd = sigma))
data.frame(y_boot = y, x)
}
bootstrap_oracle_se <- function(x, beta, formula, reps) {
sigma <- 1 + 2 * abs(x[, 1])
estimates <- replicate(reps, {
y <- drop(x %*% beta + rnorm(nrow(x), sd = sigma))
coef(lm(formula, data = data.frame(y_boot = y, x)))
})
apply(t(estimates), 2, sd)
}
extract_bootstrap_se <- function(method, fit) {
if (identical(method, "stats::lm")) {
return(as.numeric(fit))
}
if (identical(method, "boot::boot")) {
return(apply(fit$t, 2, sd))
}
if (grepl("^lmboot::", method)) {
return(apply(fit$bootEstParam, 2, sd))
}
as.numeric(fit$std.error)
}
mlxs_lm
Code
lm_results <- list()
lm_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[3], ]
lm_accuracy <- list()
for (i in seq_len(nrow(lm_grid))) {
n <- lm_grid$n[i]
p <- lm_grid$p[i]
subset_data <- full_data[1:n, c("y_cont", paste0("x", 1:p))]
lm_formula <- reformulate(paste0("x", 1:p), response = "y_cont")
beta_target <- beta_true[seq_len(p)]
fitters <- list(
"stats::lm" = function() lm(lm_formula, data = subset_data),
"RmlxStats::mlxs_lm" = function() {
fit <- mlxs_lm(lm_formula, data = subset_data)
Rmlx::mlx_eval(fit$coefficients)
fit
},
"fixest::feols" = function() feols(lm_formula, data = subset_data),
"RcppEigen::fastLm" = function() RcppEigen::fastLm(
lm_formula,
data = subset_data
),
"speedglm::speedlm" = function() speedglm::speedlm(
lm_formula,
data = subset_data
)
)
bm <- mark(
"stats::lm" = fitters[["stats::lm"]](),
"RmlxStats::mlxs_lm" = fitters[["RmlxStats::mlxs_lm"]](),
"fixest::feols" = fitters[["fixest::feols"]](),
"RcppEigen::fastLm" = fitters[["RcppEigen::fastLm"]](),
"speedglm::speedlm" = fitters[["speedglm::speedlm"]](),
iterations = 3,
check = function (r1, r2) {
all.equal(as.vector(coef(r1)), as.vector(coef(r2)),
tolerance = 1e-6)
},
filter_gc = FALSE
)
bm$n <- n
bm$p <- p
bm$model_type <- "lm"
lm_results[[i]] <- bm
fits <- lapply(fitters, function(fit_method) fit_method())
scores <- lapply(names(fits), function(method) {
beta_hat <- as.numeric(coef(fits[[method]]))[-1]
data.frame(
model_type = "lm",
n = n,
p = p,
method = method,
accuracy = relative_rmse(beta_hat, beta_target),
stringsAsFactors = FALSE
)
})
lm_accuracy[[i]] <- do.call(rbind, scores)
}
lm_df <- do.call(rbind, lm_results)
all_results <- rbind(all_results, lm_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, lm_accuracy))
mlxs_glm
Code
glm_results <- list()
glm_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[3], ]
glm_accuracy <- list()
for (i in seq_len(nrow(glm_grid))) {
n <- glm_grid$n[i]
p <- glm_grid$p[i]
subset_data <- full_data[1:n, c("y_bin", paste0("x", 1:p))]
glm_formula <- reformulate(paste0("x", 1:p), response = "y_bin")
beta_target <- beta_true[seq_len(p)] / 5
fitters <- list(
"stats::glm" = function() glm(
glm_formula,
family = binomial(),
data = subset_data,
control = list(maxit = 50)
),
"RmlxStats::mlxs_glm" = function() {
fit <- mlxs_glm(
glm_formula,
family = mlxs_binomial(),
data = subset_data,
control = list(maxit = 50, epsilon = 1e-5)
)
Rmlx::mlx_eval(fit$coefficients)
fit
},
"speedglm::speedglm" = function() speedglm::speedglm(
glm_formula,
family = binomial(),
data = subset_data
)
)
bm <- mark(
"stats::glm" = fitters[["stats::glm"]](),
"RmlxStats::mlxs_glm" = fitters[["RmlxStats::mlxs_glm"]](),
"speedglm::speedglm" = fitters[["speedglm::speedglm"]](),
iterations = 3,
check = function (r1, r2) {
all.equal(as.vector(coef(r1)), as.vector(coef(r2)),
tolerance = 1e-4)
},
filter_gc = FALSE
)
bm$n <- n
bm$p <- p
bm$model_type <- "glm"
glm_results[[i]] <- bm
fits <- lapply(fitters, function(fit_method) fit_method())
scores <- lapply(names(fits), function(method) {
beta_hat <- as.numeric(coef(fits[[method]]))[-1]
data.frame(
model_type = "glm",
n = n,
p = p,
method = method,
accuracy = relative_rmse(beta_hat, beta_target),
stringsAsFactors = FALSE
)
})
glm_accuracy[[i]] <- do.call(rbind, scores)
}
glm_df <- do.call(rbind, glm_results)
all_results <- rbind(all_results, glm_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, glm_accuracy))
mlxs_cv_glmnet
Code
glmnet_results <- list()
glmnet_grid <- bench_grid
glmnet_accuracy <- list()
glmnet_nfolds <- 3L
for (i in seq_len(nrow(glmnet_grid))) {
n <- glmnet_grid$n[i]
p <- glmnet_grid$p[i]
xvars <- paste0("x", 1:p)
x <- X_glmnet[1:n, xvars, drop = FALSE]
y_sparse_subset <- y_sparse[1:n]
beta_target <- beta_glmnet_true[seq_len(p)]
stats_time <- system.time({
fit_stats <- glmnet::cv.glmnet(
x,
y_sparse_subset,
nfolds = glmnet_nfolds
)
as.numeric(fit_stats$lambda.min)
})[["elapsed"]]
mlx_time <- system.time({
fit_mlx <- mlxs_cv_glmnet(
x,
y_sparse_subset,
nfolds = glmnet_nfolds
)
Rmlx::mlx_eval(fit_mlx$glmnet.fit$x_center)
Rmlx::mlx_eval(fit_mlx$glmnet.fit$x_scale)
as.numeric(fit_mlx$lambda.min)
})[["elapsed"]]
bm <- data.frame(
expression = c("glmnet::cv.glmnet", "RmlxStats::mlxs_cv_glmnet"),
median = bench::as_bench_time(c(stats_time, mlx_time)),
stringsAsFactors = FALSE
)
bm$n <- n
bm$p <- p
bm$model_type <- "glmnet"
for (name in setdiff(names(all_results), names(bm))) {
bm[[name]] <- NA
}
bm <- bm[, names(all_results), drop = FALSE]
glmnet_results[[i]] <- bm
fits <- list(
"glmnet::cv.glmnet" = fit_stats,
"RmlxStats::mlxs_cv_glmnet" = fit_mlx
)
scores <- lapply(names(fits), function(method) {
beta_hat <- as.numeric(coef(fits[[method]], s = "lambda.min"))[-1]
data.frame(
model_type = "glmnet",
n = n,
p = p,
method = method,
accuracy = safe_relative_rmse(beta_hat, beta_target),
stringsAsFactors = FALSE
)
})
glmnet_accuracy[[i]] <- do.call(rbind, scores)
}
glmnet_df <- do.call(rbind, glmnet_results)
all_results <- rbind(all_results, glmnet_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, glmnet_accuracy))
mlxs_prcomp
Code
pca_results <- list()
pca_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[4], ]
pca_accuracy <- list()
for (i in seq_len(nrow(pca_grid))) {
n <- pca_grid$n[i]
p <- pca_grid$p[i]
rank_k <- min(8L, max(4L, floor(min(n, p) / 2)))
pca_fixture <- make_pca_fixture(n, p, rank_k)
x <- pca_fixture$x
x_big <- bigstatsr::FBM(nrow = n, ncol = p, init = as.matrix(x))
rotation_true <- pca_fixture$rotation
fitters <- list(
"stats::prcomp" = function() prcomp(
x,
center = TRUE,
scale. = FALSE,
rank. = rank_k
),
"irlba::prcomp_irlba" = function() irlba::prcomp_irlba(
x,
n = rank_k,
center = TRUE,
scale. = FALSE
),
"rsvd::rpca" = function() rsvd::rpca(
x,
k = rank_k,
center = TRUE,
scale = FALSE
),
"bigstatsr::big_randomSVD" = function() bigstatsr::big_randomSVD(
x_big,
fun.scaling = bigstatsr::big_scale(center = TRUE, scale = FALSE),
k = rank_k
),
"RmlxStats::mlxs_prcomp" = function() {
fit <- mlxs_prcomp(
x,
center = TRUE,
scale. = FALSE,
rank. = rank_k,
n_iter = 2,
seed = 1
)
Rmlx::mlx_eval(fit$rotation)
if (!is.null(fit$x)) {
Rmlx::mlx_eval(fit$x)
}
fit
}
)
bm <- mark(
"stats::prcomp" = fitters[["stats::prcomp"]](),
"irlba::prcomp_irlba" = fitters[["irlba::prcomp_irlba"]](),
"rsvd::rpca" = fitters[["rsvd::rpca"]](),
"bigstatsr::big_randomSVD" = fitters[["bigstatsr::big_randomSVD"]](),
"RmlxStats::mlxs_prcomp" = fitters[["RmlxStats::mlxs_prcomp"]](),
iterations = 3,
check = FALSE,
filter_gc = FALSE
)
bm$n <- n
bm$p <- p
bm$model_type <- "pca"
pca_results[[i]] <- bm
fits <- lapply(fitters, function(fit_method) fit_method())
scores <- lapply(names(fits), function(method) {
data.frame(
model_type = "pca",
n = n,
p = p,
method = method,
accuracy = pca_accuracy_score(
fits[[method]],
rotation_true,
pca_fixture$sdev
),
stringsAsFactors = FALSE
)
})
pca_accuracy[[i]] <- do.call(rbind, scores)
}
pca_df <- do.call(rbind, pca_results)
all_results <- rbind(all_results, pca_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, pca_accuracy))
Bootstrap
For the bootstrap benchmarks, we use only the smaller datasets because of the computational cost.
Code
boot_results <- list()
boot_grid <- bench_grid[bench_grid$n <= n_sizes[2] & bench_grid$p <= p_sizes[2], ]
boot_accuracy <- list()
bootstrap_reps <- if (params$develop) 50L else 100L
oracle_reps <- if (params$develop) 50L else 200L
for (i in seq_len(nrow(boot_grid))) {
n <- boot_grid$n[i]
p <- boot_grid$p[i]
subset_data <- make_bootstrap_data(n, p)
x <- as.matrix(subset_data[paste0("x", seq_len(p))])
beta_target <- beta_true[seq_len(p)]
formula_str <- paste("y_boot ~", paste(paste0("x", 1:p), collapse = " + "))
boot_formula <- as.formula(formula_str)
fit_mlxs <- mlxs_lm(boot_formula, data = subset_data)
oracle_se <- bootstrap_oracle_se(x, beta_target, boot_formula, oracle_reps)
oracle_se <- oracle_se[-1]
boot_stat <- function(dat, idx) {
coef(lm(boot_formula, data = dat[idx, , drop = FALSE]))
}
fitters <- list(
"boot::boot" = function() boot::boot(
subset_data,
statistic = boot_stat,
R = bootstrap_reps,
parallel = "no"
),
"lmboot::paired.boot" = function() lmboot::paired.boot(
boot_formula,
data = subset_data,
B = bootstrap_reps
),
"lmboot::residual.boot" = function() lmboot::residual.boot(
boot_formula,
data = subset_data,
B = bootstrap_reps
),
"mlxs_case" = function() {
summary(
fit_mlxs,
bootstrap = TRUE,
bootstrap_args = list(
B = bootstrap_reps,
seed = 42,
bootstrap_type = "case",
progress = FALSE
)
)
},
"mlxs_resid" = function() {
summary(
fit_mlxs,
bootstrap = TRUE,
bootstrap_args = list(
B = bootstrap_reps,
seed = 42,
bootstrap_type = "resid",
progress = FALSE
)
)
}
)
bm <- mark(
"boot::boot" = fitters[["boot::boot"]](),
"lmboot::paired.boot" = fitters[["lmboot::paired.boot"]](),
"lmboot::residual.boot" = fitters[["lmboot::residual.boot"]](),
"mlxs_case" = {
fit <- fitters[["mlxs_case"]]()
Rmlx::mlx_eval(fit$std.error)
fit
},
"mlxs_resid" = {
fit <- fitters[["mlxs_resid"]]()
Rmlx::mlx_eval(fit$std.error)
fit
},
iterations = 3,
check = FALSE,
filter_gc = FALSE,
memory = FALSE
)
bm$n <- n
bm$p <- p
bm$model_type <- "Bootstrap"
boot_results[[i]] <- bm
fits <- lapply(fitters, function(fit_method) fit_method())
scores <- lapply(names(fits), function(method) {
se_hat <- extract_bootstrap_se(method, fits[[method]])[-1]
data.frame(
model_type = "Bootstrap",
n = n,
p = p,
method = method,
accuracy = relative_rmse(se_hat, oracle_se),
stringsAsFactors = FALSE
)
})
boot_accuracy[[i]] <- do.call(rbind, scores)
}
boot_df <- do.call(rbind, boot_results)
all_results <- rbind(all_results, boot_df)
all_accuracy <- rbind(all_accuracy, do.call(rbind, boot_accuracy))
Results: Speed
We compare each function against the base R implementation and against the fastest alternative tested.
Display code
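The display code itself is folded above. As a rough sketch of the idea (assuming dplyr is available; the derived column name is ours), each median time is expressed relative to the fastest method within the same (model_type, n, p) cell, and analogously relative to the base R method:

```r
# Sketch only: derive "Rel. to best" from the collected timings.
# "Rel. to base" is analogous, dividing by the stats::/boot:: method's
# median time instead of the cell minimum.
library(dplyr)

speed_rel <- all_results |>
  group_by(model_type, n, p) |>
  mutate(rel_to_best = as.numeric(median) / min(as.numeric(median))) |>
  ungroup()
```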
mlxs_lm
Data table
| Method | Median seconds | Rel. to base | Rel. to best |
|---|---|---|---|
| n = 10000, p = 50 | |||
| fixest::feols | 0.011 | 44.8% | 108.0% |
| RcppEigen::fastLm | 0.013 | 53.6% | 129.2% |
| speedglm::speedlm | 0.021 | 86.1% | 207.7% |
| stats::lm | 0.024 | 100.0% | 241.3% |
| RmlxStats::mlxs_lm | 0.010 | 41.5% | 100.0% |
| n = 50000, p = 50 | |||
| fixest::feols | 0.034 | 29.7% | 100.0% |
| RcppEigen::fastLm | 0.065 | 57.3% | 193.1% |
| speedglm::speedlm | 0.093 | 81.6% | 275.0% |
| stats::lm | 0.11 | 100.0% | 337.1% |
| RmlxStats::mlxs_lm | 0.038 | 33.4% | 112.5% |
| n = 10000, p = 100 | |||
| fixest::feols | 0.027 | 33.0% | 132.1% |
| RcppEigen::fastLm | 0.035 | 42.8% | 171.5% |
| speedglm::speedlm | 0.069 | 84.9% | 339.7% |
| stats::lm | 0.081 | 100.0% | 400.2% |
| RmlxStats::mlxs_lm | 0.020 | 25.0% | 100.0% |
| n = 50000, p = 100 | |||
| fixest::feols | 0.10 | 26.4% | 130.6% |
| RcppEigen::fastLm | 0.18 | 46.6% | 230.1% |
| speedglm::speedlm | 0.43 | 109.5% | 540.9% |
| stats::lm | 0.39 | 100.0% | 494.2% |
| RmlxStats::mlxs_lm | 0.080 | 20.2% | 100.0% |
| n = 10000, p = 200 | |||
| fixest::feols | 0.086 | 28.8% | 217.2% |
| RcppEigen::fastLm | 0.12 | 39.5% | 298.2% |
| speedglm::speedlm | 0.26 | 85.6% | 646.2% |
| stats::lm | 0.30 | 100.0% | 754.6% |
| RmlxStats::mlxs_lm | 0.040 | 13.3% | 100.0% |
| n = 50000, p = 200 | |||
| fixest::feols | 0.38 | 23.3% | 192.9% |
| RcppEigen::fastLm | 0.65 | 39.8% | 329.2% |
| speedglm::speedlm | 1.3 | 79.2% | 655.5% |
| stats::lm | 1.6 | 100.0% | 827.5% |
| RmlxStats::mlxs_lm | 0.20 | 12.1% | 100.0% |
| Base is the base R implementation in the 'stats' or 'boot' packages. |

mlxs_glm
Data table
| Method | Median seconds | Rel. to base | Rel. to best |
|---|---|---|---|
| n = 10000, p = 50 | |||
| speedglm::speedglm | 0.079 | 74.4% | 223.2% |
| stats::glm | 0.11 | 100.0% | 300.1% |
| RmlxStats::mlxs_glm | 0.035 | 33.3% | 100.0% |
| n = 50000, p = 50 | |||
| speedglm::speedglm | 0.39 | 76.5% | 365.5% |
| stats::glm | 0.50 | 100.0% | 477.8% |
| RmlxStats::mlxs_glm | 0.11 | 20.9% | 100.0% |
| n = 10000, p = 100 | |||
| speedglm::speedglm | 0.28 | 84.3% | 440.3% |
| stats::glm | 0.33 | 100.0% | 522.5% |
| RmlxStats::mlxs_glm | 0.063 | 19.1% | 100.0% |
| n = 50000, p = 100 | |||
| speedglm::speedglm | 1.3 | 78.5% | 507.9% |
| stats::glm | 1.7 | 100.0% | 647.2% |
| RmlxStats::mlxs_glm | 0.26 | 15.5% | 100.0% |
| n = 10000, p = 200 | |||
| speedglm::speedglm | 0.98 | 80.3% | 861.3% |
| stats::glm | 1.2 | 100.0% | 1072.2% |
| RmlxStats::mlxs_glm | 0.11 | 9.3% | 100.0% |
| n = 50000, p = 200 | |||
| speedglm::speedglm | 5.0 | 83.6% | 1065.0% |
| stats::glm | 6.0 | 100.0% | 1274.6% |
| RmlxStats::mlxs_glm | 0.47 | 7.8% | 100.0% |
| Base is the base R implementation in the 'stats' or 'boot' packages. |

mlxs_cv_glmnet
Data table
| Method | Median seconds | Rel. to base | Rel. to best |
|---|---|---|---|
| n = 10000, p = 50 | |||
| glmnet::cv.glmnet | 0.13 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 2.0 | | 1494.7% |
| n = 50000, p = 50 | |||
| glmnet::cv.glmnet | 0.42 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 1.8 | | 420.8% |
| n = 250000, p = 50 | |||
| glmnet::cv.glmnet | 1.7 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 2.3 | | 134.1% |
| n = 10000, p = 100 | |||
| glmnet::cv.glmnet | 0.11 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 2.2 | | 2112.3% |
| n = 50000, p = 100 | |||
| glmnet::cv.glmnet | 0.46 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 1.9 | | 423.4% |
| n = 250000, p = 100 | |||
| glmnet::cv.glmnet | 2.8 | | 102.5% |
| RmlxStats::mlxs_cv_glmnet | 2.8 | | 100.0% |
| n = 10000, p = 200 | |||
| glmnet::cv.glmnet | 0.36 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 7.9 | | 2157.4% |
| n = 50000, p = 200 | |||
| glmnet::cv.glmnet | 1.1 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 2.4 | | 215.6% |
| n = 250000, p = 200 | |||
| glmnet::cv.glmnet | 7.1 | | 157.0% |
| RmlxStats::mlxs_cv_glmnet | 4.5 | | 100.0% |
| n = 10000, p = 400 | |||
| glmnet::cv.glmnet | 0.76 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 20 | | 2600.0% |
| n = 50000, p = 400 | |||
| glmnet::cv.glmnet | 3.9 | | 112.8% |
| RmlxStats::mlxs_cv_glmnet | 3.4 | | 100.0% |
| n = 250000, p = 400 | |||
| glmnet::cv.glmnet | 23 | | 263.7% |
| RmlxStats::mlxs_cv_glmnet | 8.6 | | 100.0% |
| n = 10000, p = 800 | |||
| glmnet::cv.glmnet | 2.3 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 48 | | 2044.5% |
| n = 50000, p = 800 | |||
| glmnet::cv.glmnet | 9.3 | | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 81 | | 872.1% |
| n = 250000, p = 800 | |||
| glmnet::cv.glmnet | 44 | | 191.8% |
| RmlxStats::mlxs_cv_glmnet | 23 | | 100.0% |
| Base is the base R implementation in the 'stats' or 'boot' packages; cv.glmnet has no base R equivalent, so only 'Rel. to best' is reported. |

mlxs_prcomp
Data table
| Method | Median seconds | Rel. to base | Rel. to best |
|---|---|---|---|
| n = 10000, p = 50 | |||
| bigstatsr::big_randomSVD | 0.033 | 60.0% | 324.7% |
| irlba::prcomp_irlba | 0.028 | 52.1% | 282.3% |
| rsvd::rpca | 0.083 | 152.0% | 823.2% |
| stats::prcomp | 0.054 | 100.0% | 541.5% |
| RmlxStats::mlxs_prcomp | 0.010 | 18.5% | 100.0% |
| n = 50000, p = 50 | |||
| bigstatsr::big_randomSVD | 0.12 | 42.7% | 407.4% |
| irlba::prcomp_irlba | 0.12 | 42.9% | 410.0% |
| rsvd::rpca | 0.42 | 147.0% | 1404.0% |
| stats::prcomp | 0.28 | 100.0% | 954.9% |
| RmlxStats::mlxs_prcomp | 0.030 | 10.5% | 100.0% |
| n = 10000, p = 100 | |||
| bigstatsr::big_randomSVD | 0.064 | 30.6% | 436.7% |
| irlba::prcomp_irlba | 0.057 | 27.2% | 387.9% |
| rsvd::rpca | 0.14 | 67.0% | 956.7% |
| stats::prcomp | 0.21 | 100.0% | 1428.5% |
| RmlxStats::mlxs_prcomp | 0.015 | 7.0% | 100.0% |
| n = 50000, p = 100 | |||
| bigstatsr::big_randomSVD | 0.22 | 20.6% | 480.7% |
| irlba::prcomp_irlba | 0.23 | 21.2% | 494.2% |
| rsvd::rpca | 0.70 | 65.4% | 1525.1% |
| stats::prcomp | 1.1 | 100.0% | 2331.1% |
| RmlxStats::mlxs_prcomp | 0.046 | 4.3% | 100.0% |
| n = 10000, p = 200 | |||
| bigstatsr::big_randomSVD | 0.10 | 11.5% | 684.8% |
| irlba::prcomp_irlba | 0.10 | 11.4% | 681.1% |
| rsvd::rpca | 0.25 | 28.7% | 1712.5% |
| stats::prcomp | 0.89 | 100.0% | 5966.6% |
| RmlxStats::mlxs_prcomp | 0.015 | 1.7% | 100.0% |
| n = 50000, p = 200 | |||
| bigstatsr::big_randomSVD | 0.43 | 8.8% | 862.9% |
| irlba::prcomp_irlba | 0.45 | 9.1% | 897.1% |
| rsvd::rpca | 1.4 | 27.9% | 2746.0% |
| stats::prcomp | 4.9 | 100.0% | 9859.2% |
| RmlxStats::mlxs_prcomp | 0.050 | 1.0% | 100.0% |
| n = 10000, p = 400 | |||
| bigstatsr::big_randomSVD | 0.22 | 6.2% | 688.8% |
| irlba::prcomp_irlba | 0.19 | 5.5% | 612.5% |
| rsvd::rpca | 0.48 | 13.9% | 1539.3% |
| stats::prcomp | 3.5 | 100.0% | 11045.3% |
| RmlxStats::mlxs_prcomp | 0.031 | 0.9% | 100.0% |
| n = 50000, p = 400 | |||
| bigstatsr::big_randomSVD | 0.86 | 5.2% | 965.3% |
| irlba::prcomp_irlba | 0.99 | 6.0% | 1110.8% |
| rsvd::rpca | 2.6 | 15.6% | 2875.9% |
| stats::prcomp | 16 | 100.0% | 18443.5% |
| RmlxStats::mlxs_prcomp | 0.089 | 0.5% | 100.0% |
| Base is the base R implementation in the 'stats' or 'boot' packages. |

Bootstrap
Data table
| Method | Median seconds | Rel. to base | Rel. to best |
|---|---|---|---|
| n = 10000, p = 50 | |||
| boot::boot | 2.8 | 100.0% | 2071.6% |
| lmboot::paired.boot | 4.3 | 154.7% | 3205.6% |
| lmboot::residual.boot | 0.13 | 4.8% | 100.0% |
| mlxs_case | 0.90 | 32.1% | 665.7% |
| mlxs_resid | 0.16 | 5.7% | 118.8% |
| n = 50000, p = 50 | |||
| boot::boot | 16 | 100.0% | 4875.8% |
| lmboot::paired.boot | 21 | 135.1% | 6588.1% |
| lmboot::residual.boot | 0.63 | 4.0% | 196.5% |
| mlxs_case | 3.1 | 20.0% | 973.3% |
| mlxs_resid | 0.32 | 2.1% | 100.0% |
| n = 10000, p = 100 | |||
| boot::boot | 9.0 | 100.0% | 5629.1% |
| lmboot::paired.boot | 15 | 170.8% | 9614.5% |
| lmboot::residual.boot | 0.34 | 3.8% | 214.3% |
| mlxs_case | 1.8 | 19.9% | 1118.5% |
| mlxs_resid | 0.16 | 1.8% | 100.0% |
| n = 50000, p = 100 | |||
| boot::boot | 47 | 100.0% | 14243.1% |
| lmboot::paired.boot | 77 | 162.6% | 23154.4% |
| lmboot::residual.boot | 1.8 | 3.7% | 529.9% |
| mlxs_case | 7.6 | 16.1% | 2286.9% |
| mlxs_resid | 0.33 | 0.7% | 100.0% |
| Base is the base R implementation in the 'stats' or 'boot' packages. |

Results: Accuracy
mlxs_lm
Errors are calculated as the relative root mean squared error of the coefficients: the root mean squared error of the estimated betas against the true betas, divided by the root mean square of the true betas.
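A toy illustration with the relative_rmse() helper defined above:

```r
# Worked example of the error metric: RMSE of the estimates, scaled by
# the root mean square of the true coefficients.
estimate <- c(0.9, 2.1, -1.2)
truth    <- c(1.0, 2.0, -1.0)
relative_rmse(estimate, truth)
#> sqrt(mean((estimate - truth)^2)) / sqrt(mean(truth^2)) = 0.1414 / 1.4142 = 0.1
```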
Data table
| Method | Error | Rel. to best |
|---|---|---|
| n = 10000, p = 50 | ||
| stats::lm | 0.290 | 100.0% |
| RmlxStats::mlxs_lm | 0.290 | 100.0% |
| fixest::feols | 0.290 | 100.0% |
| RcppEigen::fastLm | 0.290 | 100.0% |
| speedglm::speedlm | 0.290 | 100.0% |
| n = 50000, p = 50 | ||
| stats::lm | 0.137 | 100.0% |
| RmlxStats::mlxs_lm | 0.137 | 100.0% |
| fixest::feols | 0.137 | 100.0% |
| RcppEigen::fastLm | 0.137 | 100.0% |
| speedglm::speedlm | 0.137 | 100.0% |
| n = 10000, p = 100 | ||
| stats::lm | 0.343 | 100.0% |
| RmlxStats::mlxs_lm | 0.343 | 100.0% |
| fixest::feols | 0.343 | 100.0% |
| RcppEigen::fastLm | 0.343 | 100.0% |
| speedglm::speedlm | 0.343 | 100.0% |
| n = 50000, p = 100 | ||
| stats::lm | 0.141 | 100.0% |
| RmlxStats::mlxs_lm | 0.141 | 100.0% |
| fixest::feols | 0.141 | 100.0% |
| RcppEigen::fastLm | 0.141 | 100.0% |
| speedglm::speedlm | 0.141 | 100.0% |
| n = 10000, p = 200 | ||
| stats::lm | 0.286 | 100.0% |
| RmlxStats::mlxs_lm | 0.286 | 100.0% |
| fixest::feols | 0.286 | 100.0% |
| RcppEigen::fastLm | 0.286 | 100.0% |
| speedglm::speedlm | 0.286 | 100.0% |
| n = 50000, p = 200 | ||
| stats::lm | 0.116 | 100.0% |
| RmlxStats::mlxs_lm | 0.116 | 100.0% |
| fixest::feols | 0.116 | 100.0% |
| RcppEigen::fastLm | 0.116 | 100.0% |
| speedglm::speedlm | 0.116 | 100.0% |
Graph

mlxs_glm
Errors are calculated as the relative root mean squared error of the estimated betas.
Data table
| Method | Error | Rel. to best |
|---|---|---|
| n = 10000, p = 50 | ||
| stats::glm | 0.632 | 100.0% |
| RmlxStats::mlxs_glm | 0.632 | 100.0% |
| speedglm::speedglm | 0.632 | 100.0% |
| n = 50000, p = 50 | ||
| stats::glm | 0.630 | 100.0% |
| RmlxStats::mlxs_glm | 0.630 | 100.0% |
| speedglm::speedglm | 0.630 | 100.0% |
| n = 10000, p = 100 | ||
| stats::glm | 0.645 | 100.0% |
| RmlxStats::mlxs_glm | 0.645 | 100.0% |
| speedglm::speedglm | 0.645 | 100.0% |
| n = 50000, p = 100 | ||
| stats::glm | 0.625 | 100.0% |
| RmlxStats::mlxs_glm | 0.625 | 100.0% |
| speedglm::speedglm | 0.625 | 100.0% |
| n = 10000, p = 200 | ||
| stats::glm | 0.641 | 100.0% |
| RmlxStats::mlxs_glm | 0.641 | 100.0% |
| speedglm::speedglm | 0.641 | 100.0% |
| n = 50000, p = 200 | ||
| stats::glm | 0.617 | 100.0% |
| RmlxStats::mlxs_glm | 0.617 | 100.0% |
| speedglm::speedglm | 0.617 | 100.0% |
Graph

mlxs_cv_glmnet
Errors are calculated as the relative root mean squared error of the
estimated betas, for lambda = lambda.min.
Data table
| Method | Error | Rel. to best |
|---|---|---|
| n = 10000, p = 50 | ||
| glmnet::cv.glmnet | 0.152 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.153 | 100.4% |
| n = 50000, p = 50 | ||
| glmnet::cv.glmnet | 0.108 | 101.3% |
| RmlxStats::mlxs_cv_glmnet | 0.106 | 100.0% |
| n = 250000, p = 50 | ||
| glmnet::cv.glmnet | 0.0375 | 105.2% |
| RmlxStats::mlxs_cv_glmnet | 0.0357 | 100.0% |
| n = 10000, p = 100 | ||
| glmnet::cv.glmnet | 0.229 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.230 | 100.6% |
| n = 50000, p = 100 | ||
| glmnet::cv.glmnet | 0.124 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.125 | 100.9% |
| n = 250000, p = 100 | ||
| glmnet::cv.glmnet | 0.0618 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.0634 | 102.6% |
| n = 10000, p = 200 | ||
| glmnet::cv.glmnet | 0.276 | 101.2% |
| RmlxStats::mlxs_cv_glmnet | 0.272 | 100.0% |
| n = 50000, p = 200 | ||
| glmnet::cv.glmnet | 0.164 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.168 | 102.4% |
| n = 250000, p = 200 | ||
| glmnet::cv.glmnet | 0.0832 | 101.1% |
| RmlxStats::mlxs_cv_glmnet | 0.0823 | 100.0% |
| n = 10000, p = 400 | ||
| glmnet::cv.glmnet | 0.291 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.296 | 101.6% |
| n = 50000, p = 400 | ||
| glmnet::cv.glmnet | 0.147 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.147 | 100.0% |
| n = 250000, p = 400 | ||
| glmnet::cv.glmnet | 0.0713 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.0731 | 102.6% |
| n = 10000, p = 800 | ||
| glmnet::cv.glmnet | 0.280 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.284 | 101.8% |
| n = 50000, p = 800 | ||
| glmnet::cv.glmnet | 0.127 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.127 | 100.0% |
| n = 250000, p = 800 | ||
| glmnet::cv.glmnet | 0.0614 | 100.0% |
| RmlxStats::mlxs_cv_glmnet | 0.0622 | 101.3% |
Graph

mlxs_prcomp
PCA errors are the projector error of the estimated rotation matrix against the true data-generating rotation matrix, plus the relative root mean squared error of the component standard deviations. The projector error depends only on the principal angles between the estimated and true PCA subspaces, so it is insensitive to the sign and within-subspace rotation ambiguity of individual components.
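A small check with the projector_error() helper defined above shows why we compare projectors rather than the rotation matrices directly:

```r
# A principal component is only identified up to sign (and the leading-k
# subspace only up to rotation within it), so we compare the projectors
# R %*% t(R) rather than the rotation matrices themselves.
rot <- qr.Q(qr(matrix(rnorm(20), nrow = 5)))       # 5 x 4 orthonormal basis
projector_error(rot, rot)                          # exactly 0
projector_error(rot %*% diag(c(-1, 1, 1, 1)), rot) # still 0: sign flips drop out
```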
Data table
| Method | Error | Rel. to best |
|---|---|---|
| n = 10000, p = 50 | ||
| stats::prcomp | 0.559 | 100.0% |
| irlba::prcomp_irlba | 0.559 | 100.0% |
| rsvd::rpca | 0.761 | 136.2% |
| bigstatsr::big_randomSVD | 0.559 | 100.0% |
| RmlxStats::mlxs_prcomp | 0.900 | 161.0% |
| n = 50000, p = 50 | ||
| stats::prcomp | 0.483 | 100.0% |
| irlba::prcomp_irlba | 0.483 | 100.0% |
| rsvd::rpca | 0.712 | 147.5% |
| bigstatsr::big_randomSVD | 0.483 | 100.0% |
| RmlxStats::mlxs_prcomp | 0.957 | 198.4% |
| n = 10000, p = 100 | ||
| stats::prcomp | 0.642 | 100.0% |
| irlba::prcomp_irlba | 0.642 | 100.0% |
| rsvd::rpca | 0.924 | 144.0% |
| bigstatsr::big_randomSVD | 0.642 | 100.0% |
| RmlxStats::mlxs_prcomp | 1.16 | 181.3% |
| n = 50000, p = 100 | ||
| stats::prcomp | 0.516 | 100.0% |
| irlba::prcomp_irlba | 0.516 | 100.0% |
| rsvd::rpca | 0.875 | 169.6% |
| bigstatsr::big_randomSVD | 0.516 | 100.0% |
| RmlxStats::mlxs_prcomp | 1.11 | 215.7% |
| n = 10000, p = 200 | ||
| stats::prcomp | 0.740 | 100.0% |
| irlba::prcomp_irlba | 0.740 | 100.0% |
| rsvd::rpca | 1.10 | 148.8% |
| bigstatsr::big_randomSVD | 0.740 | 100.0% |
| RmlxStats::mlxs_prcomp | 1.30 | 175.5% |
| n = 50000, p = 200 | ||
| stats::prcomp | 0.552 | 100.0% |
| irlba::prcomp_irlba | 0.552 | 100.0% |
| rsvd::rpca | 1.01 | 183.4% |
| bigstatsr::big_randomSVD | 0.552 | 100.0% |
| RmlxStats::mlxs_prcomp | 1.23 | 222.3% |
| n = 10000, p = 400 | ||
| stats::prcomp | 0.870 | 100.0% |
| irlba::prcomp_irlba | 0.870 | 100.0% |
| rsvd::rpca | 1.19 | 136.7% |
| bigstatsr::big_randomSVD | 0.870 | 100.0% |
| RmlxStats::mlxs_prcomp | 1.40 | 161.1% |
| n = 50000, p = 400 | ||
| stats::prcomp | 0.611 | 100.0% |
| irlba::prcomp_irlba | 0.611 | 100.0% |
| rsvd::rpca | 1.14 | 186.0% |
| bigstatsr::big_randomSVD | 0.611 | 100.0% |
| RmlxStats::mlxs_prcomp | 1.33 | 217.6% |
Graph

Bootstrap
Errors are calculated as the relative root mean squared error of the bootstrap-estimated standard errors against an oracle standard error. The oracle is obtained by repeatedly drawing a new dependent variable from the (known) data-generating process, refitting the linear model, and taking the standard deviation of the coefficient estimates across replications.
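Condensed into code, using the helpers defined earlier (the benchmark loop does the same for each (n, p) cell):

```r
# Oracle SEs for one cell: redraw y from the known heteroskedastic DGP,
# refit lm(), and take the SD of each coefficient across replications.
dat <- make_bootstrap_data(n = 1000, p = 10)
x   <- as.matrix(dat[paste0("x", 1:10)])
f   <- reformulate(paste0("x", 1:10), response = "y_boot")
oracle_se <- bootstrap_oracle_se(x, beta_true[1:10], f, reps = 200)[-1]  # drop intercept
```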
Data table
| Method | Error | Rel. to best |
|---|---|---|
| n = 10000, p = 50 | ||
| boot::boot | 0.0937 | 123.0% |
| lmboot::paired.boot | 0.0761 | 100.0% |
| lmboot::residual.boot | 0.110 | 144.6% |
| mlxs_case | 0.0945 | 124.1% |
| mlxs_resid | 0.0997 | 130.9% |
| n = 50000, p = 50 | ||
| boot::boot | 0.0803 | 100.0% |
| lmboot::paired.boot | 0.0912 | 113.6% |
| lmboot::residual.boot | 0.114 | 141.9% |
| mlxs_case | 0.0879 | 109.4% |
| mlxs_resid | 0.120 | 149.7% |
| n = 10000, p = 100 | ||
| boot::boot | 0.101 | 115.7% |
| lmboot::paired.boot | 0.0897 | 103.0% |
| lmboot::residual.boot | 0.106 | 122.3% |
| mlxs_case | 0.0870 | 100.0% |
| mlxs_resid | 0.116 | 133.1% |
| n = 50000, p = 100 | ||
| boot::boot | 0.0940 | 116.1% |
| lmboot::paired.boot | 0.0810 | 100.0% |
| lmboot::residual.boot | 0.107 | 132.8% |
| mlxs_case | 0.0854 | 105.5% |
| mlxs_resid | 0.0940 | 116.1% |
Graph
