modeldb icon indicating copy to clipboard operation
modeldb copied to clipboard

possible bugs in multiple linear regressions

Open Saarialho opened this issue 1 year ago • 0 comments

The problem

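I am having trouble with grouped multiple linear regressions using `modeldb::linear_regression_db()`. The examples below show three problems, each compared against the equivalent `lm()` fit. I believe the last error is caused by pattern matching with `grepl()` when a predictor (x variable) name contains the response (y variable) name, e.g. `lag_mpg` contains `mpg`.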

Reproducible example

library(dplyr)
library(purrr)

mtcars %>% 
  arrow::to_duckdb() %>% 
  group_by(am) %>% 
  modeldb::linear_regression_db(mpg, auto_count = TRUE)
#> # A tibble: 2 × 11
#>      am `(Intercept)`    cyl    disp      hp   drat     wt   qsec    vs  gear
#>   <dbl>         <dbl>  <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl>
#> 1     0          8.64 -0.534 -0.0203  0.0622  0.592   1.95 -0.884 0.739  8.65
#> 2     1       -138.   -1.28   0.180  -0.160  -4.95  -10.5   8.09  0.943 12.3 
#> # ℹ 1 more variable: carb <dbl>

mtcars %>% 
  group_by(am) %>% 
  dplyr::reframe(
    reg = list(lm(mpg ~ ., data = dplyr::pick(dplyr::everything()))),
  ) %>% 
  .$reg %>% 
  map(coefficients) %>% 
  bind_rows()
#> # A tibble: 2 × 10
#>   `(Intercept)`    cyl    disp      hp   drat     wt   qsec    vs  gear  carb
#>           <dbl>  <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
#> 1          8.64 -0.534 -0.0203  0.0622  0.592   1.95 -0.884 0.739  8.65 -4.81
#> 2       -138.   -1.28   0.180  -0.160  -4.95  -10.5   8.09  0.943 12.3   4.69

mtcars %>% 
  group_by(cyl) %>% 
  modeldb::linear_regression_db(mpg, auto_count = TRUE)
#> Error in `map()`:
#> ℹ In index: 2.
#> Caused by error in `solve.default()`:
#> ! system is computationally singular: reciprocal condition number = 9.04685e-21
#> Backtrace:
#>      ▆
#>   1. ├─mtcars %>% group_by(cyl) %>% ...
#>   2. ├─modeldb::linear_regression_db(., mpg, auto_count = TRUE)
#>   3. │ └─modeldb:::mlr(...)
#>   4. │   └─purrr::map(seq_len(vars_count + 1), ~as.numeric(solve(xm[[.x]], ym[[.x]])))
#>   5. │     └─purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>   6. │       ├─purrr:::with_indexed_errors(...)
#>   7. │       │ └─base::withCallingHandlers(...)
#>   8. │       ├─purrr:::call_with_cleanup(...)
#>   9. │       └─modeldb (local) .f(.x[[i]], ...)
#>  10. │         ├─base::solve(xm[[.x]], ym[[.x]])
#>  11. │         └─base::solve.default(xm[[.x]], ym[[.x]])
#>  12. └─base::.handleSimpleError(...)
#>  13.   └─purrr (local) h(simpleError(msg, call))
#>  14.     └─cli::cli_abort(...)
#>  15.       └─rlang::abort(...)

mtcars %>% 
  group_by(cyl) %>% 
  dplyr::reframe(
    reg = list(lm(mpg ~ ., data = dplyr::pick(dplyr::everything()))),
  ) %>% 
  .$reg %>% 
  map(coefficients) %>% 
  bind_rows()
#> # A tibble: 3 × 10
#>   `(Intercept)`    disp      hp  drat     wt   qsec    vs    am  gear  carb
#>           <dbl>   <dbl>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1         60.9  -0.345  -0.0332 -4.19  4.48  -0.106 -3.64 -6.33  4.07  3.22
#> 2         32.8   0.0746 -0.0425  1.52  5.12  -2.33  -1.75 NA    NA    NA   
#> 3          6.25 -0.0234  0.152  -5.74 -0.726  1.36  NA     4.87 NA    -4.77

mtcars %>% 
  arrow::to_duckdb() %>% 
  dbplyr::window_order(mpg) %>% 
  mutate(lag_mpg = lag(mpg)) %>% 
  filter(!is.na(lag_mpg)) %>% 
  group_by(am) %>% 
  modeldb::linear_regression_db(mpg, auto_count = TRUE)
#> Warning in matrix(as.numeric(.x), nrow = matrix_size): data length [81] is not
#> a sub-multiple or multiple of the number of rows [10]
#> Warning in matrix(as.numeric(.x), nrow = matrix_size): data length [81] is not
#> a sub-multiple or multiple of the number of rows [10]
#> Error in `map()`:
#> ℹ In index: 1.
#> Caused by error in `solve.default()`:
#> ! 'a' (10 x 9) must be square
#> Backtrace:
#>      ▆
#>   1. ├─... %>% ...
#>   2. ├─modeldb::linear_regression_db(., mpg, auto_count = TRUE)
#>   3. │ └─modeldb:::mlr(...)
#>   4. │   └─purrr::map(seq_len(vars_count + 1), ~as.numeric(solve(xm[[.x]], ym[[.x]])))
#>   5. │     └─purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>   6. │       ├─purrr:::with_indexed_errors(...)
#>   7. │       │ └─base::withCallingHandlers(...)
#>   8. │       ├─purrr:::call_with_cleanup(...)
#>   9. │       └─modeldb (local) .f(.x[[i]], ...)
#>  10. │         ├─base::solve(xm[[.x]], ym[[.x]])
#>  11. │         └─base::solve.default(xm[[.x]], ym[[.x]])
#>  12. └─base::.handleSimpleError(...)
#>  13.   └─purrr (local) h(simpleError(msg, call))
#>  14.     └─cli::cli_abort(...)
#>  15.       └─rlang::abort(...)

mtcars %>% 
  mutate(lag_mpg = lag(mpg)) %>% 
  filter(!is.na(lag_mpg)) %>% 
  arrange(mpg) %>% 
  group_by(am) %>% 
  dplyr::reframe(
    reg = list(lm(mpg ~ ., data = dplyr::pick(dplyr::everything()))),
  ) %>% 
  .$reg %>% 
  map(coefficients) %>% 
  bind_rows()
#> # A tibble: 2 × 11
#>   `(Intercept)`    cyl    disp      hp   drat     wt   qsec     vs  gear  carb
#>           <dbl>  <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl>
#> 1          7.83 -0.533 -0.0209  0.0644  0.509   2.15 -0.932  0.739  8.98 -4.93
#> 2       -178.   -4.36   0.328  -0.226  -5.72  -18.9  11.2   -0.715 15.0   7.49
#> # ℹ 1 more variable: lag_mpg <dbl>

Created on 2025-01-05 with reprex v2.0.2

Saarialho avatar Jan 05 '25 09:01 Saarialho