corrr
corrr copied to clipboard
ERROR: Small integer data overflow (multiplication)
Connecting to an AWS Redshift database via an odbc connection. I'm using corrr version 0.4.0. Using the correlate() function when variables are doubles seem to work. Adding an integer when connected to Redshift throws an error. Included Reprex below to demonstrate a workaround with mutate_all(as.numeric) and to illustrate that issue doesn't happen with a local data frame.
rs_table in this example has ~7.3 million observations.
# set up packages
library(odbc)
library(dplyr)
#> Warning: package 'dplyr' was built under R version 3.5.3
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(dbplyr)
#> Warning: package 'dbplyr' was built under R version 3.5.3
#>
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#>
#> ident, sql
library(corrr)
#> Warning: package 'corrr' was built under R version 3.5.3
# connect to db
redshift <- dbConnect(odbc::odbc(), "my_dsn", bigint = "integer")
# create pointer to db table
rs_table <- tbl(redshift, in_schema("my_schema", "my_table"))
# preview table
rs_table
#> # Source: table<my_schema.my_table> [?? x 5]
#> # Database: Redshift 8.0.2[...]
#> x1 x2 x3 x4 x5
#> <dbl> <dbl> <dbl> <int> <int>
#> 1 84.8 103. 18.0 0 0
#> 2 215. 249. 33.9 0 0
#> 3 196. 263. 67.0 0 0
#> 4 9.79 15.7 5.95 0 0
#> 5 80.0 97.9 18.0 1 0
#> 6 210. 240. 30.0 0 0
#> 7 260. 311. 51.2 0 0
#> 8 100.0 129. 29.0 0 0
#> 9 158. 195. 37.0 0 91
#> 10 66.0 82.9 17.0 0 0
#> # ... with more rows
# correlate() works when vars are all dbl
rs_table %>%
select(x1:x3) %>%
correlate(quiet = TRUE)
#> # A tibble: 3 x 4
#> rowname x1 x2 x3
#> <chr> <dbl> <dbl> <dbl>
#> 1 x1 NA 0.995 0.867
#> 2 x2 0.995 NA 0.894
#> 3 x3 0.867 0.894 NA
# correlate() breaks when int vars are included
rs_table %>%
select(x1:x4) %>%
correlate(quiet = TRUE)
#> Error: <SQL> 'SELECT SUM("x1") AS "x1_sum", SUM("x2") AS "x2_sum", SUM("x3") AS "x3_sum", SUM("x4") AS "x4_sum", SUM("x1" * "x1") AS "x1_two", SUM("x2" * "x2") AS "x2_two", SUM("x3" * "x3") AS "x3_two", SUM("x4" * "x4") AS "x4_two", SUM("x1" * "x2") AS "x1_x2", SUM("x1" * "x3") AS "x1_x3", SUM("x1" * "x4") AS "x1_x4", SUM("x2" * "x3") AS "x2_x3", SUM("x2" * "x4") AS "x2_x4", SUM("x3" * "x4") AS "x3_x4", COUNT(*) AS "obs"
#> FROM (SELECT "x1", "x2", "x3", "x4"
#> FROM my_schema.my_table) "dbplyr_002"'
#> nanodbc/nanodbc.cpp:1587: XX000: [Amazon][Amazon Redshift] (30) Error occurred while trying to execute a query: [SQLState XX000] ERROR: Small integer data overflow (multiplication)
#> DETAIL:
#> -----------------------------------------------
#> error: Small integer data overflow (multiplication)
#> code: 1058
#> context:
#> query: ...
#> location: ...
#> process: ...
#> -----------------------------------------------
#>
#>
# mutate_all(as.numeric) as workaround
rs_table %>%
select(x1:x5) %>%
mutate_all(as.numeric) %>%
correlate(quiet = TRUE)
#> # A tibble: 5 x 6
#> rowname x1 x2 x3 x4 x5
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 x1 NA 0.995 0.868 0.0707 0.00203
#> 2 x2 0.995 NA 0.895 0.0607 0.00269
#> 3 x3 0.868 0.895 NA 0.144 0.00559
#> 4 x4 0.0707 0.0607 0.144 NA -0.0160
#> 5 x5 0.00203 0.00269 0.00559 -0.0160 NA
# problem doesn't occur in local data frame
rs_table %>%
select(x1:x5) %>%
head(1000) %>%
collect() %>%
correlate(quiet = TRUE)
#> # A tibble: 5 x 6
#> rowname x1 x2 x3 x4 x5
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 x1 NA 0.995 0.868 0.0889 -0.0615
#> 2 x2 0.995 NA 0.907 0.0939 -0.0575
#> 3 x3 0.868 0.907 NA 0.109 -0.0340
#> 4 x4 0.0889 0.0939 0.109 NA -0.0326
#> 5 x5 -0.0615 -0.0575 -0.0340 -0.0326 NA
Created on 2019-08-27 by the reprex package (v0.3.0)
Session info
devtools::session_info()
#> - Session info ----------------------------------------------------------
#> setting value
#> version R version 3.5.2 (2018-12-20)
#> os Windows 10 x64
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.1252
#> ctype English_United States.1252
#> tz America/Chicago
#> date 2019-08-27
#>
#> - Packages --------------------------------------------------------------
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.5.3)
#> backports 1.1.3 2018-12-14 [1] CRAN (R 3.5.2)
#> bit 1.1-14 2018-05-29 [1] CRAN (R 3.5.2)
#> bit64 0.9-7 2017-05-08 [1] CRAN (R 3.5.2)
#> blob 1.1.1 2018-03-25 [1] CRAN (R 3.5.2)
#> callr 3.1.1 2018-12-21 [1] CRAN (R 3.5.2)
#> cli 1.1.0 2019-03-19 [1] CRAN (R 3.5.3)
#> colorspace 1.4-0 2019-01-13 [1] CRAN (R 3.5.2)
#> corrr * 0.4.0 2019-07-12 [1] CRAN (R 3.5.3)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 3.5.2)
#> DBI 1.0.0 2018-05-02 [1] CRAN (R 3.5.2)
#> dbplyr * 1.4.2 2019-06-17 [1] CRAN (R 3.5.3)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 3.5.2)
#> devtools 2.0.1 2018-10-26 [1] CRAN (R 3.5.2)
#> digest 0.6.20 2019-07-04 [1] CRAN (R 3.5.3)
#> dplyr * 0.8.3 2019-07-04 [1] CRAN (R 3.5.3)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 3.5.3)
#> fansi 0.4.0 2018-10-05 [1] CRAN (R 3.5.2)
#> fs 1.2.6 2018-08-23 [1] CRAN (R 3.5.2)
#> ggplot2 3.2.0 2019-06-16 [1] CRAN (R 3.5.3)
#> glue 1.3.1 2019-03-12 [1] CRAN (R 3.5.3)
#> gtable 0.2.0 2016-02-26 [1] CRAN (R 3.5.2)
#> highr 0.7 2018-06-09 [1] CRAN (R 3.5.2)
#> hms 0.4.2 2018-03-10 [1] CRAN (R 3.5.2)
#> htmltools 0.3.6 2017-04-28 [1] CRAN (R 3.5.2)
#> knitr 1.24 2019-08-08 [1] CRAN (R 3.5.3)
#> lazyeval 0.2.1 2017-10-29 [1] CRAN (R 3.5.2)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 3.5.2)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 3.5.2)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 3.5.2)
#> odbc * 1.1.6 2018-06-09 [1] CRAN (R 3.5.2)
#> pillar 1.3.1 2018-12-15 [1] CRAN (R 3.5.2)
#> pkgbuild 1.0.2 2018-10-16 [1] CRAN (R 3.5.2)
#> pkgconfig 2.0.2 2018-08-16 [1] CRAN (R 3.5.2)
#> pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.5.2)
#> prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.5.2)
#> processx 3.3.0 2019-03-10 [1] CRAN (R 3.5.3)
#> ps 1.3.0 2018-12-21 [1] CRAN (R 3.5.2)
#> purrr 0.3.2 2019-03-15 [1] CRAN (R 3.5.3)
#> R6 2.4.0 2019-02-14 [1] CRAN (R 3.5.2)
#> Rcpp 1.0.1 2019-03-17 [1] CRAN (R 3.5.3)
#> remotes 2.0.2 2018-10-30 [1] CRAN (R 3.5.2)
#> rlang 0.4.0 2019-06-25 [1] CRAN (R 3.5.3)
#> rmarkdown 1.14 2019-07-12 [1] CRAN (R 3.5.3)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.5.2)
#> scales 1.0.0 2018-08-09 [1] CRAN (R 3.5.2)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.5.2)
#> stringi 1.4.3 2019-03-12 [1] CRAN (R 3.5.3)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 3.5.3)
#> testthat 2.0.1 2018-10-13 [1] CRAN (R 3.5.2)
#> tibble 2.1.3 2019-06-06 [1] CRAN (R 3.5.3)
#> tidyselect 0.2.5 2018-10-11 [1] CRAN (R 3.5.2)
#> usethis 1.4.0 2018-08-14 [1] CRAN (R 3.5.2)
#> utf8 1.1.4 2018-05-24 [1] CRAN (R 3.5.2)
#> withr 2.1.2 2018-03-15 [1] CRAN (R 3.5.2)
#> xfun 0.8 2019-06-25 [1] CRAN (R 3.5.3)
#> yaml 2.2.0 2018-07-25 [1] CRAN (R 3.5.2)
#>
#> [1] ...
Related to #81
@edgararuiz Do you see any issues with an embedded mutate_if(is.integer, as.numerc) or similar?