Coercion from SCE to SummarizedExperiment drops rowData
When a SingleCellExperiment is converted directly into a SummarizedExperiment, the resulting SE does not include the rowData that was in the input:
library(SingleCellExperiment)

# Dimensions of the toy dataset
n_cells <- 100
n_genes <- 50

# Build a small SingleCellExperiment that carries both colData and rowData
sce <- SingleCellExperiment(
  assays = list(
    counts = matrix(
      rpois(n_cells * n_genes, lambda = 10),
      nrow = n_genes,
      ncol = n_cells
    )
  ),
  colData = DataFrame(
    cell_id = paste0("cell", seq_len(n_cells)),
    condition = sample(c("control", "treatment"), n_cells, replace = TRUE)
  ),
  rowData = DataFrame(
    gene_id = paste0("gene", seq_len(n_genes)),
    gene_name = paste0("Gene_", seq_len(n_genes))
  )
)

# Coerce straight to SummarizedExperiment
se <- as(sce, "SummarizedExperiment")

# Compare the rowData before and after the coercion
head(rowData(sce))
head(rowData(se))
Session info
R Under development (unstable) (2024-01-12 r85803) Platform: x86_64-pc-linux-gnu Running under: Linux Mint 21
Matrix products: default BLAS: /opt/R/devel/lib/R/lib/libRblas.so LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=fi_FI.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=fi_FI.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=fi_FI.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Helsinki tzcode source: system (glibc)
attached base packages: [1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] TreeSummarizedExperiment_2.13.0 Biostrings_2.73.1 XVector_0.45.0 SingleCellExperiment_1.27.2
[5] SummarizedExperiment_1.35.1 Biobase_2.64.0 GenomicRanges_1.57.1 GenomeInfoDb_1.41.1
[9] IRanges_2.39.1 S4Vectors_0.43.1 BiocGenerics_0.51.0 MatrixGenerics_1.17.0
[13] matrixStats_1.3.0
loaded via a namespace (and not attached):
[1] DBI_1.2.3 bitops_1.0-7 remotes_2.5.0 biomaRt_2.59.0 rlang_1.1.4
[6] magrittr_2.0.3 compiler_4.4.0 RSQLite_2.3.7 GenomicFeatures_1.55.1 png_0.1-8
[11] vctrs_0.6.5 stringr_1.5.1 profvis_0.3.8 pkgconfig_2.0.3 crayon_1.5.3
[16] fastmap_1.2.0 dbplyr_2.5.0 ellipsis_0.3.2 utf8_1.2.4 Rsamtools_2.19.2
[21] promises_1.3.0 sessioninfo_1.2.2 UCSC.utils_1.1.0 purrr_1.0.2 bit_4.0.5
[26] zlibbioc_1.51.1 cachem_1.1.0 jsonlite_1.8.8 progress_1.2.3 blob_1.2.4
[31] later_1.3.2 DelayedArray_0.31.8 BiocParallel_1.39.0 parallel_4.4.0 prettyunits_1.2.0
[36] R6_2.5.1 stringi_1.8.4 rtracklayer_1.63.0 pkgload_1.3.3 Rcpp_1.0.13
[41] usethis_2.2.2 httpuv_1.6.15 Matrix_1.6-5 tidyselect_1.2.1 yaml_2.3.9
[46] rstudioapi_0.16.0 abind_1.4-5 codetools_0.2-19 miniUI_0.1.1.1 curl_5.2.1
[51] pkgbuild_1.4.3 lattice_0.22-6 tibble_3.2.1 shiny_1.8.0 treeio_1.29.0
[56] withr_3.0.0 KEGGREST_1.45.1 desc_1.4.3 urlchecker_1.0.1 BiocFileCache_2.11.1
[61] xml2_1.3.6 pillar_1.9.0 filelock_1.0.3 generics_0.1.3 rprojroot_2.0.4
[66] RCurl_1.98-1.14 hms_1.1.3 tidytree_0.4.6 xtable_1.8-4 glue_1.7.0
[71] lazyeval_0.2.2 tools_4.4.0 BiocIO_1.14.0 GenomicAlignments_1.39.1 annotate_1.81.1
[76] fs_1.6.4 XML_3.99-0.16.1 grid_4.4.0 tidyr_1.3.1 ape_5.8
[81] devtools_2.4.5 AnnotationDbi_1.67.0 nlme_3.1-165 GenomeInfoDbData_1.2.12 restfulr_0.0.15
[86] cli_3.6.3 rappdirs_0.3.3 fansi_1.0.6 S4Arrays_1.5.4 dplyr_1.1.4
[91] yulab.utils_0.1.4 digest_0.6.36 SparseArray_1.5.21 rjson_0.2.21 htmlwidgets_1.6.4
[96] memoise_2.0.1 htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7 mime_0.12
[101] bit64_4.0.5
That is a bit annoying. Until @hpages can chime in, I'll just note that stepping through the coercion as SingleCellExperiment -> RangedSummarizedExperiment -> SummarizedExperiment does seem to work:
suppressPackageStartupMessages(library(SingleCellExperiment))

# Dimensions of the toy dataset
n_cells <- 100
n_genes <- 50

# Build a small SingleCellExperiment that carries both colData and rowData
sce <- SingleCellExperiment(
  assays = list(
    counts = matrix(
      rpois(n_cells * n_genes, lambda = 10),
      nrow = n_genes,
      ncol = n_cells
    )
  ),
  colData = DataFrame(
    cell_id = paste0("cell", seq_len(n_cells)),
    condition = sample(c("control", "treatment"), n_cells, replace = TRUE)
  ),
  rowData = DataFrame(
    gene_id = paste0("gene", seq_len(n_genes)),
    gene_name = paste0("Gene_", seq_len(n_genes))
  )
)

# Direct coercion: rowData is not propagated
sce |>
  as("SummarizedExperiment") |>
  rowData()
#> DataFrame with 50 rows and 0 columns

# Two-step coercion via RangedSummarizedExperiment: rowData is propagated
sce |>
  as("RangedSummarizedExperiment") |>
  as("SummarizedExperiment") |>
  rowData()
#> DataFrame with 50 rows and 2 columns
#> gene_id gene_name
#> <character> <character>
#> 1 gene1 Gene_1
#> 2 gene2 Gene_2
#> 3 gene3 Gene_3
#> 4 gene4 Gene_4
#> 5 gene5 Gene_5
#> ... ... ...
#> 46 gene46 Gene_46
#> 47 gene47 Gene_47
#> 48 gene48 Gene_48
#> 49 gene49 Gene_49
#> 50 gene50 Gene_50
I think it may be due to how SingleCellExperiment is defined: it extends RangedSummarizedExperiment directly, which puts it 2 steps away from SummarizedExperiment:
# Print the slots and the inheritance chain of the SingleCellExperiment class
showClass("SingleCellExperiment")
#> Class "SingleCellExperiment" [package "SingleCellExperiment"]
#>
#> Slots:
#>
#> Name: int_elementMetadata int_colData
#> Class: DataFrame DataFrame
#>
#> Name: int_metadata rowRanges
#> Class: list GenomicRanges_OR_GRangesList
#>
#> Name: colData assays
#> Class: DataFrame Assays_OR_NULL
#>
#> Name: NAMES elementMetadata
#> Class: character_OR_NULL DataFrame
#>
#> Name: metadata
#> Class: list
#>
#> Extends:
#> Class "RangedSummarizedExperiment", directly
#> Class "SummarizedExperiment", by class "RangedSummarizedExperiment", distance 2
#> Class "RectangularData", by class "RangedSummarizedExperiment", distance 3
#> Class "Vector", by class "RangedSummarizedExperiment", distance 3
#> Class "Annotated", by class "RangedSummarizedExperiment", distance 4
#> Class "vector_OR_Vector", by class "RangedSummarizedExperiment", distance 4
Session info
# Report the R session settings and package versions used for this reprex
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.4.1 (2024-06-14)
#> os macOS Sonoma 14.5
#> system aarch64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Australia/Melbourne
#> date 2024-07-31
#> pandoc 3.2 @ /usr/local/bin/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> abind 1.4-5 2016-07-21 [1] CRAN (R 4.4.0)
#> Biobase * 2.65.0 2024-05-04 [1] Bioconductor 3.20 (R 4.4.0)
#> BiocGenerics * 0.51.0 2024-05-04 [1] Bioconductor 3.20 (R 4.4.0)
#> cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.0)
#> crayon 1.5.3 2024-06-20 [1] CRAN (R 4.4.0)
#> DelayedArray 0.31.10 2024-07-28 [1] Bioconductor 3.20 (R 4.4.1)
#> digest 0.6.36 2024-06-23 [1] CRAN (R 4.4.0)
#> evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.4.0)
#> fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
#> fs 1.6.4 2024-04-25 [1] CRAN (R 4.4.0)
#> GenomeInfoDb * 1.41.1 2024-05-24 [1] Bioconductor 3.20 (R 4.4.0)
#> GenomeInfoDbData 1.2.12 2024-03-28 [1] Bioconductor
#> GenomicRanges * 1.57.1 2024-06-12 [1] Bioconductor 3.20 (R 4.4.1)
#> glue 1.7.0 2024-01-09 [1] CRAN (R 4.4.0)
#> htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
#> httr 1.4.7 2023-08-15 [1] CRAN (R 4.4.0)
#> IRanges * 2.39.2 2024-07-17 [1] Bioconductor 3.20 (R 4.4.1)
#> jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.4.0)
#> knitr 1.48 2024-07-07 [1] CRAN (R 4.4.0)
#> lattice 0.22-6 2024-03-20 [1] CRAN (R 4.4.1)
#> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.0)
#> Matrix 1.7-0 2024-04-26 [1] CRAN (R 4.4.1)
#> MatrixGenerics * 1.17.0 2024-05-04 [1] Bioconductor 3.20 (R 4.4.0)
#> matrixStats * 1.3.0 2024-04-11 [1] CRAN (R 4.4.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.0)
#> reprex 2.1.1 2024-07-06 [1] CRAN (R 4.4.0)
#> rlang 1.1.4 2024-06-04 [1] CRAN (R 4.4.0)
#> rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.4.0)
#> rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.4.0)
#> S4Arrays 1.5.5 2024-07-21 [1] Bioconductor 3.20 (R 4.4.1)
#> S4Vectors * 0.43.2 2024-07-17 [1] Bioconductor 3.20 (R 4.4.1)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.0)
#> SingleCellExperiment * 1.27.2 2024-05-24 [1] Bioconductor 3.20 (R 4.4.0)
#> SparseArray 1.5.27 2024-07-29 [1] Bioconductor 3.20 (R 4.4.1)
#> SummarizedExperiment * 1.35.1 2024-06-28 [1] Bioconductor 3.20 (R 4.4.1)
#> UCSC.utils 1.1.0 2024-05-04 [1] Bioconductor 3.20 (R 4.4.0)
#> withr 3.0.0 2024-01-16 [1] CRAN (R 4.4.0)
#> xfun 0.46 2024-07-18 [1] CRAN (R 4.4.0)
#> XVector 0.45.0 2024-05-04 [1] Bioconductor 3.20 (R 4.4.0)
#> yaml 2.3.10 2024-07-26 [1] CRAN (R 4.4.0)
#> zlibbioc 1.51.1 2024-06-05 [1] Bioconductor 3.20 (R 4.4.0)
#>
#> [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
Not sure if this is relevant, but using rowRanges()<- on an SCE also removes the rowData:
library(SingleCellExperiment)

# Dimensions of the toy dataset
n_cells <- 100
n_genes <- 50

# Build a small SingleCellExperiment that carries both colData and rowData
sce <- SingleCellExperiment(
  assays = list(
    counts = matrix(
      rpois(n_cells * n_genes, lambda = 10),
      nrow = n_genes,
      ncol = n_cells
    )
  ),
  colData = DataFrame(
    cell_id = paste0("cell", seq_len(n_cells)),
    condition = sample(c("control", "treatment"), n_cells, replace = TRUE)
  ),
  rowData = DataFrame(
    gene_id = paste0("gene", seq_len(n_genes)),
    gene_name = paste0("Gene_", seq_len(n_genes))
  )
)

# rowData before the row ranges are replaced
head(rowData(sce))

# Replace the row ranges with one empty GRanges per gene
rowRanges(sce) <- do.call(
  GRangesList,
  lapply(seq_len(n_genes), function(i) GRanges())
)

# rowData after the replacement
head(rowData(sce))
I also noticed that when using rowRanges(sce)[]<- instead, the rowData values become NA rather than being removed:
# Subset-assignment variant: assign one empty GRanges per gene through `[<-`
rowRanges(sce)[seq_len(n_genes)] <- do.call(
  GRangesList,
  lapply(seq_len(n_genes), function(i) GRanges())
)
@hpages giving this a nudge in the hope you have time to clarify the issue and whether it can be 'fixed'.