ps Feature Request: System Disk IO Counters

Thank you for developing and maintaining such a useful R package.

For my use-case, it would be very beneficial to have a function to return disk IO counters (read bytes, write bytes, transfers, etc) for Linux and Windows, similar to https://psutil.readthedocs.io/en/latest/#psutil.disk_io_counters. I have implemented a potential version for Linux as a bolt-on to this package, however cannot implement the Windows version as it would require updating the C source for this package and I'm not very good with C. I have a workaround using powershell, but it is less than ideal.

I'm unsure if you would prefer me to open this as PR, as I wouldn't be able to implement the Windows/OSX versions so it would be incomplete

My implementation for Linux, heavily inspired by psutil

ps_disk_io_counters <- function(perdisk = FALSE) {
  if (ps_os_name() == "WINDOWS") {
    return(.ps_disk_io_counters_windows(perdisk))
  } else {
    return(.ps_disk_io_counters_linux(perdisk))
  }
}


#' Internal disk IO counters for Linux, reads lines from diskstats if it exists
#' or /sys/block as a backup
.ps_disk_io_counters_linux <- function(perdisk = FALSE) {
  if (file.exists("/proc/diskstats")) {
    disk_info <- .read_procfs()
  } else if (dir.exists("/sys/block")) {
    disk_info <- .read_sysfs()
  } else {
    stop("Can't read, neither /proc/diskstats or /sys/block on this system")
  }

  # Keep all if perdisk, only keep non-partitions if perdisk=FALSE to get correct sums
  disk_info <- disk_info[perdisk | sapply(disk_info$name, .is_storage_device), ]

  if (perdisk) {
    return(disk_info)
  } else {
    # Sum all numeric cols, convert to df via list to ensure 1 row
    total_info <- data.frame(as.list(colSums(disk_info[, -1])))
    # Add no name
    total_info$name <- NA
    return(total_info)
  }

  return(disk_io)
}

#' Whether a named drive (e.g. 'sda') is a real storage device, or a virtual one
.is_storage_device <- function(name, including_virtual = TRUE) {
  name <- stringr::str_replace_all(name, "/", "!")
  if (including_virtual) {
    path <- file.path("/sys/block", name)
  } else {
    path <- file.path("/sys/block", name, "device")
  }

  return(dir.exists(path))
}

#' Logic to read disk IO stats from /proc/diskstats
.read_procfs <- function() {
  file <- readLines("/proc/diskstats")
  # Get info as list of vectors (could be different lengths)
  lines <- stringr::str_split(stringr::str_trim(file), "\\s+")
  # Pre-allocate matrix of info as NAs
  mat <- matrix(data = NA_character_, nrow = length(lines), ncol = 10)
  for (i in seq_along(lines)) {
    # For each line, check length and insert into matrix as appropriate
    line <- lines[[i]]
    flen <- length(line)
    if (flen == 15) {
      # Linux 2.4
      mat[i, 1] <- line[[4]] # name
      mat[i, 2] <- line[[3]] # reads
      mat[i, 3:9] <- line[5:11] # reads_merged, rbytes, rtime, writes, writes_merged, wbytes, wtime
      mat[i, 10] <- line[[14]] # busy_time
    } else if (flen == 14 || flen >= 18) {
      # Linux 2.6+, line referring to a disk
      mat[i, 1] <- line[[3]] # name
      mat[i, 2:9] <- line[5:12] # reads, reads_merged, rbytes, rtime, writes, writes_merged, wbytes, wtime
      mat[i, 10] <- line[[14]] # busy_time
    } else if (flen == 7) {
      # Linux 2.6+, line referring to a partition
      mat[i, 1] <- line[[2]] # name
      mat[i, 2] <- line[[4]] # reads
      mat[i, 4:5] <- line[5:6] # rbytes, writes
      mat[i, 8] <- line[7] # wbytes
    } else {
      stop("Cannot read diskstats file")
    }
  }

  # Add names and convert types as appropriate
  return(data.frame(
    name = mat[, 1],
    read_count = as.numeric(mat[, 2]),
    read_merged_count = as.numeric(mat[, 3]),
    read_bytes = as.numeric(mat[, 4]) * 512, # Multiple by disk sector size to get bytes
    read_time = as.numeric(mat[, 5]),
    write_count = as.numeric(mat[, 6]),
    write_merged_count = as.numeric(mat[, 7]),
    write_bytes = as.numeric(mat[, 8]) * 512, # Multiple by disk sector size to get bytes
    write_time = as.numeric(mat[, 9]),
    busy_time = as.numeric(mat[, 10])
  ))
}

#' Read disk IO from each device folder in /sys/block
.read_sysfs <- function() {
  # Get stat files for each block
  blocks <- list.dirs("/sys/block/", recursive = FALSE)
  all_files <- list.files(blocks, full.names = TRUE)
  stats <- all_files[stringr::str_ends(all_files, "stat")]

  # Pre-allocate list for dfs
  disk_info <- vector(mode = "list", length = length(stats))
  for (i in seq_along(stats)) {
    # Read in each file, get the field
    stat <- stats[[i]]
    fields <- readLines(stat)
    fields <- unlist(stringr::str_split(stringr::str_trim(fields), "\\s+"))

    # Save all info as a dataframe
    block_info <- data.frame(
      name = stringr::str_split(stat, "/")[[1]][[5]], # Extract name from stat filepath
      read_count = as.numeric(fields[[1]]),
      read_merged_count = as.numeric(fields[[2]]),
      read_bytes = as.numeric(fields[[3]]) * 512, # Multiple by disk sector size to get bytes
      read_time = as.numeric(fields[[4]]),
      write_count = as.numeric(fields[[5]]),
      write_merged_count = as.numeric(fields[[6]]),
      write_bytes = as.numeric(fields[[7]]) * 512, # Multiple by disk sector size to get bytes
      write_time = as.numeric(fields[[8]]),
      busy_time = as.numeric(fields[[10]])
    )

    disk_info[[i]] <- block_info
  }
  return(do.call(rbind, disk_info))
}

Apr 20 '23 13:04 michaelwalshe

Thanks! If you could submit it as a PR, that would be great. It is ok if a function is only implemented on Linux, we can always add a Windows/macOS implementation later, if it is possible to have one.

Apr 20 '23 13:04 gaborcsardi

Okay thanks, I'll submit this as a draft PR, though I haven't really submitted to R packages before so it may be a while as this'll need tests and documentation etc

Apr 20 '23 14:04 michaelwalshe

Thanks! It is fine to submit a first PR w/o docs and tests.

Apr 20 '23 14:04 gaborcsardi