[Rd] a fast table() for the 1D case
    Hervé Pagès 
    hpages at fhcrc.org
       
    Mon Sep 16 21:55:23 CEST 2013
    
    
  
Any chance some improvements can be made on table()?
table() is probably one of the most used R functions when working
interactively. Unfortunately it can be incredibly slow, especially
on a logical vector where a simple sum() is hundred times faster
(I actually got into the habit of using sum() instead of table()).
The table1D() proposal below doesn't go as far as using sum() on a
logical vector but it already provides significant speedups for most
use cases.
Thanks,
H.
On 08/09/2013 01:19 AM, Hervé Pagès wrote:
> Hi,
>
> table1D() below can be up to 60x faster than base::table() for the 1D
> case. Here are the detailed speedups compared to base::table().
>
>    o With a logical vector of length 5M:     11x faster
>                                      (or more if 'useNA="always"')
>
>    o With factor/integer/numeric/character of length 1M and 9 levels
>      (or 9 distinct values for non-factors):
>       - factor:                              60x faster
>       - integer/numeric vector:              12x faster
>       - character vector:                   2.4x faster
>
>    o With factor/integer/numeric/character of length 1M and no
>      duplicates:
>        - factor:                              5x faster
>        - integer vector:                      2x faster
>        - numeric vector:                    1.7x faster
>        - character vector:       no significant speedup
>
> Would be great if this improvement could make it into base::table().
>
> Thanks,
> H.
>
>    ## A fast table() implementation for the 1D case (replacing the '...'
>    ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments
>    ## which are unrelated to performance).
>
>    table1D <- function(x, exclude = if (useNA == "no") c(NA, NaN),
>                        useNA = c("no", "ifany", "always"))
>    {
>      if (!missing(exclude) && is.null(exclude)) {
>          useNA <- "always"
>      } else {
>          useNA <- match.arg(useNA)
>      }
>      if (useNA == "always" && !missing(exclude))
>          exclude <- setdiff(exclude, NA)
>      if (is.factor(x)) {
>          x2 <- levels(x)
>          append_NA <- (useNA == "always" ||
>                        useNA == "ifany" && any(is.na(x))) &&
>                       !any(is.na(x2))
>          if (append_NA) {
>              x2 <- c(x2, NA)
>              x <- factor(x, levels=x2, exclude=NULL)
>          }
>          t2 <- tabulate(x, nbins=length(x2))
>          if (!is.null(exclude)) {
>              keep_idx <- which(!(x2 %in% exclude))
>              x2 <- x2[keep_idx]
>              t2 <- t2[keep_idx]
>          }
>      } else {
>          xx <- match(x, x)
>          t <- tabulate(xx, nbins=length(xx))
>          keep_idx <- which(t != 0L)
>          x2 <- x[keep_idx]
>          t2 <- t[keep_idx]
>          if (!is.null(exclude)) {
>              exclude <- as.vector(exclude, typeof(x))
>              keep_idx <- which(!(x2 %in% exclude))
>              x2 <- x2[keep_idx]
>              t2 <- t2[keep_idx]
>          }
>          oo <- order(x2)
>          x2 <- x2[oo]
>          t2 <- t2[oo]
>          append_NA <- useNA == "always" && !any(is.na(x2))
>          if (append_NA) {
>              x2 <- c(x2, NA)
>              t2 <- c(t2, 0L)
>          }
>      }
>      ans <- array(t2)
>      dimnames(ans) <- list(as.character(x2))
>      names(dimnames(ans)) <- "x"  # always set to 'x'
>      class(ans) <- "table"
>      ans
>    }
>
> table1D() also fixes some issues with base::table() that can be exposed
> by running the tests below.
>
>    test_table <- function(FUN_NAME)
>    {
>      FUN <- match.fun(FUN_NAME)
>
>      .make_target <- function(target_names, target_data)
>      {
>          ans <- array(target_data)
>          dimnames(ans) <- list(as.character(target_names))
>          names(dimnames(ans)) <- "x"
>          class(ans) <- "table"
>          ans
>      }
>
>      .check_identical <- function(target, current, varname, extra_args)
>      {
>          if (identical(target, current))
>              return()
>          if (extra_args != "")
>              extra_args <- paste0(", ", extra_args)
>          cat("unexpected result for '", FUN_NAME,
>              "(x=", varname, extra_args, ")'\n", sep="")
>      }
>
>      .test_exclude <- function(x, varname, target_names0, target_data0,
> exclude)
>      {
>          extra_args <- paste0("exclude=", deparse(exclude))
>          current <- FUN(x=x, exclude=exclude)
>          target_names <- target_names0
>          target_data <- target_data0
>          if (is.null(exclude)) {
>              if (!any(is.na(target_names))) {
>                  target_names <- c(target_names, NA)
>                  target_data <- c(target_data, 0L)
>              }
>          } else {
>              if (!is.factor(x)) {
>                  exclude <- as.vector(exclude, typeof(x))
>              } else if (!any(is.na(levels(x)))) {
>                  exclude <- union(exclude, NA)
>              }
>              exclude_idx <- match(exclude, target_names, nomatch=0L)
>              if (any(exclude_idx != 0L)) {
>                  target_names <- target_names[-exclude_idx]
>                  target_data <- target_data[-exclude_idx]
>              }
>          }
>          target <- .make_target(target_names, target_data)
>          .check_identical(target, current, varname, extra_args)
>      }
>
>      .do_exclude_tests <- function(x, varname, target_names0, target_data0,
>                                    more_excludes=NULL)
>      {
>          .BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN)
>          excludes <- c(.BASIC_EXCLUDES, more_excludes)
>          for (exclude in excludes)
>              .test_exclude(x, varname, target_names0, target_data0,
> exclude)
>      }
>
>      ## Test on a numeric vector.
>      x0 <- numeric(0)
>      .do_exclude_tests(x0, "x0", character(0), integer(0), list(5.3))
>
>      x1_target_names0 <- c(-9, 4, 5.3, NaN, NA)
>      x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L)
>      x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA)
>      excludes <- list(c(5.3, -9),
>                       c(5.3, NA, -9),
>                       c(5.3, NaN, -9),
>                       c(5.3, 80, -9),
>                       x1_target_names0)
>      .do_exclude_tests(x1, "x1", x1_target_names0, x1_target_data0,
> excludes)
>
>      x2_target_names0 <- c(-9, 4, 5.3, NA, NaN)
>      x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L)
>      x2 <- rev(x1)
>      .do_exclude_tests(x2, "x2", x2_target_names0, x2_target_data0,
> excludes)
>
>      x3_target_names0 <- c(-9, 4, 5.3)
>      x3_target_data0 <- c(1L, 2L, 1L)
>      x3 <- c(5.3, 4, 4, -9)
>      .do_exclude_tests(x3, "x3", x3_target_names0, x3_target_data0,
> excludes)
>
>      ## Test on a factor.
>      f0 <- factor()
>      .do_exclude_tests(f0, "f0", character(0), integer(0), list(5.3))
>
>      f1 <- factor(x1)
>      .do_exclude_tests(f1, "f1", x1_target_names0, x1_target_data0,
> excludes)
>
>      f2 <- factor(x1, exclude=NULL)
>      .do_exclude_tests(f2, "f2", x1_target_names0, x1_target_data0,
> excludes)
>
>      f3_target_names0 <- c(6.82, x1_target_names0, -7.66)
>      f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L)
>      f3 <- factor(x3, levels=f3_target_names0, exclude=NULL)
>      .do_exclude_tests(f3, "f3", f3_target_names0, f3_target_data0,
> excludes)
>
>      x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66)
>      x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L)
>      f4 <- factor(x3, levels=x4_target_names0, exclude=NULL)
>      .do_exclude_tests(f4, "f4", x4_target_names0, x4_target_data0,
> excludes)
>
>      ## Test on a character vector.
>      c0 <- character(0)
>      .do_exclude_tests(c0, "c0", character(0), integer(0), list("Aa"))
>
>      c1 <- c("b", "AA", "", "a", "ab", "NaN", "4", "Aa", NA, "NaN",
> "ab", NA)
>      c1_target_names0 <- sort(unique(c1), na.last=TRUE)
>      c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L)
>      excludes <- list(c("Aa", 4, ""),
>                       c("Aa", NA, 4, "", "Z"),
>                       c("Aa", NaN, 4, "", "Z"),
>                       c("Aa", 4, "", "Z"))
>      .do_exclude_tests(c1, "c1", c1_target_names0, c1_target_data0,
> excludes)
>
>      c2 <- c("b", "AA", "", "a", "ab", "", "", "4", "Aa", "ab")
>      c2_target_names0 <- sort(unique(c2), na.last=TRUE)
>      c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L)
>      .do_exclude_tests(c2, "c2", c2_target_names0, c2_target_data0,
> excludes)
>
>      ## Test on a logical vector.
>      l0 <- logical(0)
>      .do_exclude_tests(l0, "l0", character(0), integer(0), list(c("Aa",
> TRUE)))
>
>      l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE)
>      l1_target_names0 <- c(FALSE, TRUE, NA)
>      l1_target_data0 <- c(4L, 2L, 3L)
>      excludes <- list(c(TRUE, FALSE),
>                       c("Aa", NA, TRUE),
>                       c("Aa", NaN, TRUE),
>                       l1_target_names0)
>      .do_exclude_tests(l1, "l1", l1_target_names0, l1_target_data0,
> excludes)
>
>      l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
>      l2_target_names0 <- c(FALSE, TRUE)
>      l2_target_data0 <- c(4L, 2L)
>      .do_exclude_tests(l2, "l2", l2_target_names0, l2_target_data0,
> excludes)
>    }
>
>    test_table("table")    # will display some issues
>    test_table("table1D")  # should not display anything
>
>
>> sessionInfo()
> R version 3.0.1 (2013-05-16)
> Platform: x86_64-unknown-linux-gnu (64-bit)
>
> locale:
>   [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
>   [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
>   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
>   [7] LC_PAPER=C                 LC_NAME=C
>   [9] LC_ADDRESS=C               LC_TELEPHONE=C
> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
>
> attached base packages:
> [1] stats     graphics  grDevices utils     datasets  methods   base
>
> loaded via a namespace (and not attached):
> [1] tools_3.0.1
>
-- 
Hervé Pagès
Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M1-B514
P.O. Box 19024
Seattle, WA 98109-1024
E-mail: hpages at fhcrc.org
Phone:  (206) 667-5791
Fax:    (206) 667-1319
    
    
More information about the R-devel
mailing list