Rで簡単に前処理+クロス集計+α

execute system command

system("R --version")

R version 3.1.2 (2014-10-31) -- "Pumpkin Helmet"
Copyright (C) 2014 The R Foundation for Statistical Computing
Platform: i386-w64-mingw32/i386 (32-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under the terms of the
GNU General Public License versions 2 or 3.
For more information about these matters see
http://www.gnu.org/licenses/.

attributes of variable

# result is NULL
attributes(c())
attributes(NULL)
attributes(1)
attributes("1")
attributes(T)

# class, names, dim, row.names
# names -> var$name
attributes(as.Date(Sys.time()))
attributes(Sys.getenv())
attributes(matrix(1:5))
attributes(data.frame(1:5))

# attributes(POSIXlt)
# $names
#  [1] "sec"    "min"    "hour"   "mday"   "mon"    "year"   "wday"   "yday"   "isdst"
# [10] "zone"   "gmtoff"
# $class
# [1] "POSIXlt" "POSIXt"
#
# $tzone
# $[1] ""    "JST" "JDT"

date(seq and datetime)

# seq for date
last.ym <- function (x) {
  d <- seq(as.Date(x), len=2, by="-1 month")[2]
  return(format(d, "%Y%m"))
}

# POSIXlt(as datetime)
month.first.date <- function (x) {
  t <- as.POSIXlt(x)
  e <- as.POSIXlt(t - (t$mday - 1) * 60 * 60 * 24)
  return(as.Date(e))
}

package version

packageVersion("dplyr")
[1]0.4.1
## data processing library

library(dplyr)
library(data.table)

> library(dplyr)

 次のパッケージを付け加えます: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise, summarize

The following object is masked from ‘package:stats’:

    filter

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

> library(data.table)
data.table 1.9.4  For help type: ?data.table
*** NB: by=.EACHI is now explicit. See README to restore previous behaviour.

 次のパッケージを付け加えます: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, last

fast read file

df1 <- data.table::fread(df1.path, header = T, stringsAsFactors = F)

pipe

easy to understand.

  • join
  • add column
  • filter
  • delete column(or choose column)
# %.% is duplicated.
df1 %>%
  dplyr::left_join(df2, by="id") %>%
  dplyr::inner_join(df3, by="id")
  dplyr::mutate(col5 = ifelse(is.na(df3$col4), 0, 1)) %>%
  dplyr::filter(mem_type %in% c("a", "z")) %>%
  dplyr::select(-col1, -col2, col3, col2)

grouping and summarize

grouped <- device %>%
  dplyr::group_by(id) %>%
  dplyr::summarize(last_timestamp = max(timestamp))

# distinct
# Don't use group for instead of unique().
dplyr::distinct(df1, id, type)

sql like

library(sqldf)
var.name.chain.dot <- data.frame(1:5)
var_name_chain_dot <- var.name.chain.dot
sqldf("select count(*) from var_name_chain_dot")
# sqldf("select count(*) var.name.chain.dot")
# ↑ error

delete var and gc

rm(var1, var2)
gc()

# for gloval env
clear.var.global <- function(vars) {
  rm(list=vars, pos = ".GlobalEnv")
  gc()
}

cross tabulation

library(reshape2)

# one column
dcast(df1, value.var = "id",
      type ~ .,
      length)
# table(d$g)

# cross tabulation
dcast(df1, value.var = "id",
      status ~ type,
      length)
# table(d$g, d$s)

# multi column cross tabulation
dcast(df1, value.var = "id",
      status + type + category ~ is_max,
      length)

arguments parser

# Python optparse like
library(optparse)
optslist <- list(
  make_option("--flag1",          action="store_T", default=F,      help="A Flag"),
  make_option(c("-f", "--flag2"), action="store",   default=F,      help="A Flag", type="logical"),
  make_option("--num",            type="integer",   default=1,      help="A Number"),
  make_option("--raw-data",       type="character", default="hoge", help="a string")
)
parser <- OptionParser(option_list = optslist)
opts <- parse_args(parser)

cat(sprintf("flag1=%s, flag2=%s, num=%d, raw-data=%s",
            opts$flag1, opts$flag2, opts$num, opts[["raw-data"]]))

exit or assertion like function

stop("Message")

stopifnot(T)
stopifnot(F == T)
# -> Error:  F == T は TRUE ではありません

random, shuffle(sample)

m1 <- matrix(runif(100), nrow=10)
m2 <- matrix(sample(c(rep(0, 5), rep(1, 5)), size = 9), ncol=3)

network graph

library(igraph)
g <- graph.adjacency(m1 <= 0.5, mode="undirected")

# edge
E(g)$color <- "grey"
E(g)$width <- 1
E(g)$weight <- 100
E(g)$label <- letters[1:length(E(g))]

# node
V(g)$size <- 5
V(g)$color <- "blue"
V(g)$shape <- "circle"
V(g)$label <- letters[1:length(V(g))]
V(g)$label.cex <- 1.5                   # font size for label

plot

plot(g)
plot(g, layout=layout.circle)
plot(g, layout=layout.star)
plot(g, layout=layout.grid)
plot(g, layout=layout.kamada.kawai)

# directed and weighted
g <- graph.adjacency(m2, mode="directed", weighted=TRUE)
plot(g, vertex.color="white", vertex.label = V(g)$name, edge.color = "black", edge.label = E(g)$weight, vertex.size=10)

# tree
g <- graph.tree(15)
plot.igraph(g)

# dendrogram like
lay <- layout.reingold.tilford(g)
plot(g, layout=lay)