First the proc.time() function is useful to see the runtime of a program. Here's an example that will give the run time in seconds:
time <- proc.time()
# - run some function - #
proc.time()[3] - time[3]
Some of the functions that I wrote for missing data have proven to be quite useful as well. First I always forget the is.na() function and often confuse complete.cases with na.omit
### Ways to find what values are missing
complete.cases(sed) #T/F if row has any missing
na.omit(sed) #gives only rows with no missing values
is.na(sed)==F #have to use is.na as a logical argument, not ==NA
Here are some functions that I think are quite useful and I am surprised they aren't available in R, or if they are, I don't know what they are called
### Identify cols that are entirely NA
NA.cols<-function(X) {
cols<-apply(X,2,function(x) sum(is.na(x)))==nrow(X)
names(cols)<-colnames(X)
na.cols<-which(cols==T)
if(length(na.cols)==0) na.cols<-'Each column has at least one non NA value'
return(na.cols)
}
### Identify rows that are entirely NA (opposite of
### complete.cases)
NA.rows<-function(X) {rows<-apply(X,1,function(x) sum(is.na(x)))==ncol(X)
names(rows)<-rownames(X)
na.rows<-which(rows==T)
if(length(na.rows)==0) na.rows<-'Each row has at least one non NA value'
return(na.rows)
}
### Identify cols that have no have no NA values (I guess you
### could also transpose the data and do na.omit)
complete.cols<-function(X) {
cols<-apply(X,2,function(x) sum(is.na(x)))==0
names(cols)<-colnames(X)
complete.cols<-which(cols==T)
if(length(complete.cols)==0) complete.cols<-'There are no complete variables'
return(complete.cols)
}
I also have my simple imputation function. For real imputation techniques I recommend the Amelia package (at least thats what I used for the multivariate class, and it seemed to have some nice features). I found using proc.time() that it takes like 22 seconds to run on my machine, mostly because the 'hot deck' imputation could probably be more efficient. The real lesson here is that when you write a function, you get to name it after yourself
### A function that returns some simple imputation methods
Alan.imputations<-function(X) {
require(fields)
mean.x<-rep(NA,ncol(X))
median.x<-rep(NA,ncol(X))
min.x<-rep(NA,ncol(X))
max.x<-rep(NA,ncol(X))
X.mean<-X
X.median<-X
X.min<-X
X.max<-X
X.zero<-X
X.sample<-X
for(j in 1:ncol(X)) {
if(is.numeric(X[,j])==T) {
mean.x[j]<-mean(X[,j],na.rm=T)
median.x[j]<-median(X[,j],na.rm=T)
min.x[j]<-min(X[,j],na.rm=T)
max.x[j]<-max(X[,j],na.rm=T)
}
}
for(j in 1:ncol(X)) {
X.mean[is.na(X.mean[,j]),j]<-mean.x[j]
X.median[is.na(X.median[,j]),j]<-median.x[j]
X.min[is.na(X.min[,j]),j]<-min.x[j]
X.max[is.na(X.max[,j]),j]<-max.x[j]
i.na<-is.na(X[,j])
if(sum(i.na)!=length(i.na))
X.sample[i.na,j]<-sample(na.omit(X[,j]),sum(i.na),replace=T)
}
X.zero[is.na(X.zero)]<-0
cols<-apply(X,2,function(x) sum(is.na(x)))==nrow(X)
na.cols<-which(cols==T)
comp.X<-na.omit(X[,-na.cols])
numeric.cols<-rep(NA,ncol(comp.X))
for(i in 1:ncol(comp.X)) {
numeric.cols[i] <- is.numeric(comp.X[,i])
}
X.na<-X[,-na.cols];X.na<-X.na[,numeric.cols]
NA.index<-which(is.na(X.na)==T,arr.ind=T)
sX <- scale(X.na)
X.na.center<-X.na - matrix(attr(sX,"scaled:center"),nrow(X.na),ncol(X.na),byrow=T)
X.na.scaled<-X.na.center/matrix(attr(sX,"scaled:scale"),nrow(X.na.center),ncol(X.na.center),byrow=T)
new.na.cols<-NA.cols(X.na.scaled)
X.na.scaled<-X.na.scaled[,-new.na.cols]
new.complete.cols<-complete.cols(X.na.scaled)
dist<-matrix(NA,nrow(X.na.scaled),nrow(X.na.scaled))
for(i in 1:nrow(X.na.scaled)) {
dist[i,]<-rdist(X.na.scaled[i,new.complete.cols],X.na.scaled[,new.complete.cols])
}
for(i in 1:nrow(dist)) {
if(complete.cases(X.na.scaled)[i]==F) {
j<-2
min<-which(dist[i,]==dist[i,order(dist[i,])[j]])
while(sum(is.na(X.na.scaled[min,is.na(X.na.scaled[i,])]))!=0) {
min<-which(dist[i,]==dist[i,order(dist[i,])[j]])
j<-j+1
}
X.na.scaled[i,is.na(X.na.scaled[i,])]<- X.na.scaled[min,is.na(X.na.scaled[i,])]
}
}
X.hotdeck<-X.na.scaled*matrix(attr(sX,"scaled:scale")[-new.na.cols],nrow(X.na.scaled),ncol(X.na.scaled),byrow=T)+matrix(attr(sX,"scaled:center")[-new.na.cols],nrow(X.na.scaled),ncol(X.na.scaled),byrow=T)
alan<-list(max=X.max, mean=X.mean, median=X.median, min=X.min, sample=X.sample, zero=X.zero, hotdeck=X.hotdeck)
return(alan)
}
(note that some of the spacing gets messed up, just in case something doesn't run right)
If you like profiling, you might get a kick out of this, from http://cran.r-project.org/web/packages/profr/profr.pdf:
ReplyDeletelibrary(profr)
glm_ex <- profr(example(glm))
head(glm_ex)
summary(glm_ex)
also:
> a<-data.frame(matrix(c(1,NA,3,4,5,6,7,8,9),3,3))
> a
X1 X2 X3
1 1 4 7
2 NA 5 8
3 3 6 9
> colSums(a)
X1 X2 X3
NA 15 24
> is.na(colSums(a))
X1 X2 X3
TRUE FALSE FALSE
> colSums(is.na(a))
X1 X2 X3
1 0 0
> colSums(is.na(a))==nrow(a)
X1 X2 X3
FALSE FALSE FALSE
This is a great function, the first to go in my .RProfile
ReplyDelete.ls.objects <- function (pos = 1, pattern, order.by = "Size", decreasing=TRUE, head = TRUE, n = 10) {
# based on postings by Petr Pikal and David Hinds to the r-help list in 2004
# modified by: Dirk Eddelbuettel (http://stackoverflow.com/questions/1358003/tricks-to-manage-the-available-memory-in-an-r-session)
# I then gave it a few tweaks (show size as megabytes and use defaults that I like)
# a data frame of the objects and their associated storage needs.
napply <- function(names, fn) sapply(names, function(x)
fn(get(x, pos = pos)))
names <- ls(pos = pos, pattern = pattern)
obj.class <- napply(names, function(x) as.character(class(x))[1])
obj.mode <- napply(names, mode)
obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class)
obj.size <- napply(names, object.size) / 10^6 # megabytes
obj.dim <- t(napply(names, function(x)
as.numeric(dim(x))[1:2]))
vec <- is.na(obj.dim)[, 1] & (obj.type != "function")
obj.dim[vec, 1] <- napply(names, length)[vec]
out <- data.frame(obj.type, obj.size, obj.dim)
names(out) <- c("Type", "Size", "Rows", "Columns")
out <- out[order(out[[order.by]], decreasing=decreasing), ]
if (head)
out <- head(out, n)
out
}