[R-SIG-Finance] Indexing minute data with indexing package

Sun Sep 18 06:51:36 CEST 2011

I've been working with indexing this weekend and have been trying to  
best figure out how to use it. My main goal is to, one ticker at a  
time, load the data into R, then index it on disk using indexing /  
mmap.  Then, when I need it for analysis, I'd like to load it into  
memory for analysis. I've had some success with this, but can't get it  
to work quite the way I want, so I'd like to get some input here.

Here's the process I've been using:
---------
library(indexing)
library(quantmod)
library(TTR)
tkr = "SPY"

# First import the data from CSV file
headerCol = c("Date","Time","Open","High","Low","Close","Volume");
tmp = read.csv(paste(getDWD(),"\\",tkr,"_1.csv",sep=""), header =  
FALSE, stringsAsFactors = FALSE, col.names=headerCol);

tmp$Timestamp = unclass(as.POSIXct(paste(tmp$Date,tmp$Time), format =  
"%m/%d/%Y %H:%M"));

# I converted the timestamp to the unix form, because in order to  
store it on the disk using indexing it must be an integer. I then use  
the following function:

indexTmp = function(tmp, symbol, db=indexenv()) {
	cwd <- getwd();
	setwd(getIWD()); 		#getIWD() returns the directory where I wish to  
store the indexed binary files
	db <- create_index(as.vector(tmp 
$Timestamp),column=paste(symbol,".DateTime",sep=""))
	db <- create_index(tmp$Open,column=paste(symbol,".Open",sep=""))
	db <- create_index(tmp$High,column=paste(symbol,".High",sep=""))
	db <- create_index(tmp$Low,column=paste(symbol,".Low",sep=""))
	db <- create_index(tmp$Close,column=paste(symbol,".Close",sep=""))
	db <- create_index(tmp$Volume,column=paste(symbol,".Volume",sep=""))
	setwd(cwd);
}

# The data from the csv file is indexed on disk as follows:

indexTmp(tmp=tmp,symbol=tkr);
db <- indexenv()
ls(db)
[1] "SPY.Close"    "SPY.DateTime"	"SPY.High"     "SPY.Low"       
"SPY.Open"     "SPY.Volume"

# Now, I'm not sure if this is the best way to do this, but I can't  
really think of another way to do it given what I understand of the  
indexing structure. Ideally, I'd like to pull the data per ticker  
using the following routine:

getIdxData = function(symbol = "", startDate = "", endDate = "",  
fields=c("Open","High","Low","Close","Volume"),db=indexenv()) {
	cwd <- getwd();
	setwd(getIWD());

	# Turn date range into unix time
	sDate = as.numeric(round(unclass(as.POSIXct(startDate))[1]));
	if (endDate == "") eDate =  
as.numeric(round(unclass(as.POSIXct(Sys.time()))[1])) else eDate =  
unclass(as.POSIXct(endDate))[1];

	# Now fetch the data in the index and construct an xts object from it
	dIdx =  
db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".DateTime",sep=""))]
	dIdx=as.POSIXct(dIdx,origin="1970-01-01")

	tmp = NULL;
	cNames = NULL;
	for (field in fields) {
		tmp =  
cbind 
(tmp 
,db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".",field,sep=""))]);
		cNames = c(cNames, paste(symbol,".",field,sep=""));
	}
	setwd(cwd);
	tmp = xts(tmp, order.by = dIdx);
	colnames(tmp) = cNames;
	return(tmp)
}

SPY<-getIdxData(symbol="SPY",startDate="1994-01-01")
Error in as.POSIXlt.character(x, tz, ...) :
   character string is not in a standard unambiguous format
---------

As you can see, a call to this routine results in failure. This is  
because the calls to 'dIdx = db[suppressWarnings(search_index' bit   
evaluates to a vector of NAs, so the as.POSIXct function doesn't work.  
Commenting out the call to as.POSIXct and having the function just  
return tmp before making it an xts object results in a vector with all  
NAs:
---------
SPY<-getIdxData(symbol="SPY",startDate="1997-01-01")
tail(SPY)
            SPY.Open SPY.High SPY.Low SPY.Close SPY.Volume
[1893702,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893703,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893704,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893705,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893706,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893707,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
---------

However, if you do this run the body of the above getIdxData function  
line by line in the global environment, it is successful:
---------
symbol=tkr
startDate="1994-01-01"
endDate=""
fields=c("Open","High","Low","Close","Volume")
db=indexenv()

cwd <- getwd();
setwd(getIWD());

# Turn date range into unix time
sDate = as.numeric(round(unclass(as.POSIXct(startDate))[1]));
if (endDate == "") eDate =  
as.numeric(round(unclass(as.POSIXct(Sys.time()))[1])) else eDate =  
unclass(as.POSIXct(endDate))[1];

# Now fetch the data in the index and construct an xts object from it
dIdx =  
db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".DateTime",sep=""))]
dIdx=as.POSIXct(dIdx,origin="1970-01-01")

tmp = NULL;
cNames = NULL;
for (field in fields) {
	tmp =  
cbind 
(tmp 
,db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".",field,sep=""))]);
	cNames = c(cNames, paste(symbol,".",field,sep=""));
}
setwd(cwd);
tmp = xts(tmp, order.by = dIdx);
colnames(tmp) = cNames;
tail(tmp)
                     SPY.Open SPY.High SPY.Low SPY.Close SPY.Volume
2011-09-03 00:54:00   117.41   117.41  117.41    117.41        100
2011-09-03 00:55:00   117.42   117.42  117.40    117.41       2520
2011-09-03 00:56:00   117.41   117.41  117.40    117.40       2800
2011-09-03 00:57:00   117.41   117.41  117.41    117.41       1685
2011-09-03 00:58:00   117.42   117.42  117.42    117.42        500
2011-09-03 00:59:00   117.42   117.42  117.40    117.42       6418
---------

Apologies for the long post, but there was a lot of sample code to  
provide. My questions are as follows:

1) Why doesn't my getIdxData() function work properly? I'm pretty sure  
it has to do with the way the indexing function evaluates it's  
arguments. I had to use the eval() function in order to trick the  
index into accepting parametrized symbols. Perhaps there's some tricky  
environment / scope issue going on here.
2) For those of you using the index package, what approach would you  
recommend to use for storing equity data like this?

I understand the indexing package is very alpha, and I appreciate the  
work that has gone into it. It seems to be perfect for my purposes, as  
I would like to have many series of minute data at the ready to load  
into memory and analyze on demand.  Hopefully I can get some help here  
to resolve this. Thanks!