Das Bashskript für den Faktorendownload:
#!/bin/bash
rm *.csv
rm *.tar.gz
wget https://coinmetrics.io/data/all.tar.gz
tar -xzvf all.tar.gz
Am besten kommt das in einen Ordner mit Namen "coinmetrics" rein, dann klappt auch das folgende R-Skript, welches sich direkt ausserhalb des Ordners befindet. Der Code ist super hässlich, enthält redundante Sachen oder Kram, welcher vom Debugging stammt und lässt sich bestimmt noch optimieren. Auch die Variablennamen sind teils nichtssagend. Ich hoffe, den demnächst überarbeiten zu können:
keep_cols <- function(data, keep=NULL){
if (is.null(keep))
return(data);
to_keep <- rep(FALSE, ncol(data));
for (k in keep)
to_keep <- to_keep | grepl(k, colnames(data));
ret <- data[,to_keep];
ret <- ret[,order(colnames(ret))];
return(ret);
}
merge_data <- function(data, keep=NULL, drop_cols=FALSE){
ret <- NULL;
for (i in 1:length(data)){
if (is.null(keep) || names(data)[i] %in% keep){
datum <- data[[i]];
if (drop_cols)
datum <- datum[,!is.na(datum[nrow(datum),])]
index <- which(grepl("date", colnames(datum)));
if (length(index) == 1){
if (is.null(ret)){
ret <- datum;
byx <- colnames(datum)[index];
} else {
byy <- colnames(datum)[index];
ret <- merge(x=ret, y=datum, by.x=byx, by.y=byy);
}
}
}
}
return(ret);
}
read_dir <- function(dir){
files <- list.files(dir, pattern="*.csv");
ret <- list();
for (file in files){
datum <- read.csv(file=paste(dir, file, sep="/"));
colnames(datum) <- paste(file, colnames(datum), sep=".");
ret <- append(ret, list(datum));
}
names(ret) <- files;
return(ret);
}
make_open_close <- function(data, number_of_days=7, date="DATE", price="PRICE"){
x <- data;
# extrahiere ersten und letzten Tag der Woche
if (number_of_days > 1){
x <- cbind(x, "WEEK_DAY"=((1:nrow(x))%%number_of_days));
open <- x[x$WEEK_DAY==1, price];
close <- x[x$WEEK_DAY==0, price];
open_date <- x[x$WEEK_DAY==1, date];
close_date <- x[x$WEEK_DAY==0, date];
} else {
open <- x[,price];
close <- x[,price];
open_date <- x[,date];
close_date <- x[,date];
}
# passe die Länge der Vektoren an sodass es für jeden Tag eine Entsprechung gibt, einer der Fälle ist glaube ich unnötig
if (length(open) > length(close)) {open <- open[1:length(close)]; open_date <- open_date[1:length(close)];};
if (length(open) < length(close)) {close <- close[1:length(open)]; close_date <- close_date[1:length(open)];};
return(data.frame("OPEN_DATE"=open_date, "CLOSE_DATE"=close_date, "OPEN_PRICE"=open, "CLOSE_PRICE"=close));
}
make_return <- function(data){
return((data$CLOSE_PRICE - data$OPEN_PRICE)/data$OPEN_PRICE);
}
test_levels <- function(data, id=NULL){
if (!is.null(id))
print(id);
bar <- table(data);
print(bar[bar == 1]);
}
make_breaks <- function(data, breaks=5, ignore=c()){
ret <- data;
for (i in 3:ncol(ret))
if (!(colnames(ret)[i] %in% ignore))
ret[,i] <- cut(ret[,i], breaks=breaks);
return(ret);
}
make_formula <- function(data, y=NULL, ignore=c()){
if (is.null(y))
stop("id for y-variable missing");
ret <- paste(setdiff(colnames(data), c(y, ignore)), collapse=" + ");
ret <- paste(y, ret, sep=" ~ ");
return(ret);
}
make_prediction <- function(data, price_id=NULL, shift=1, breaks=5, ignore=c()){
if (is.null(price_id))
stop("price_id missing");
R <- make_breaks(data, breaks=breaks, ignore=price_id);
shifted_R <- R;
shifted_R[1:(length(R[,price_id])-shift), price_id] <- R[(shift+1):length(R[,price_id]), price_id];
shifted_R <- shifted_R[1:(length(R[,price_id])-shift),];
formula <- make_formula(shifted_R, y=price_id, ignore=ignore);
shifted_fit <- lm(formula, data=shifted_R);
to_predict <- R[(nrow(R)-shift+1):nrow(R),];
to_predict[,price_id] <- NA;
# were levels for prediction used in fit?
b <- rep(TRUE,3); for (i in 4:length(to_predict)){b <- c(b, to_predict[1,colnames(to_predict)[i]] %in% shifted_R[,colnames(to_predict)[i]])}
if (sum(!b) > 0)
cat(paste("New levels: ", paste(colnames(to_predict)[!b], collapse=", "), sep=""));
# replace non-existing factors with last one in shifted_R
# actually it should be better to use factor from fit, which is closest
to_predict[1,!b] <- shifted_R[nrow(shifted_R),!b];
prediction <- predict(shifted_fit, to_predict);
ret <- list(prediction, to_predict, R, shifted_R, shifted_fit);
names(ret) <- c("PREDICTION", "TO_PREDICT", "R", "SHIFTED_R", "SHIFTED_FIT");
return(ret);
}
breaks = 15;
price_id = "btc.csv.price.USD.";
x <- read_dir("coinmetrics");
x <- merge_data(x, keep=c("btc.csv", "ltc.csv", "eth.csv", "xrp.csv", "doge.csv", "usdt.csv", "gold.csv", "sp500.csv"), drop_cols=TRUE);
x <- keep_cols(x, c("btc.csv.date", "price", "value", ".txVolume.", "marketcap"));
colnames(x)[colnames(x)=="btc.csv.date"] <- "DATE";
# simply switches columns:
index <- which(colnames(x)==price_id);
indices <- 1:ncol(x);
indices[2] <- index;
indices[index] <- 2;
x <- x[,indices];
x <- x[apply(x, MARGIN=1, function(ret){!any(is.na(ret))}),];
x <- x[(nrow(x) %% 7 + 1):nrow(x),];
R <- make_open_close(x, price=price_id)[,c("OPEN_DATE", "CLOSE_DATE")];
for (i in 2:ncol(x))
R <- cbind(R, make_return(make_open_close(x, price=colnames(x)[i])));
colnames(R)[3:length(colnames(R))] <- colnames(x)[2:length(colnames(x))];
#R <- R[1:(nrow(R)-1),];
svd_R <- R;
foo <- make_prediction(R, price_id=price_id, shift=1, breaks=breaks, ignore=c("OPEN_DATE", "CLOSE_DATE"));
prediction <- foo$PREDICTION;
R <- foo$R;
shifted_R <- foo$SHIFTED_R
shifted_fit <- foo$SHIFTED_FIT;
# evaluations
X <- model.matrix(shifted_fit);
beta <- coefficients(shifted_fit);
X <- X[,!is.na(beta)];
beta <- beta[!is.na(beta)];
evaluation_historically <- X %*% beta;
correct_predictions <- (evaluation_historically < 0) == (shifted_R[,price_id] < 0);
print(sum(correct_predictions)/nrow(shifted_R));
n <- 10^8; print(sum((ceiling(runif(n)*nrow(shifted_R)) <= sum(shifted_R[,price_id] < 0)) == (ceiling(runif(n)*nrow(shifted_R)) <= sum(shifted_R[,price_id] < 0)))/n);
dev.new(width=19, height=3.5);
matplot(R[,price_id], type="l", col=c("blue"), lty = c(1), xaxt="n");
abline(h=0, col="black");
lines((1:length(evaluation_historically))+1, evaluation_historically, col="violet");
points(length(R[,price_id])+1, prediction, col="violet", pch=16);
ops <- c(0.06927308);
old_predictions <- data.frame("x"=(nrow(R)-length(ops)+1):nrow(R), "y"=ops);
points(old_predictions$x, old_predictions$y, col="violet", pch=1);
int <- floor(nrow(R)/7) - 1; # warum auch immer minus 1?
axis(1, at = int*(0:7)+1, labels = R$CLOSE_DATE[int*(0:7)+1]);
legend("left", legend = c("historic", "learned"), col = c("blue", "violet"), lty = c(1,1), lwd = 1 , xpd = T );
title(paste("Weekly BTC return prediction via factorial regression (breaks = ", breaks, "):\nPredicted return for ", as.Date(R$CLOSE_DATE[nrow(R)])+7, ": ", prediction, sep=""));
min_val <- min(c(R[,price_id], evaluation_historically));
colors <- c("red", "green");
correct_predictions <- (evaluation_historically < 0) == (shifted_R[,price_id] < 0);
correct_predictions <- as.integer(correct_predictions)+1;
for (i in 1:length(correct_predictions)){rect(i+1-0.5, min_val, i+2-0.5, min_val-0.1, col=colors[correct_predictions[i]], border=NA)}
factors <- tail(colnames(R),-3);
f1 <- paste(factors[1:14], collapse=", ");
f2 <- paste(factors[15:19], collapse=", ");
s <- paste(f1, f2, sep=",\n")
title(sub=s, adj=0, line=3, font=2, cex.sub=0.9);