# Forecasting financial time series with machine learning models and Twitter data

A talk for the Grup d'estudi de Machine Learning de Barcelona
Date: 18 Sept. 2014, 18:00-20:00

Place: FIB (Facultat Informàtica de Barcelona, UPC), Campus Nord, Edif. B6, Sala de Actos

Here are the slides for this talk

Below you will find the R code for the simulations done during this talk:

R Example I: A visual and statistical test of whether a stock's returns follow a normal distribution. The necessary data is in AAPL.rds

```#######  @ Argimiro Arratia, 2014,   NNET and SVM modeling
###### http://computationalfinance.lsi.upc.edu

## Set the working directory to the folder containing AAPL.rds
wdir <- "~/the path to the data"
setwd(wdir)

######## Visual test of normality #################
library(quantmod)
## FIX: the original referenced `appl` without ever loading it;
## the Apple price series is stored in AAPL.rds (see text above).
appl <- readRDS("AAPL.rds")
apRd <- periodReturn(appl, period = "daily")
dsd <- density(apRd)  # kernel estimate of the daily return density
## FIX: "\$" was a markdown-escaping artifact; R uses a plain "$"
yl <- c(min(dsd$y), max(dsd$y))  # y-axis limits from the estimated density
hist(apRd, probability = TRUE, xlab = "APPLE returns", main = NULL, ylim = yl)
lines(dsd)
## Overlay the normal density with the mean and sd of apRd:
## if returns were normal, the red curve would match the histogram.
a <- seq(min(apRd), max(apRd), 0.001)
lines(a, dnorm(a, mean(apRd), sd(apRd)), col = "red")

## Repeat the above with period = "weekly", "monthly".
## Run a Shapiro-Wilk normality test.
## FIX: shapiroTest() belongs to the fBasics package, which is not loaded;
## stats::shapiro.test() is the base-R equivalent.
shapiro.test(as.numeric(apRd))
##############################################

```

R Example II: To forecast the monthly and yearly returns of the S&P 500 with a neural network and a support vector machine model, using lags 1, 2, 3 and 5 as features. The necessary data is in sp500m.rds, and the period considered ranges from 1900 to 2012.

```#######  @ Argimiro Arratia, 2014,   NNET and SVM modeling
###### http://computationalfinance.lsi.upc.edu

## Set the working directory to the folder containing sp500m.rds
wdir <- "~/the path to the data"
setwd(wdir)

######## Nonlinear models #############################
####### SVM and neural networks ############
library(e1071)    # for svm()
library(nnet)
library(kernlab)
library(quantmod)
library(caret)    # for some data-handling functions
library(Metrics)  # measures of prediction error: mse, mae
library(xts)

## Data: sp500m, the S&P 500 monthly readings from Jan. 1900 to Jan. 2012
## (the 1990 in the original comment was a typo: the plot below uses the
## 1910-1990 window and the text states the sample starts in 1900).
## FIX: the original never loaded the data; read it from sp500m.rds.
sp500m <- readRDS("sp500m.rds")
plot(sp500m['1910/1990'])

tau <- 1  # data is monthly: tau = 1 (monthly returns), tau = 12 (yearly)
## FIX: `diff = tau` partially matched the `differences` argument of
## diff.xts (tau-th difference); tau-period log returns need `lag = tau`.
ret <- diff(log(sp500m), lag = tau)

## Model inputs:
## define the matrix of features (each column is a feature).
## Features: lags 1, 2, 3 and 5 of the return series.
feat <- merge(na.trim(lag(ret, 1)), na.trim(lag(ret, 2)),
              na.trim(lag(ret, 3)), na.trim(lag(ret, 5)),
              all = FALSE)

## Add TARGET: the (unlagged) return we want to predict
dataset <- merge(feat, ret, all = FALSE)
colnames(dataset) <- c("lag.1", "lag.2", "lag.3", "lag.5", "TARGET")

## Divide the data into training (75%) and testing (25%) with caret.
## NOTE(review): createDataPartition samples rows at random, so the
## train/test split ignores time order — acceptable for this demo.
index <- seq_len(nrow(dataset))
trainindex <- createDataPartition(index, p = 0.75, list = FALSE)
## process the two sets as plain data frames
training <- as.data.frame(dataset[trainindex, ])
rownames(training) <- NULL
testing <- as.data.frame(dataset[-trainindex, ])
rownames(testing) <- NULL

## Train models
##############################################
## OPTION LAZY: one svm and one nnet built without tuning (or tune by hand)
## parameters that can be tuned:
# type <- "C"  ## classification
type <- "eps-regression"  ## regression
u <- -2                   ## try -3, -2, -1, 0, 1, 2, 3
gam <- 10^u
w <- 4.5                  ## try 1.5, -1, 0.5, 2, 3, 4
cost <- 10^w
## Higher cost produces fewer support vectors and increases accuracy,
## but we may overfit.
svmFit <- svm(training[, -ncol(training)], training[, ncol(training)],
              type = type,
              gamma = gam,
              cost = cost)
summary(svmFit)
## build the SVM predictions on the test set
predsvm <- predict(svmFit, testing[, -ncol(testing)])

## A nnet with `size` hidden units plus skip-layer connections.
## Max iterations 10^4, weight decay 10^-2, linear output for regression.
size <- 6
nnetFit <- nnet(training[, -ncol(training)], training[, ncol(training)],
                size = size, skip = TRUE, maxit = 10^4,
                decay = 10^-2, trace = FALSE, linout = TRUE)
summary(nnetFit)  # gives a description with the fitted weights

## build the NNET predictions, type = "raw"
prednet <- predict(nnetFit, testing[, -ncol(testing)], type = "raw")
################ end of Option Lazy ##############################

### EVALUATION
actualTS <- testing[, ncol(testing)]  # the true series to predict
## Choose ONE predictor to evaluate (comment out the other line):
predicTS <- predsvm
predicTS <- prednet

## 1. Evaluation for return prediction: residual sum of squares
ssr <- sum((actualTS - predicTS)^2); ssr
## Normalized residual mean square error (NRMSE)
nrmse <- sqrt(ssr / ((length(actualTS) - 1) * var(actualTS))); nrmse
## percentage of outperforming the direct sample mean (sample expected value)
pcorrect <- (1 - nrmse) * 100; pcorrect
## visual comparison of actual vs. predicted returns
yl <- c(min(actualTS, predicTS), max(actualTS, predicTS))  # y-axis limits
plot(actualTS, predicTS, ylim = yl)

```