############# Time Series: SES, HW ES, Forecasting & Decision Trees (Code #18 - Lecture 25) #############

Car_da <- read.csv("https://www.bauer.uh.edu/rsusmel/4397/TOTALNSA.csv", header=TRUE, sep=",")
names(Car_da)
x_car <- Car_da$TOTALNSA

### Stabilizing Transformations
library(tseries)
library(forecast)     # used below for forecast() and dm.test()
ts_car <- ts(x_car, start=c(1976,1), frequency=12)
plot.ts(ts_car, xlab="Time", ylab="Sales", main="Total U.S. Vehicle Sales")
mean(x_car)
sd(x_car)

## Log Transformation
l_car <- log(ts_car)
plot.ts(l_car, xlab="Time", ylab="Log Sales", main="Log Total U.S. Vehicle Sales")
mean(l_car)
sd(l_car)

### SES Log Car Sales
mod_car <- HoltWinters(l_car, beta=FALSE, gamma=FALSE)
mod_car

### SES Log Dividend Changes
Sh_da <- read.csv("https://www.bauer.uh.edu/rsusmel/4397/Shiller_data.csv", header=TRUE, sep=",")
names(Sh_da)
x_P <- Sh_da$P
x_D <- Sh_da$D
x_i <- Sh_da$Long_i
T <- length(x_P)
lr_p <- log(x_P[-1]/x_P[-T])     # log price changes
lr_d <- log(x_D[-1]/x_D[-T])     # log dividend changes

mod1 <- HoltWinters(lr_d[1:1750], beta=FALSE, gamma=FALSE)
mod1
plot(mod1$fitted, main="Log Dividend Changes: SES")

## One-step-ahead SES forecasts by hand (and forecast errors)
T_last <- nrow(mod1$fitted)      # number of in-sample forecasts
h <- 25                          # forecast horizon
ses_f <- matrix(0, h, 1)         # vector to collect forecasts
alpha <- 0.29                    # smoothing parameter (close to the SES estimate in mod1)
y <- lr_d
T <- length(lr_d)
sm <- matrix(0, T, 1)
T1 <- T - h + 1                  # start of forecasts
a <- T1                          # index for while loop
sm[a-1] <- mod1$fitted[T_last, 1]   # last in-sample forecast (xhat column)
while (a <= T) {
  sm[a] <- alpha * y[a-1] + (1-alpha) * sm[a-1]
  a <- a + 1
}
ses_f <- sm[T1:T]
ses_f
f_error_ses <- sm[T1:T] - y[T1:T]   # forecast errors
MSE_ses <- sum(f_error_ses^2)/h     # MSE
MSE_ses
plot(ses_f, type="l", main="SES Forecasts: Changes in Dividends")

forecast(mod1, h=25, level=.95)     # uses the forecast package loaded above

### Holt-Winters
components_car <- decompose(l_car)
plot(components_car)

HW_car <- HoltWinters(l_car, seasonal="additive")
HW_car

# Custom Holt-Winters fitting (specifying alpha, beta & gamma)
HW_car_2 <- HoltWinters(l_car, alpha=0.2, beta=0.1, gamma=0.1)
HW_car_2

# Visual evaluation of fit
plot(l_car, main="Log Car Sales", ylab="Log Car Sales")
lines(HW_car$fitted[,1], lty=2, col="blue")
lines(HW_car_2$fitted[,1], lty=2, col="red")

# Forecasting
HW_car.pred <- predict(HW_car, 24, prediction.interval=TRUE, level=0.95)
forecast(HW_car, h=24, level=.95)

# Visual evaluation of fit and forecasts
plot(l_car, main="Forecasting Log Car Sales", ylab="Log Car Sales")
lines(HW_car$fitted[,1], lty=2, col="blue")      # history of series
lines(HW_car.pred[,1], col="red")                # predictions
lines(HW_car.pred[,2], lty=2, col="orange")      # upper 95% C.I. band
lines(HW_car.pred[,3], lty=2, col="orange")      # lower 95% C.I. band

# In-sample MSE (SSE of one-step-ahead errors divided by the number of fitted values)
HW_car_mse <- HW_car$SSE / nrow(HW_car$fitted)
HW_car_mse

# Multiplicative Model and Forecast
HW_car_m <- HoltWinters(l_car, seasonal="multiplicative")
HW_car_m
HW_car_m.pred <- predict(HW_car_m, 24, prediction.interval=TRUE, level=0.95)
forecast(HW_car_m, h=24, level=.95)

# Visually evaluate the prediction
plot(l_car, main="Forecasting Log Car Sales", ylab="Log Car Sales")
lines(HW_car_m$fitted[,1], lty=2, col="blue")    # history of series
lines(HW_car_m.pred[,1], col="red")              # predictions
lines(HW_car_m.pred[,2], lty=2, col="orange")    # upper 95% C.I. band
lines(HW_car_m.pred[,3], lty=2, col="orange")    # lower 95% C.I. band
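# Optional: the HoltWinters fit stores the smoothed components directly. For the
# additive model, HW_car$fitted is a multivariate time series with columns xhat
# (one-step fit), level, trend and season. A minimal sketch to inspect them
# (nothing here beyond the object already created above):
colnames(HW_car$fitted)
plot(HW_car$fitted, main="HW Additive Model: Fitted Components")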
# In-sample MSE (SSE of one-step-ahead errors divided by the number of fitted values)
HW_car_m_mse <- HW_car_m$SSE / nrow(HW_car_m$fitted)
HW_car_m_mse

# Using R package forecast
library(forecast)
HW_car_f <- forecast(HW_car, h=24, level=c(80,95))

# Visualize forecast
plot(HW_car_f, main="Forecasting Log Car Sales", ylab="Log Car Sales")
lines(HW_car_f$fitted, lty=2, col="purple")

# Evaluation of HW Model
HW_car_fe <- HW_car_f$residuals     # in-sample prediction errors
acf(HW_car_fe, lag.max=20, na.action=na.pass)
Box.test(HW_car_fe, lag=20, type="Ljung-Box")
hist(HW_car_fe)

# Testing accuracy of forecasts (very informal: dm.test is designed to compare
# two series of forecast errors; here it is applied to the two forecast paths)
dm.test(HW_car.pred[,1], HW_car_m.pred[,1], power=2)
dm.test(HW_car.pred[,1], HW_car_m.pred[,1], power=1)

###### Regression Tree: IBM
# ibm_x (IBM returns) and the Fama-French factors Mkt_RF, SMB & HML are assumed
# to be in the workspace from earlier lecture code.
ibm_data <- data.frame(ibm_x, Mkt_RF, SMB, HML)
plot(SMB, Mkt_RF, main = "Mkt_RF & SMB: Thresholds")
abline(v = 0.005, h = 0.027, lty = "dashed")

# Create training (70%) and test (30%) sets for our data
library(rsample)      # data splitting
library(rpart)        # performing regression trees
library(rpart.plot)   # plotting regression trees
library(ipred)        # bagging
library(caret)        # bagging
library(rattle)       # nice graphs
set.seed(123)         # use set.seed for reproducibility
ibm_split <- initial_split(ibm_data, prop = .7)
ibm_train <- training(ibm_split)
ibm_test <- testing(ibm_split)

### Single Decision Tree
# rpart with no complexity penalty, cp = 0
m_ibm_ft <- rpart(
  formula = ibm_x ~ .,
  data = ibm_train,
  method = "anova",
  control = list(cp = 0, xval = 10)
)
plotcp(m_ibm_ft)
abline(v = 9, lty = "dashed")

# rpart applies complexity penalty
m_ibm <- rpart(
  formula = ibm_x ~ .,
  data = ibm_train,
  method = "anova"
)
rpart.plot(m_ibm)
plotcp(m_ibm)
m_ibm
printcp(m_ibm)
pred_ibm_tree <- predict(m_ibm, newdata = ibm_test)
RMSE(pred = pred_ibm_tree, obs = ibm_test$ibm_x)

## Pruning tree
m_ibm_prune <- prune(m_ibm, cp = 0.05)
pred_ibm_prune <- predict(m_ibm_prune, newdata = ibm_test)
RMSE(pred = pred_ibm_prune, obs = ibm_test$ibm_x)

#### Changing Training data (now, with TS split)
s_size <- floor(0.70 * nrow(ibm_data))
split <- split(ibm_data, rep(1:2, each = s_size))
names(split) <- c("train_ts", "test_ts")
ibm_train_ts <- split$train_ts
ibm_test_ts <- split$test_ts

# rpart with no complexity penalty, cp = 0
m_ibm_ft_ts <- rpart(
  formula = ibm_x ~ .,
  data = ibm_train_ts,
  method = "anova",
  control = list(cp = 0, xval = 10)
)
rpart.plot(m_ibm_ft_ts)
plotcp(m_ibm_ft_ts)
abline(v = 5, lty = "dashed")

# rpart applies complexity penalty
m_ibm_ts <- rpart(
  formula = ibm_x ~ .,
  data = ibm_train_ts,
  method = "anova"
)
rpart.plot(m_ibm_ts)
plotcp(m_ibm_ts)
m_ibm_ts
plot(m_ibm_ts)
printcp(m_ibm_ts)
pred_ibm_tree_ts <- predict(m_ibm_ts, newdata = ibm_test_ts)
RMSE(pred = pred_ibm_tree_ts, obs = ibm_test_ts$ibm_x)

## Pruning tree (TS split)
m_ibm_prune_ts <- prune(m_ibm_ts, cp = 0.05)
pred_ibm_prune_ts <- predict(m_ibm_prune_ts, newdata = ibm_test_ts)
RMSE(pred = pred_ibm_prune_ts, obs = ibm_test_ts$ibm_x)

fancyRpartPlot(m_ibm_prune)
fancyRpartPlot(m_ibm_prune_ts)
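# Instead of hard-coding cp = 0.05 above, the complexity parameter can be read
# off the cross-validation results stored in the rpart object. A minimal sketch
# using m_ibm_ft fitted above (cptable, with columns "CP" and "xerror", is
# standard rpart output; m_ibm_cv_prune and pred_ibm_cv_prune are new names
# introduced here for illustration):
best_cp <- m_ibm_ft$cptable[which.min(m_ibm_ft$cptable[, "xerror"]), "CP"]
best_cp
m_ibm_cv_prune <- prune(m_ibm_ft, cp = best_cp)
pred_ibm_cv_prune <- predict(m_ibm_cv_prune, newdata = ibm_test)
RMSE(pred = pred_ibm_cv_prune, obs = ibm_test$ibm_x)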
### Bagging
library(ipred)
ntree <- 10:60
rmse <- vector(mode = "numeric", length = length(ntree))   # store OOB RMSEs
for (i in seq_along(ntree)) {
  set.seed(123)    # set seed to be able to reproduce results
  # perform bagged model (set coob = TRUE to use the OOB sample to estimate error)
  model_ibm_bag <- bagging(
    formula = ibm_x ~ .,
    data = ibm_train,
    coob = TRUE,
    nbagg = ntree[i]
  )
  rmse[i] <- model_ibm_bag$err   # get OOB error
}
model_ibm_bag
plot(ntree, rmse, type = 'l', main = "RMSE as a function of the number of trees", lwd = 2)
abline(v = 50, col = "red", lty = "dashed")
pred_bag <- predict(model_ibm_bag, newdata = ibm_test)
RMSE(pred = pred_bag, obs = ibm_test$ibm_x)

### Random Forest (RF)
library(randomForest)
rf_ibm <- randomForest(ibm_x ~ ., data = ibm_train, ntree = 100)
rf_ibm
plot(rf_ibm)    # use to set number of trees
# number of trees with lowest MSE
which.min(rf_ibm$mse)
sqrt(rf_ibm$mse[which.min(rf_ibm$mse)])   # RMSE at that number of trees
pred_rf <- predict(rf_ibm, newdata = ibm_test)
RMSE(pred = pred_rf, obs = ibm_test$ibm_x)

## Tune the number of predictors tried at each split (mtry) in RF
set.seed(223)
rf_ibm_2 <- tuneRF(
  x = ibm_train[, c("Mkt_RF", "SMB", "HML")],
  y = ibm_train$ibm_x,
  ntreeTry = 120,
  mtryStart = 2,
  stepFactor = 1.5,
  improve = 0.01,
  trace = FALSE    # do not show real-time progress
)
print(rf_ibm_2)

### Gradient boosting
library(gbm)
set.seed(123)    # for reproducibility (use 223 to get a different tree)
ibm_gbm <- gbm(
  formula = ibm_x ~ .,
  data = ibm_train,
  distribution = "gaussian",   # SSE loss function
  n.trees = 500,
  shrinkage = 0.1,
  interaction.depth = 3,
  n.minobsinnode = 10,
  cv.folds = 10
)
ibm_gbm
best <- which.min(ibm_gbm$cv.error)   # number of trees that minimizes CV error
sqrt(ibm_gbm$cv.error[best])          # get MSE and compute RMSE
pred_gbm <- predict(ibm_gbm, newdata = ibm_test, n.trees = best)
RMSE(pred = pred_gbm, obs = ibm_test$ibm_x)
gbm.perf(ibm_gbm, method = "cv")      # plots CV error against number of trees

### Compare with Regression
## using random split
ibm_reg <- lm(ibm_x ~ ., data = ibm_train)
pred_ibm_reg <- predict(ibm_reg, newdata = ibm_test)
RMSE(pred = pred_ibm_reg, obs = ibm_test$ibm_x)

## using time-series split
ibm_reg_ts <- lm(ibm_x ~ ., data = ibm_train_ts)
pred_ibm_reg_ts <- predict(ibm_reg_ts, newdata = ibm_test_ts)
RMSE(pred = pred_ibm_reg_ts, obs = ibm_test_ts$ibm_x)
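# A compact side-by-side look at the test-set RMSEs computed above (a minimal
# sketch; it only reuses the prediction objects already created, all on the
# random 70/30 split, so the numbers are directly comparable).
rmse_summary <- data.frame(
  model = c("single tree", "pruned tree", "bagging", "random forest", "gbm", "OLS regression"),
  test_RMSE = c(RMSE(pred_ibm_tree, ibm_test$ibm_x),
                RMSE(pred_ibm_prune, ibm_test$ibm_x),
                RMSE(pred_bag, ibm_test$ibm_x),
                RMSE(pred_rf, ibm_test$ibm_x),
                RMSE(pred_gbm, ibm_test$ibm_x),
                RMSE(pred_ibm_reg, ibm_test$ibm_x))
)
rmse_summary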