############# Prediction/Forecasting AND Model Selection (Code #11 - Lecture 16) #############

#### Forecasting: Fundamental Approach
# Download the quarterly US/Japan macro and FX data set (first row holds column names).
FX_da <- read.csv(
  "http://www.bauer.uh.edu/rsusmel/4397/FX_USA_JAP.csv",
  header = TRUE,
  sep = ","
)

### Steps 1 & 2: Collect data, define variables and transform data.

# US series (one column of FX_da each)
us_I <- FX_da$US_INF      # US inflation (I_US)
us_mg <- FX_da$US_M1_c    # US money growth (m_US)
us_i <- FX_da$US_I3M      # US 3-month interest rate (i_US)
us_y <- FX_da$US_GDP_g    # US GDP growth (y_US)

# Japanese series (same layout as the US block above)
jp_I <- FX_da$JAP_INF     # Japan inflation (I_JP)
jp_mg <- FX_da$JAP_MI_c   # Japan money growth (m_JP)
jp_i <- FX_da$JAP_I3M     # Japan 3-month interest rate (i_JP)
jp_y <- FX_da$JAP_GDP_g   # Japan GDP growth (y_JP)

# Dependent variable
e_f <- FX_da$JPY.USD_c    # Changes in JPY/USD (e)

## Plot FX Rate Changes (Full Sample)
# Wrap e_f as a quarterly series (frequency = 4) whose first observation is 1978:Q2.
x.ts <- ts(e_f, start = c(1978, 2), frequency = 4)
plot.ts(
  x.ts,
  main = "USD/JAP: 1978-2023",
  ylab = "USD/JAP changes",
  col = "blue"
)

## Define regressors (US minus Japan differentials) and combine them in xx.
## xx is a T x 2 matrix with columns int_dif and y_dif.
int_dif <- us_i - jp_i     # interest rate differential
y_dif <- us_y - jp_y       # GDP growth differential
mg_dif <- us_mg - jp_mg    # money growth differential (computed, but not placed in xx)
xx <- cbind(int_dif, y_dif)

## Define Estimation Period with T_est & adjust sample size for variables (e_f1 & xx_1)
T_est <- 161                      # Final observation of the estimation period (161 = 2018:2)
T <- length(e_f)                  # Full sample size
e_f1 <- e_f[seq_len(T_est)]       # Dependent variable, observations 1..T_est
xx_1 <- xx[seq_len(T_est), ]      # Regressors, rows 1..T_est


## Plot FX Rate Changes (Estimation Sample)
# Same quarterly time axis as the full-sample plot, restricted to observations 1..T_est.
x.ts <- ts(e_f1, start = c(1978, 2), frequency = 4)
plot.ts(
  x.ts,
  main = "USD/JPY (Est Period): 1978-2018",
  ylab = "USD/JPY changes",
  col = "blue"
)


### Step 3. Estimate Model (only with estimation period, 1 to T_est)
# OLS of e_f1 on an intercept plus the two columns of xx_1 (int_dif, y_dif).
fit_ef <- lm(formula = e_f1 ~ xx_1)
summary(fit_ef)

### Step 4. Generate forecasts 

## 4.1 Model forecast driving variables (X's) using info from estimation period only
#
# Each driver is modelled as an AR(1): x_t = a + b * x_{t-1} + u_t, fitted on
# observations 1..T_est. One observation is lost when building the lag.
#
# Note on indexing: `:` binds tighter than binary `-` in R, so `1:T_est-1` means
# (1:T_est) - 1 = 0:(T_est - 1). That only selects the intended rows because R
# silently drops a zero index. The range is written out explicitly below; the
# selected observations (1..T_est-1) are the same.

# AR(1) Model for int_dif
int_dif_lag1 <- int_dif[seq_len(T_est - 1)]     # Regressor: (iUS - iJAP) at t-1, obs 1..T_est-1
int_dif_lag0 <- int_dif[2:T_est]                # Dependent: (iUS - iJAP) at t,   obs 2..T_est
fit_int <- lm(int_dif_lag0 ~ int_dif_lag1)      # Fit AR(1) model
summary(fit_int)

# AR(1) Model for y_dif
y_dif_lag1 <- y_dif[seq_len(T_est - 1)]         # Regressor: (yUS - yJAP) at t-1, obs 1..T_est-1
y_dif_lag0 <- y_dif[2:T_est]                    # Dependent: (yUS - yJAP) at t,   obs 2..T_est
fit_y <- lm(y_dif_lag0 ~ y_dif_lag1)            # Fit AR(1) model
summary(fit_y)


# AR(1) model one-step-ahead forecasts (starting at T_val = T_est + 1) for driving variables
#
# The AR(1) forecast of x_t made at t-1 is a + b * x_{t-1}. The forecasts for the
# validation periods T_val..T must therefore be built from the values observed at
# (T_val - 1)..(T - 1). Using rows T_val:T would feed each forecast the very value
# it is meant to predict (look-ahead) and leave the forecasts one period out of
# line with e_f[T_val:T]. This also matches the out-of-sample step, where the
# forecast for T+1 is built from the value at T.
T_val <- T_est + 1                              # Start of validation period
xx_cons <- rep(1, T - T_val + 1)                # Constant column, one entry per validation period
int_dif_0 <- cbind(xx_cons, xx[(T_val - 1):(T - 1), 1]) %*% coef(fit_int)  # Forecasts of (iUS - iJAP) for T_val..T
y_dif_0 <- cbind(xx_cons, xx[(T_val - 1):(T - 1), 2]) %*% coef(fit_y)      # Forecasts of (yUS - yJAP) for T_val..T

## NOTE: Huge outlier in observation mg_dif_0[11] due to a big jump in monetary policy due to Covid-19
# (mg_dif_0 is not created in the code shown here; this note only applies if
#  money-growth forecasts are added to the model.)
# In that case the outlier can be replaced with the average of its neighbours:
# mg_dif_0[11] <- (mg_dif_0[10]+mg_dif_0[12])/2

# Model one-step-ahead forecasts (starting at T_val = T_est + 1) for e_f (& MSE)
# Plug the forecasted drivers into the estimated FX equation:
# [1, int_dif_0, y_dif_0] %*% (intercept, slope on int_dif, slope on y_dif).
e_Mod_0 <- cbind(xx_cons, int_dif_0, y_dif_0) %*% coef(fit_ef)   # Model's forecasts for T_val..T
f_e_Mod <- e_f[T_val:T] - e_Mod_0                                # Model's forecast errors
mse_e_f <- sum(f_e_Mod^2) / length(f_e_Mod)                      # Model's MSE (average squared error)
mse_e_f


## RW one-step forecast for e_f (& MSE)
# Under a random walk for the exchange rate the forecasted change is zero in every period.
e_f_RW_0 <- numeric(T - T_val + 1)             # RW forecast = 0 for each validation period
f_e_RW <- e_f[T_val:T] - e_f_RW_0              # RW's forecast errors
mse_e_RW <- sum(f_e_RW^2) / length(f_e_RW)     # RW's MSE (average squared error)
mse_e_RW

### Step 5. Evaluation of Forecasts
# Regress the sum of the two forecast-error series on their difference.
# NOTE(review): the *_mgn names suggest this is the Morgan-Granger-Newbold test,
# where the t-test on the slope checks for equal forecast accuracy -- confirm.
x_mgn <- f_e_Mod - f_e_RW      # difference of model and RW forecast errors
z_mgn <- f_e_Mod + f_e_RW      # sum of model and RW forecast errors
fit_mgn <- lm(formula = z_mgn ~ x_mgn)
summary(fit_mgn)


### Step 6. Out-of-sample Forecasts
# Out-of-sample forecast for int_dif: AR(1) fit applied to the last observation, int_dif[T]
int_dif_p1 <- cbind(1, int_dif[T]) %*% coef(fit_int)   # E_t=2023:II[(iUS - iJAP)_t+1=2023:III]
int_dif_p1

# Out-of-sample forecast for y_dif: AR(1) fit applied to the last observation, y_dif[T]
y_dif_p1 <- cbind(1, y_dif[T]) %*% coef(fit_y)         # E_t=2023:II[(yUS - yJAP)_t+1=2023:III]
y_dif_p1

S <- 0.007                                                    # Today's value of S_t=2023:II
e_f_p1 <- cbind(1, int_dif_p1, y_dif_p1) %*% coef(fit_ef)     # Today's forecast for e_t+1=2023:III
e_f_p1
S_p1 <- S * (1 + e_f_p1 / 100)                                # Model's forecast for S_t+1=2023:III
S_p1                                                          # (e_f_p1 is divided by 100, i.e. treated as a percentage)


## Use the one-step-ahead forecasts to generate two-step-ahead forecasts.
## => Forecast E_t=2023:II[S_t+2=2023:IV] (= S_p2 below)
# The AR(1) fits are iterated forward: the t+1 forecasts of the drivers take the
# place of the (not yet observed) t+1 values.
S1 <- S_p1                                                    # Today's forecast of S_t+1=2023:III (from Step 6)
int_dif_p2 <- cbind(1, int_dif_p1) %*% coef(fit_int)          # Today's forecast for (iUS - iJP)_t+2
y_dif_p2 <- cbind(1, y_dif_p1) %*% coef(fit_y)                # Today's forecast for (yUS - yJP)_t+2
e_f_p2 <- cbind(1, int_dif_p2, y_dif_p2) %*% coef(fit_ef)     # Today's forecast for e_t+2=2023:IV
e_f_p2

S_p2 <- S1 * (1 + e_f_p2 / 100)                               # Today's forecast for S_t+2=2023:IV
S_p2



##### Model Selection (Specific to General)

### Specific (3-factor FF Model)

# Download the stock / factor data set (first row holds column names).
SFX_da <- read.csv(
  "http://www.bauer.uh.edu/rsusmel/4397/Stocks_FX_1973.csv",
  header = TRUE,
  sep = ","
)

# Raw columns: IBM series plus the factor and risk-free columns
x_ibm <- SFX_da$IBM
x_Mkt_RF <- SFX_da$Mkt_RF
x_SMB <- SFX_da$SMB
x_HML <- SFX_da$HML
x_RMW <- SFX_da$RMW
x_CMA <- SFX_da$CMA
x_RF <- SFX_da$RF

# Log returns use observations 2..T over 1..T-1, so one observation is lost.
# The factor columns drop their first observation to stay aligned, and are
# divided by 100.
T <- length(x_ibm)
lr_ibm <- log(x_ibm[-1] / x_ibm[-T])   # IBM log returns
Mkt_RF <- x_Mkt_RF[-1] / 100
SMB <- x_SMB[-1] / 100
HML <- x_HML[-1] / 100
RMW <- x_RMW[-1] / 100
CMA <- x_CMA[-1] / 100
RF <- x_RF[-1] / 100
ibm_x <- lr_ibm - RF                   # IBM excess returns

## Fit Specific Model (3-factor FF Model)
# IBM excess returns on an intercept and the three factors Mkt_RF, SMB and HML.
fit_ibm_ff3 <- lm(formula = ibm_x ~ Mkt_RF + SMB + HML)
summary(fit_ibm_ff3)

## Add to Specific Model (January Effect)

# January dummy: a 12-element pattern (1 followed by eleven 0s) repeated enough
# times to cover the sample. The number of repetitions is an explicit integer,
# floor(T / 12) + 1, rather than the fractional `round(T)/12 + 1`, which rep()
# would silently truncate; the resulting vector is the same.
# NOTE(review): a leading 1 assumes the first observation of the raw series is
# a January -- confirm against the data file.
Jan <- rep(c(1, rep(0, 11)), times = floor(T / 12) + 1)   # Create January dummy
T <- length(ibm_x)                                        # Sample size of the return series
T2 <- T + 1
Jan_1 <- Jan[2:T2]                                        # Drop first element; T values aligned with ibm_x

fit_ibm_ff3_new <- lm(ibm_x ~ Mkt_RF + SMB + HML + Jan_1)
summary(fit_ibm_ff3_new)

## Testing Fit (Diagnostic Testing)
# RESET test from package lmtest, using powers of the fitted values
# (type = "fitted"), to check the functional form (non-linearities).
library(lmtest)
resettest(fit_ibm_ff3_new, type = "fitted")

## Add More Variables (CMA, RMW) to Specific Model (January Effect)
# Three factors + January dummy, extended with the CMA and RMW factors.
fit_ibm_ff5_Jan <- lm(
  formula = ibm_x ~ Mkt_RF + SMB + HML + Jan_1 + CMA + RMW
)
summary(fit_ibm_ff5_Jan)

## Testing Fit (Diagnostic Testing)
# Same RESET check (powers of fitted values) on the extended model.
library(lmtest)
resettest(fit_ibm_ff5_Jan, type = "fitted")


### Specific: Stepwise regression (with 5-factor FF Model) with package olsrr

## Fit Specific Model (5-factor FF Model)

library(olsrr)

# Candidate regressors go into one data frame (k = 5); `ibm_x ~ .` then uses all of them.
ff_step_data <- data.frame(Mkt_RF, SMB, HML, RMW, CMA)
ibm_ff_model <- lm(ibm_x ~ ., data = ff_step_data)

# Forward selection by p-value; the default entry p-value (penter) is 0.3.
# NOTE(review): newer olsrr releases may use a different name for this
# threshold argument -- confirm against the installed version.
ols_step_forward_p(ibm_ff_model, details = TRUE)
plot(ols_step_forward_p(ibm_ff_model))                           # plots R2, Adj-R2, AIC, BIC, Cp

# Same search with a stricter entry p-value of 0.1.
ols_step_forward_p(ibm_ff_model, penter = 0.1, details = TRUE)


##### Model Selection (General to Specific, GETS)

### GUM with Seasonal January Dummy and Structural Break in Dec 2001 (dot.com)

# Break dummy: 0 for the first t_sb observations, 1 for the remaining T - t_sb.
# It is built as a one-column matrix by stacking a block of 0s on a block of 1s.
t_sb <- 342                                   # Structural break date (end of 1st regime)
T_s_1 <- T - t_sb                             # Number of observations in the 2nd regime
d_0 <- matrix(0, nrow = t_sb, ncol = 1)
d_1 <- matrix(1, nrow = T_s_1, ncol = 1)
Dot_com <- rbind(d_0, d_1)
length(Dot_com)                               # Printed as a check on the dummy's length

# Squared factors
Mkt_RF2 <- Mkt_RF^2
SMB2 <- SMB^2
HML2 <- HML^2

# Pairwise factor interactions
Mkt_SMB <- Mkt_RF * SMB
Mkt_HML <- Mkt_RF * HML
SMB_HML <- SMB * HML

# Factor x January-dummy interactions
Mkt_Jan <- Mkt_RF * Jan_1
SMB_Jan <- SMB * Jan_1
HML_Jan <- HML * Jan_1

# Factor x break-dummy interactions
Mkt_Dot <- Mkt_RF * Dot_com
SMB_Dot <- SMB * Dot_com
HML_Dot <- HML * Dot_com

## Fit GUM
# General unrestricted model: factors, January dummy, squares, pairwise
# interactions, and January / break interactions (16 regressors + intercept).
fit_gum <- lm(
  ibm_x ~ Mkt_RF + SMB + HML + Jan_1 +
    Mkt_RF2 + SMB2 + HML2 +
    Mkt_HML + Mkt_SMB + SMB_HML +
    Mkt_Jan + HML_Jan + SMB_Jan +
    Mkt_Dot + HML_Dot + SMB_Dot
)
summary(fit_gum)

## Check specification with RESET Test
resettest(fit_gum, type = "fitted")     # Check functional form (non-linearities)

## Reduce model by keeping variables with t-stats greater than 1.645 or practical experience says to keep them.
# Reduced model: 8 of the GUM's 16 regressors are retained.
fit_gum_r <- lm(
  ibm_x ~ Mkt_RF + SMB + HML + Jan_1 +
    HML2 + SMB_Jan + Mkt_Dot + HML_Dot
)
summary(fit_gum_r)

## F-test of reduced model against GUM
#
# F = [(RSS_r - RSS_u) / J] / [RSS_u / (T - k_u)], where J is the number of
# restrictions and T - k_u is the residual degrees of freedom of the GUM.
# Both are read from the fitted models instead of being typed in by hand:
# the GUM has 16 regressors plus an intercept (k_u = 17) and the reduced model
# drops 8 of them (J = 8), so hard-coded values of 9 and T - 16 do not match
# the two specifications being compared.
e_u <- fit_gum$residuals                                # GUM residuals
RSS_u <- t(e_u) %*% e_u
e_r <- fit_gum_r$residuals                              # Restricted GUM residuals
RSS_r <- t(e_r) %*% e_r
df_u_gum <- df.residual(fit_gum)                        # Denominator df: T - k_u
J_gum <- df.residual(fit_gum_r) - df_u_gum              # Numerator df: number of restrictions
f_test_gum <- ((RSS_r - RSS_u) / J_gum) / (RSS_u / df_u_gum)   # F-test
f_test_gum

qf(.95, df1 = J_gum, df2 = df_u_gum)                    # 95% quantile value of F-test
p_val <- 1 - pf(f_test_gum, df1 = J_gum, df2 = df_u_gum)       # p-value of F-test
p_val


## GETS with package olsrr
library(olsrr)

## We need to put all the explanatory variables in a data frame.
## This set is the GUM's 16 regressors plus the break dummy Dot_com itself.
ff_step_data <- data.frame(
  Mkt_RF, SMB, HML, Jan_1, Dot_com,
  Mkt_RF2, HML2, SMB2,
  Mkt_HML, Mkt_SMB, SMB_HML,
  Mkt_Jan, SMB_Jan, HML_Jan,
  Mkt_Dot, HML_Dot, SMB_Dot
)

## Fit GUM with lm (`.` = every column of ff_step_data)
ibm_ff_model <- lm(ibm_x ~ ., data = ff_step_data)

## Use GETS (backward search, in this package). We can specify the p-value (prem) to eliminate variables.
# NOTE(review): newer olsrr releases may use a different name for this
# threshold argument -- confirm against the installed version.
ols_step_backward_p(ibm_ff_model, prem = .05, details = TRUE)   # long final output & prem = p-value


### BEST SUBSET with package olsrr
# Ten candidate regressors: five factors, the two dummies and three squared factors.
ff_step_data_1 <- data.frame(
  Mkt_RF, SMB, HML, RMW, CMA,
  Jan_1, Dot_com,
  Mkt_RF2, HML2, SMB2
)
ibm_ff_model_1 <- lm(ibm_x ~ ., data = ff_step_data_1)   # model with all ten candidates
plot(ibm_ff_model_1)                                     # plot() on an lm fit draws its diagnostic plots

# Best-subset search over the candidates in ibm_ff_model_1.
# NOTE(review): metric = "adjr" presumably selects by adjusted R^2 -- confirm in the olsrr docs.
ibm_fit_best_subset <- ols_step_best_subset(ibm_ff_model_1, metric = "adjr")
ibm_fit_best_subset                                      # print result

# S3 plot for ols_step_best_subset
plot(ibm_fit_best_subset)


## Full Model will take a while to run
# Best-subset search over every column currently in ff_step_data.
ibm_ff_model_full <- lm(ibm_x ~ ., data = ff_step_data)
ibm_fit_best_subset_full <- ols_step_best_subset(
  ibm_ff_model_full,
  metric = "adjr"
)
ibm_fit_best_subset_full                                 # print results

# S3 plot for ols_step_best_subset
plot(ibm_fit_best_subset_full)