########### PRACTICE FINAL EXAM #############
Fin_da <- read.csv("https://www.bauer.uh.edu/rsusmel/4397/Fin.csv", header = TRUE, sep = ",")
names(Fin_da)
x_date <- Fin_da$Date
x_cat <- Fin_da$CAT
x_Mkt_RF <- Fin_da$Mkt_RF
x_SMB <- Fin_da$SMB
x_HML <- Fin_da$HML
x_CMA <- Fin_da$CMA
x_RMW <- Fin_da$RMW
x_RF <- Fin_da$RF
x_Div <- Fin_da$Dividends
x_Earn <- Fin_da$Earnings

T <- length(x_cat)
lr_cat <- log(x_cat[-1]/x_cat[-T])    # log returns for CAT
lr_earn <- log(x_Earn[-1]/x_Earn[-T]) # log changes for Earnings
Mkt_RF <- x_Mkt_RF[-1]/100
SMB <- x_SMB[-1]/100
HML <- x_HML[-1]/100
CMA <- x_CMA[-1]/100
RMW <- x_RMW[-1]/100
RF <- x_RF[-1]/100
cat_x <- lr_cat - RF # CAT excess returns
T_new <- length(cat_x)

###### QUESTION 1 ######

## 1.a - Report regression
fit_cat_ff4 <- lm(cat_x ~ Mkt_RF + SMB + HML + RMW)
summary(fit_cat_ff4)

## 1.b - Interpret Beta_1 and Beta_2 coefficients

## 1.c - Drivers of CAT
# Mkt and HML are very significant (very low p-values). SMB and RMW are significant at the 10% level.

## 1.d - Interpret R^2 and the F-test of goodness of fit
# The variability of the FF factors explains 40.24% of the variability of CAT excess returns.
# The F-statistic shows a low p-value. It rejects H0 of no joint significance for the FF factors.

## 1.e - Test if Beta_1 = 1
out_ff4 <- summary(fit_cat_ff4)
b_ff4 <- fit_cat_ff4$coefficients
b_ff4_1 <- out_ff4$coefficients[, 1]   # OLS coefficients
b_ff4_sd1 <- out_ff4$coefficients[, 2] # OLS standard errors
t_beta_1 <- (b_ff4[2] - 1)/b_ff4_sd1[2]
t_beta_1
# |t_beta_1| < 1.96 => At the 5% level, we cannot reject H0: Beta_1 = 1.

## 1.f - Test if beta_SMB = beta_HML = 0.5
library(car)
linearHypothesis(fit_cat_ff4, c("SMB = 0.5", "HML = 0.5"), test = "F") # "F": exact test
# The p-value of the F-stat is close to 5%. Marginal decision. Strictly speaking, we cannot reject H0.

## 1.g - QLR test: Structural break - estimating the break date (with library desk)
library(desk)
e_cat <- fit_cat_ff4$residuals
T <- length(e_cat)
pie <- .15 # trimming fraction
T0 <- round(T * pie)
T1 <- round(T * (1 - pie))
my.qlr <- qlr.test(fit_cat_ff4, from = T0, to = T1, sig.level = 0.05, details = TRUE)
plot(my.qlr, col = "red", main = "QLR Test: 4-FF Factor Model") # plot test results
max(my.qlr$f.stats)
my.qlr$breakpoint              # extract breakpoint observation
Fin_da$Date[my.qlr$breakpoint] # print date
qlr.cv(T, L = 2, sig.level = 0.05)
# The QLR F-value is 5.08668. It is low compared to the critical value => We cannot reject H0.
# No evidence of a structural break.

###### QUESTION 2 ######

## 2.a - LM test for Mkt_RF^2 and SMB^2 as drivers of heteroscedasticity
Mkt_RF2 <- Mkt_RF^2
SMB2 <- SMB^2
resid_r <- fit_cat_ff4$residuals # extract residuals from the FF4 model
fit_lm <- lm(resid_r ~ Mkt_RF + SMB + HML + RMW + Mkt_RF2 + SMB2) # auxiliary regression
summary(fit_lm)
R2_r <- summary(fit_lm)$r.squared # extract R^2 from fit_lm
R2_r
LM_test <- R2_r * T_new
LM_test
qchisq(.95, df = 2) # chi-squared (df = 2) critical value at the 5% level
p_val <- 1 - pchisq(LM_test, df = 2) # p-value of LM_test
p_val
# The p-value of the LM test is 0.20 => We cannot reject H0. That is, no evidence that
# Mkt_RF^2 & SMB^2 are drivers of heteroscedasticity.

## 2.b - GQ and BP tests for heteroscedasticity
library(lmtest)
gqtest(fit_cat_ff4)
bptest(fit_cat_ff4)
# Big p-value for the GQ test, but a low p-value for the BP test => Evidence of heteroscedasticity.

## 2.c - DW test for first-order autocorrelation
dwtest(fit_cat_ff4)
# The p-value of the DW test is 0.1038 => Cannot reject H0. That is, no evidence of
# first-order autocorrelation in the model errors.

## 2.d - BG test for autocorrelation
bgtest(fit_cat_ff4, order = 4)
bgtest(fit_cat_ff4, order = 12)
# Big p-values for both tests. No evidence of 4th- or 12th-order autocorrelation.
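# A minimal sketch (not part of the original solution): given the BP-test evidence of
# heteroscedasticity in 2.b, the OLS t-stats can be re-checked with White
# heteroscedasticity-consistent (HC) standard errors from the sandwich package.
library(sandwich)
library(lmtest)
coeftest(fit_cat_ff4, vcov = vcovHC(fit_cat_ff4, type = "HC0")) # White SEs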
## 2.e #### Q & LB Autocorrelation Tests & Q* Automatic Lag Selection Test y <- e_cat Box.test(y, lag = 4, type="Box-Pierce") Box.test(y, lag = 12, type="Box-Pierce") Box.test(y, lag = 4, type="Ljung-Box") Box.test(y, lag = 12, type="Ljung-Box") library(vrtest) Auto.Q(y, 12) # Q* test automatic selection of p # No change for Mkt and HML; but, once, we take into account autocorrelation and heteroscedasticity SMB and RMW drop their significance. RMW not longer signficant at 10% level. ## 2.f NW SE # At 5% level, we have evidence of autocorrelation. Then, use NW SE. library(sandwich) Var_NW <- NeweyWest(fit_cat_ff4, lag = 12) SE_NW <- sqrt(diag(Var_NW)) t_b_NW <- b_ff4/SE_NW t_b_NW # Once, we take into account autocorrelation and heteroscedasticity SMB is no longer significant at the 5% level (but, significant at 10% level). ###### QUESTION 3 ###### x_date[433] ## 3.a - Estimation Period T0 <- 1 T1 <- 432 # End of Estimation Period (2016.12) T2 <- T1+1 # Start of Validation Period (2017.1) y <- cat_x xx_ff4 <- cbind(Mkt_RF, SMB, HML, RMW) y1 <- y[T0:T1] T2 <- T1+1 # Start of Validation Period (2017.1) x1 <- xx_ff4[T0:T1,] fit_cat_est <- lm(y1 ~ x1) # Estimation Period Regression (1971.2 - 2017.4) b_est <- fit_cat_est$coefficients # Extract OLS coefficients from regression summary(fit_cat_est) ## 3.b - Forecasting and MSE Under RW Assumption For Independent Variables xx_cons <- rep(1,T_new-T2+1) k_for <- length(xx_cons) # Create a constant for Validation forecasts y_mod_f0 <- cbind(xx_cons,xx_ff4[T1:(T_new-1),]) %*% b_est # Validation period data e_mod_f0 <- y[T2:T_new] - y_mod_f0 # Forecasat error mse_e_f0 <- sum(e_mod_f0^2)/k_for # Model's MSE mse_e_f0 ## 3.c - Forecasting and MSE Under RW Model e_rw_f0 <- y[T2:T_new] - y[T1:(T_new - 1)] # RW Forecasat error mse_rw_f0 <- sum(e_rw_f0^2)/k_for # RW's MSE mse_rw_f0 ## 3.d - Testing Equality of MSE: Mod vs RW z_mgn <- e_rw_f0 + e_mod_f0 # RW has higher MSE => becomes e(1) x_mgn <- e_rw_f0 - e_mod_f0 fit_mgn <- lm(z_mgn ~ x_mgn) summary(fit_mgn) # Check t-stat # Significant slope coefficient in MGN regression => Model's MSE is significantly different (& lower) than the RW. ###### QUESTION 4 ###### ## ARMA(1,3) y_t = 0.56 + .46 y_(t-1) + 0.5 ε_(t-1) - 0.4 ε_(t-2)- 0.2 ε_(t-3) + ε_t # Data y_t_1 <- 1.34 e_t_1 <- 0.17 e_t_2 <- -0.95 e_t_3 <- 0.77 ## 4.a - Obtain forecasts for the series y_t for times t, t+1, and t+2 using the estimated ARMA model. y_t <- 0.56 + 0.46 * y_t_1 + 0.5 * e_t_1 - 0.4 * e_t_2 - 0.2 * e_t_3 y_t_p_1 <- 0.56 + 0.46 * y_t - 0.4 * e_t_1 - 0.2 * e_t_2 y_t_p_2 <- 0.56 + 0.46 * y_t_p_1 - 0.2 * e_t_1 ## 4.b - MSE for ARMA(1,3) #Observed Data y_actual_t <- -0.21 y_actual_t_p_1 <- -0.42 y_actual_t_p_2 <- 3.01 MSE_arma <- 1/3 *((y_actual_t- y_t)^2 + (y_actual_t_p_1 - y_t_p_1)^2 + (y_actual_t_p_2 - y_t_p_2)^2) MSE_arma ## 4.c. Use a SES model to forecast the values for the series y_t for times t, t+1, and t+2 The estimated value of the smoothing constant, α, is 0.26, with the most recently available smoothed value, St-1 = 0.307. Obtain forecasts for the series yt for times t, t+1, and t+2 using this model. Compute the (out-of-sample) MSE. alpha <- 0.26 S_t_1 <- 0.607 S_t <- S_t_1 + alpha *(y_t_1 - S_t_1) S_t_p_1 <- S_t S_t_p_2 <- S_t MSE_ses <- 1/3 *((y_actual_t- S_t_p_2)^2 + (y_actual_t_p_1 - S_t_p_2)^2 + (y_actual_t_p_2 - S_t_p_2)^2) MSE_ses ## 4.d - Select the better forecasting model. # SES is selected. It has a lower MSE. 
###### QUESTION 5 ######

## 5.a - Report ACF & PACF
plot(lr_earn)
acf(lr_earn)
pacf(lr_earn)

## 5.b - Suggest a model
# AR(4)? AR(8)? Some seasonality at lag 13, which is really strange!... The financial
# crisis (with outliers) is likely affecting the series. Maybe better to use an ARMA(p,q)?
# Try auto.arima.
library(forecast)
auto.arima(lr_earn)
fit_arma43 <- arima(lr_earn, order = c(4, 0, 3))
fit_arma43

## 5.c - Stationarity?
autoplot(fit_arma43) # plot inverse AR & MA roots
# Inverse AR roots inside the unit circle => stationary.
# Inverse MA roots on the unit circle => close to non-invertible.

## 5.d - Autocorrelated residuals?
checkresiduals(fit_arma43)
# Yes, evidence of autocorrelated residuals. Residuals are not white noise.

## 5.e - LM test for seasonality in residuals
zz <- fit_arma43$residuals
Feb1 <- rep(c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (length(zz)/12 + 1)) # create February dummy
Mar1 <- rep(c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (length(zz)/12 + 1)) # create March dummy
Apr1 <- rep(c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), (length(zz)/12 + 1)) # create April dummy
May1 <- rep(c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), (length(zz)/12 + 1)) # create May dummy
Jun1 <- rep(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), (length(zz)/12 + 1)) # create June dummy
Jul1 <- rep(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), (length(zz)/12 + 1)) # create July dummy
Aug1 <- rep(c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), (length(zz)/12 + 1)) # create August dummy
Sep1 <- rep(c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0), (length(zz)/12 + 1)) # create September dummy
Oct1 <- rep(c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), (length(zz)/12 + 1)) # create October dummy
Nov1 <- rep(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), (length(zz)/12 + 1)) # create November dummy
Dec1 <- rep(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), (length(zz)/12 + 1)) # create December dummy
Feb <- Feb1[1:T_new]
Mar <- Mar1[1:T_new]
Apr <- Apr1[1:T_new]
May <- May1[1:T_new]
Jun <- Jun1[1:T_new]
Jul <- Jul1[1:T_new]
Aug <- Aug1[1:T_new]
Sep <- Sep1[1:T_new]
Oct <- Oct1[1:T_new]
Nov <- Nov1[1:T_new]
Dec <- Dec1[1:T_new]
e_arma43 <- fit_arma43$residuals
lm_seas <- lm(e_arma43 ~ Feb + Mar + Apr + May + Jun + Jul + Aug + Sep + Oct + Nov + Dec)
R2_r <- summary(lm_seas)$r.squared # extract R^2 from lm_seas
R2_r
LM_test <- R2_r * length(e_arma43)
LM_test
qchisq(.95, df = 11) # chi-squared (df = 11) critical value at the 5% level
p_val <- 1 - pchisq(LM_test, df = 11) # p-value of LM_test
p_val
# High p-value => No evidence of seasonality in the residuals. No need to reformulate the
# model for seasonality, but the model very likely needs a dummy for the 2008 financial crisis.

###### QUESTION 6 - THEORY REVIEW ######
## 6.a - Yes. See the example above of log changes for Earnings.
## 6.b - Yes. White SEs are used when only heteroscedasticity is present. For autocorrelated errors, we use NW SEs.
## 6.c - False. Not the same; but, in practice, both estimates should be "similar."
## 6.d - True. It relies on the empirical distribution, not on a normality assumption.
## 6.e - True. Second-order stationarity only requires constancy of the first two moments, with a covariance function that depends on k, not on t.
## 6.f - False. Stationarity depends on the AR(1) coefficient: Phi_1 has to be smaller than 1 in absolute value.
##       MA processes are always stationary.
## 6.g - True. The variance depends on time and, as time grows, it grows without bound.
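# A minimal illustration (not part of the exam): simulate a stationary AR(1)
# (|phi| < 1, per 6.f) next to a random walk (phi = 1, per 6.g). The AR(1)
# mean-reverts, while the random walk's variance grows with t.
set.seed(123)
T_sim <- 500
ar1_sim <- arima.sim(model = list(ar = 0.5), n = T_sim) # stationary AR(1) with phi = 0.5
rw_sim <- cumsum(rnorm(T_sim))                          # random walk: y_t = y_(t-1) + e_t
plot.ts(cbind(ar1_sim, rw_sim), main = "Stationary AR(1) vs Random Walk")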