############# Linear Model: Data Problemd, Bootstrap SE[b] and t[b] & Testing Joint Hypothesis (Code #6) #############

SFX_da <- read.csv("http://www.bauer.uh.edu/rsusmel/4397/Stocks_FX_1973.csv",head=TRUE,sep=",")
names(SFX_da)
x_sp <- SFX_da$SP500
x_ibm <- SFX_da$IBM
x_Mkt_RF<- SFX_da$Mkt_RF
x_SMB <- SFX_da$SMB
x_HML <- SFX_da$HML
x_RF <- SFX_da$RF

T <- length(x_ibm)
lr_ibm <- log(x_ibm[-1]/x_ibm[-T])
lr_sp <- log(x_sp[-1]/x_sp[-T])
x0 <- matrix(1,T-1,1)
 Mkt_RF <- x_Mkt_RF[-1]/100
 SMB <- x_SMB[-1]/100
 HML <- x_HML[-1]/100
 RF <- x_RF[-1]/100

ibm_x <- lr_ibm - RF

# OLS results
fit_ibm_ff3 <- lm(ibm_x ~ Mkt_RF + SMB + HML)
summary(fit_ibm_ff3)


y <- ibm_x 
x <- cbind(x0, Mkt_RF, SMB, HML)
data_xy <- data.frame(y, x)		# data frame to use with head function to locate first 10 influential obs
k <- ncol(x)

# Plot residuals
x_resid <- residuals(fit_ibm_ff3)
plot(x_resid, typ ="l", col="blue", main ="IBM Residuals from 3 FF Factor Model", xlab="Date", ylab="IBM residuals")

# Plot standardized residuals
x_stand_resid <- x_resid/sd(x_resid) 	# standardized residuals
plot(x_stand_resid, typ ="l", col="blue", main ="IBM Standardized Residuals from 3 FF Factor Model", xlab="Date", ylab="IBM residuals")


cooksd <- cooks.distance(fit_ibm_ff3)

# plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance") 
# add cutoff line
abline(h = 4*mean(cooksd, na.rm=T), col="red")  # add cutoff line
# add labels
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T), names(cooksd),""), col="red")  # add labels

# influential row numbers
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])  
 # print first 10 influential observations.
head(data_xy[influential, ], n=10L)

install.packages("olsrr") 

library(olsrr)				# need to install package olsrr
sum(x_stand_resid > 2)			# Rule of thumb count (5% count is OK)
x_lev <- ols_leverage(fit_ibm_ff3)		# leverage residuals
sum(x_lev > (2*k+2)/T) 			# Rule of thumb count (5% count is OK)
sum(cooksd > 4/T)			# Rule of thumb count (5% count is OK)


sum(x_stand_resid > 2)/T		# proportion of "outliers"
sum(x_lev > (2*k+2)/T)/T		# proportion of "outliers"	 
sum(cooksd > 4/T)/T			# proportion of "outliers"

ols_plot_resid_stand(fit_ibm_ff3)		# Plot Standardize residuals

ols_plot_cooksd_bar(fit_ibm_ff3) 		# Plot Cook’s D measure

ols_plot_dfbetas(fit_ibm_ff3)		# Plot Difference in betas

ols_vif_tol(fit_ibm_ff3)

ols_eigen_cindex(fit_ibm_ff3)


#### Bootstrap

library(boot)

## Bootstrap Correlation coefficients B = 1000
sim_size = 1000

dat_spibm <- data.frame(lr_sp, lr_ibm)

# function to obtain the correlation coefficient from the data
cor_xy <- function(data, i) {
	d <-data[i,]
  	return(cor(d$lr_sp,d$lr_ibm))
}
# bootstrapping with sim_size replications
boot.samps <- boot(data=dat_spibm, statistic=cor_xy,   R=sim_size)

# view stored bootstrap samples and compute mean
boot.samps				# Print original ρ, bias and SE of bootstraps
mean(boot.samps$t)			# our estimate of the correlation
sd(boot.samps$t)			# SD of the correlation estimate

# Elegant histogram
 hist(boot.samps$t,main="Histogram for Bootstrapped Correlations", 
     xlab="Correlations", breaks=20)

boot.ci(boot.samps, type="basic")


## Bootstrap Correlation coefficients B = 100
sim_size = 100
boot.samps <- boot(data=dat_spibm, statistic=cor_xy,   R=sim_size)

# view stored bootstrap samples and compute mean
boot.samps				# Print original ρ, bias and SE of bootstraps
mean(boot.samps$t)			# our estimate of the correlation
sd(boot.samps$t)			# SD of the correlation estimate

boot.ci(boot.samps, type="basic")

## Bootstrap Correlation coefficients B = 25
sim_size = 25
boot.samps <- boot(data=dat_spibm, statistic=cor_xy,   R=sim_size)

# view stored bootstrap samples and compute mean
boot.samps				# Print original ρ, bias and SE of bootstraps
mean(boot.samps$t)			# our estimate of the correlation
sd(boot.samps$t)			# SD of the correlation estimate

boot.ci(boot.samps, type="basic")


## Bootstrap SE for b in a Linear Model
install.packages("lmboot")
library(lmboot)				# need to run before install.packages(“lmboot”)

y <- ibm_x 
x <- cbind(x0, Mkt_RF, SMB, HML)
dat_yx <- data.frame(y, x)		# lmboot needs an R data frame. We make one.

t_stat <- summary(fit_ibm_ff3)$coef[,3]	#extracting t-values from lm (3rd row in table)

sim_size = 1000
ff3_b <- paired.boot(y ~ x-1, data=dat_yx, B=sim_size)

ff3_b$origEstParam			# print OLS results (“original estimates”)

ff3_b$bootEstParam[1:10,]		# print the first 10 of B=1,000 bootstrap samples

## Mean values for b
mean(ff3_b$bootEstParam[,1])		# print mean of bootstrap samples for constant
mean(ff3_b$bootEstParam[,2]) 		# print mean of bootstrap samples for Mkt_RF
mean(ff3_b$bootEstParam[,3])
mean(ff3_b$bootEstParam[,4])

# Statistics for sampling distribution of b
summary(ff3_b$bootEstParam)		# Summary of bootstraps (mean, median, max, min, etc)

hist(ff3_b$bootEstParam[,2],main="Histogram for Bootstrapped Market Beta", 
     xlab="Correlations", breaks=20)


## bootstrap bias
ff3_b$origEstParam[1] - mean(ff3_b$bootEstParam[,1])
ff3_b$origEstParam[2] - mean(ff3_b$bootEstParam[,2])
ff3_b$origEstParam[3] - mean(ff3_b$bootEstParam[,3])
ff3_b$origEstParam[4] - mean(ff3_b$bootEstParam[,4])


## SD for each parameter
sd1 <- sd(ff3_b$bootEstParam[,1])
sd2 <- sd(ff3_b$bootEstParam[,2])
sd3 <- sd(ff3_b$bootEstParam[,3])
sd4 <- sd(ff3_b$bootEstParam[,4])


## t-values for each parameter
t1 <- ff3_b$bootEstParam[,1]/sd1
t2 <- ff3_b$bootEstParam[,2]/sd2
t3 <- ff3_b$bootEstParam[,3]/sd3
t4 <- ff3_b$bootEstParam[,4]/sd4
  
## mean t-values
mean(t1) 
mean(t2) 
mean(t3) 
mean(t4)
  
## bias in t-values for each parameter
mean(t1) - t_stat[1]
mean(t2) - t_stat[2]
mean(t3) - t_stat[3]
mean(t4) - t_stat[4]

#####  Testing Linear Hypothesis in the Linear Model

###  Testing the Expectations Hypothesis (EH)
SF_da <- read.csv("http://www.bauer.uh.edu/rsusmel/4397/SpFor_prices.csv", head=TRUE, sep=",")
summary(SF_da)

x_date <- SF_da$Date
x_S <- SF_da$GBPSP
x_F3m <- SF_da$GBP3M
i_us3 <- SF_da$Dep_USD3M
i_uk3 <- SF_da$Dep_UKP3M
T <- length(x_S)

prem <- (x_S[-1] - x_F3m[-T])/x_S[-1]		# forward premium
int_dif <- (i_us3 - i_uk3)/100			# interest rate differential

y <- prem
x <- int_dif[-T]
fit_eh <- lm( y ~ x)				# Regression to test EH
summary(fit_eh)					# Check t-values



###  Testing Linear Hypothesis in the Fama-French 3 factor Linear Model
x <- cbind(x0, Mkt_RF, SMB, HML)
k <- ncol(x)

# OLS results
summary(fit_ibm_ff3)

J <- 2						# number of restriction
R <- matrix(c(0,0,0,0,1,0,0,1), nrow=2)		# matrix of restrictions
q <- c(.2, .6)					# hypothesized values

b <- fit_ibm_ff3$coefficients			# Extract OLS coefficients
Var_b <- vcov(fit_ibm_ff3)			# Extract Var[b]
m <- R%*%b - q					# m = Estimated R*Beta - q
Var_m <- R %*% Var_b %*% t(R)			# Variance of m
det(Var_m)					# check for non-singularity

W <- t(m)%*%solve(Var_m)%*%m			# W = m' Var[m] m
F_t <- as.numeric(W/J)				# F-test statistic
F_t
F_t_asym <- J*F_t				# Chi-square-test statistic (asymptotic)
F_t_asym


qf(.95, df1=J, df2=(T - k)) 			# exact distribution (F-dist) if errors normal (A5)
p_val <- 1 - pf(F_t, df1=J, df2=(T - k)) 	# p-value(F_t) under (A5)
p_val

qchisq(.95, df=J)
p_val <- 1 - pchisq(F_t_asym, df=J)		# p-value(F_t) under asymptotic distrib.
p_val
	

## Test Linear Restrictions using R package "car"
install.packages("car")
library(car)

linearHypothesis(fit_ibm_ff3,c("SMB = 0.2","HML = 0.6"),test="F")	# Exact F test
linearHypothesis(fit_ibm_ff3,c("SMB = 0.2","HML = 0.6"),test="Chisq")	# Asymptotic F-test

linearHypothesis(fit_ibm_ff3,c("SMB = HML"),test="F")	# testing beta_SMB = beta_HMB

linearHypothesis(fit_ibm_ff3,c("SMB + HML = 1"),test="F")	# testing beta_SMB + beta_HMB = 1


###  Testing the EH Model
linearHypothesis(fit_eh,c("(Intercept) = 0","x = 0"),test="F")
linearHypothesis(fit_eh,c("(Intercept) = 0","x = 0"),test="Chisq")


###  Goodness of Fit Model
y <- ibm_x
T <- length(y)
x0 <- matrix(1,T,1)
x <- cbind(x0,Mkt_RF, SMB, HML)
k <- ncol(x)
b <- fit_ibm_ff3$coefficients			# extracting OLS regression
e <- y - x%*%b					# we could have used e <- fit_ibm_ff3$residuals
RSS <- as.numeric(t(e)%*%e)
R2 <- 1 - as.numeric(RSS)/as.numeric(t(y)%*%y)	#R-squared
R2 
summary(fit_ibm_ff3)$r.squared			# extracting R-squared from fit-capm

F_goodfit <- (R2/(k-1))/((1-R2)/(T-k))		#F-test of goodness of fit.
F_goodfit



###  Testing with Unrestricted and Restricted Estimations
y <- ibm_x
x0 <- matrix(1,T,1)
x_r <- cbind(x0,Mkt_RF)				# Restricted X vector
k_r <- ncol(x_r)

fit_ibm_ff3_R <- lm(y ~ Mkt_RF)			# CAPM estimation ("Restricted FF3 Model)
summary(fit_ibm_ff3_R)
eR <- fit_ibm_ff3_R$residuals			# extracting Restricted residuals

RSSR <- as.numeric(t(eR)%*%eR)
J <- k - k_r					# J = degrees of freedom of numerator 
F_test <- ((RSSR - RSS)/J)/(RSS/(T-k))
F_test

qf(.95, df1=J, df2=(T - k)) 			# exact distribution (F-dist) if e normal
p_val <- 1 - pf(F_test, df1=J, df2=(T - k)) 	# p-value(F_t) under e normal
p_val


###  Testing with package lmtest
install.packages("lmtest")
library(lmtest)
fit_wU <- lm (y ~ Mkt_RF + SMB + HML)
fit_wR <- lm (y ~ Mkt_RF)
waldtest(fit_wU, fit_wR)