med.home.vec<-c(17200,17700,18200,18200,17500,18000,18400,17800,17900,17600,18400,18700,
17800,18000,19000,18800,19300,18800,19100,18900,18900,18900,19300,21000,
20700,20400,19800,19900,19600,19800,21000,20200,19600,19900,20600,20300,
21200,20900,20800,23000,22300,21200,21800,20700,22200,20800,21700,21700,
22200,22400,22400,22300,23700,23900,23300,21700,22800,22300,23100,22200,
23400,23500,24600,24500,25300,25400,24600,25000,25000,25600,24600,26500,
24600,26000,26200,25900,26900,25200,26800,24900,26000,24400,24900,25100,
23600,24100,24100,24200,25700,23800,22900,23500,22600,22100,23500,22300,
23900,24500,24300,25800,25500,26100,25200,25300,25400,25600,25700,25300,
24700,26500,27400,26700,27000,26800,27700,28100,28000,28900,29100,29700,
29900,29700,31400,32800,32000,33100,34200,33200,33200,33300,34000,35700,
34200,34900,36000,35700,35700,35100,36800,35700,36200,37200,37300,37400,
37200,37900,38800,39200,39500,37900,38600,38200,39700,40700,41100,42100,
41600,42700,43600,43300,43600,46100,44600,44200,44700,45300,45800,45900,
45500,47400,46200,48700,49300,48700,48600,49000,48500,51400,51800,52700,
52300,53200,53200,53300,55700,56700,54800,56100,57300,58300,58700,59900,
60300,61200,60400,62600,62800,64200,63800,64000,66000,62300,63900,61500,
62900,64900,62600,63100,63100,64900,64000,63400,68300,65600,66700,67000,
67600,66200,66700,68700,70800,68800,69500,72600,65800,69600,71200,68400,
66200,65700,67200,70200,69300,69300,70900,70100,67800,69700,73500,71700,
73500,73800,72500,74700,74500,75800,75200,76800,81000,75900,75900,75900,
76200,79200,78400,79600,81400,80500,80700,82000,81300,80100,82500,78300,
82500,82000,84200,85600,80100,86300,82100,83300,84700,85400,87200,87900,
86600,89700,88700,92500,92100,91200,94100,91500,95000,96400,94000,95000,
98500,95200,98400,96500,104900,109000,105000,106800,106500,106500,117000,111800,
119000,110900,108900,111000,110000,111500,118000,110000,116600,112900,110400,121000,
113000,118000,123000,116700,119000,122800,116000,122900,120000,123000,125000,125200,
125000,126900,119400,130000,125000,125000,118700,118400,113000,120000,118900,127000,
117900,119900,122500,121000,116000,119000,120000,120800,120000,122600,118500,122000,
120000,117200,120000,120000,113000,124500,118000,123500,119500,125000,128900,126000,
118000,129400,125000,127000,129900,124500,123900,126600,129400,125000,130000,125000,
126000,129900,132300,129000,129900,133500,124400,133300,129700,132000,129900,135000,
127900,135000,130000,134000,133900,133700,131000,134900,130000,135200,137000,138600,
131900,139400,137000,140000,136500,140000,144200,137000,139000,143800,143500,144900,
145000,143000,148000,150000,141000,145000,145900,144000,146300,141500,145000,145900,
148000,156000,152700,148000,153200,148000,149900,154900,155000,154500,151000,152500,
153100,160000,157300,159900,154400,159300,158200,154800,163200,161100,172000,164800,
163500,162400,165100,162600,164700,160100,169000,166600,171500,176300,174700,162000,
171300,169100,166300,175200,175300,179400,175000,173700,166400,171300,168100,180200,
187100,191100,183400,187100,181000,190600,175600,178900,177500,189200,181200,197600,
181700,187000,185100,189500,195500,187900,190200,190500,192000,194100,207100,196000,
209500,219600,209600,222300,211700,215700,212400,218100,211600,229200,224500,229600,
223100,237300,229300,236300,228300,226100,229200,240100,240400,243900,237900,238600,
244900,250800,238800,257000,238200,243200,238100,243900,226700,250400,240100,244700,
254400,250800,262600,242500,245000,235500,246200,236500,240300,234300,249100,227700,
232400,245300,229300,246400,229300,234300,237300,221000,225200,213200,221600,229600,
208600,209700,205100,219200,222300,214700,214200,207100,216600,215100,218800,222600,
218200,221900,224800,208300,230500,219500,212100,226600,228000,204200,219600,241200,
240100,220100,220500,224700,222000,240200,229900,219600,217000,224800,214300,218600,
221700,239900,239800,236400,239200,232600,237400,253200,254600,247200,245000,258300,
251500,265100,257500,279300,263700,259800,262200,255300,269800,264300,277100,275500,
269800,268400,282300,274500,285600,287000,280400,291700,261500,297000,298300,301500,
292000,286600,286600,294500,287500,285100,292300,293000,299500,298000,312600,297100,
288400,305800,303200,318300,295200,311200,297400,298900,314800,302800,315000,327000,
315200,298000,321700,311100,323600,315200,322900,314200,331500,319500,343400,343300,
329600,327200,335400,314400,316700,310500,327500,321400,328300,328300,308500,329700,
305400,320800,310600,339000,312700,311800,307600,325200,299400)
med.home.ts<-ts(med.home.vec,start=c(1963,1),frequency=12)
tsplot(med.home.ts)
This data looks clearly nonstationary with non-constant mean and variance. It recorded a long prediod of time so it is hard to see trend of the data. We take a look at some short periods of time as following:
par(mfrow=c(2,2))
plot(ts(med.home.vec[1:120],start=c(1963,1),frequency=12), main='Jan1963 - Jan1973',ylab=NULL,xlab=NULL)
plot(ts(med.home.vec[121:240],start=c(1973,1),frequency=12), main='Jan1973 - Jan1983',ylab=NULL,xlab=NULL)
plot(ts(med.home.vec[445:564],start=c(2000,1),frequency=12), main='Jan2000 - Jan2010',ylab=NULL,xlab=NULL)
plot(ts(med.home.vec[612:672],start=c(2014,1),frequency=12), main='Jan2014 - Jan2019',ylab=NULL,xlab=NULL)
Even when we look at shorter prediods of time, there is no clear trend for this data, we can only say that this time series is nonstationary. This is confirmed by Augmented Dickey-Fuller Test with p-value is 0.58. We now analyze residuals in the linear, quadratic, and seasonal time models.
df <- data.frame(residuals=c(residuals(lm(med.home.ts ~ time(med.home.ts))),
residuals(lm(med.home.ts ~ time(med.home.ts) + I(time(med.home.ts)^2))),
residuals(lm(med.home.ts ~ season(med.home.ts)-1))),
model=c(rep('Linear time',681),
rep('Quadratic time',681),
rep('Seasonal time',681)))
ggplot(df, aes(x=rep(time(med.home.ts),3), y=residuals, color=model)) +
geom_line() +
facet_grid(model ~ .) +
theme_bw() +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(),
axis.title.x = element_blank(),legend.position = "none") +
scale_x_continuous(breaks=c(1963,1980,1990,2000,2010,2019)) +
scale_color_brewer(palette = 'Dark2')
All three models do not have stationary residuals. Another way to detrend the time series is to take differencing. In the following plots, the first differencing series has approximately constant mean but increasing variance, making it be a nonstationary time series. The log-transformed data also is nonstationary with the same reason.
par(mfrow=c(1,2))
plot(diff(med.home.ts), type='l',main='The first differencing',ylab='')
plot(log(med.home.ts), type='l',main='Log-transformation',ylab='')
We then analyze the first differencing of log-transformation, this series is stationary with constant mean, variance and p-value of Augmented Dickey-Fuller Test is 0.01.
ts_2 <- diff(log(med.home.ts))
ggtsdisplay(diff(log(med.home.ts)))
eacf(ts_2)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x o o o o x x o o o x x o
## 1 x x o o o o o x o o o o x o
## 2 x x o o o o o o o o o o x o
## 3 x o o x o o o o o o o o o x
## 4 x o x x x o o o o o o o o o
## 5 x o x x o o o x o o o o o o
## 6 x o o x x o o o o o o o o o
## 7 x x x x o o o o o o o o o o
Both ACF and PACF do not have a clear cut-off, and the EACF plot does not provide which orders we should pick, so we need to create a function to search for the lowest AIC for ARIMA(p,1,q) with p is from 0 to 7 and q from 0 to 13. Here is the result:
aic <- matrix(rep(NA,112),ncol=14)
for (i in 0:7) {
for (j in 0:13) {
aic[i+1,j+1] <- arima(ts_2, order=c(i,0,j))$aic
}
}
colnames(aic) <- seq(0,13)
rownames(aic) <- seq(0,7)
round(aic,0) %>%
kbl() %>%
kable_styling(font_size = 10) %>%
row_spec(1)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -2501 | -2733 | -2739 | -2740 | -2739 | -2740 | -2740 | -2738 | -2744 | -2742 | -2743 | -2743 | -2741 | -2748 |
1 | -2710 | -2740 | -2740 | -2738 | -2751 | -2749 | -2740 | -2738 | -2743 | -2741 | -2745 | -2742 | -2740 | -2747 |
2 | -2730 | -2740 | -2738 | -2738 | -2736 | -2747 | -2746 | -2745 | -2745 | -2744 | -2747 | -2745 | -2743 | -2746 |
3 | -2735 | -2738 | -2750 | -2737 | -2736 | -2746 | -2758 | -2754 | -2753 | -2751 | -2749 | -2747 | -2745 | -2744 |
4 | -2738 | -2738 | -2738 | -2736 | -2745 | -2744 | -2754 | -2752 | -2751 | -2749 | -2747 | -2745 | -2752 | -2746 |
5 | -2740 | -2741 | -2744 | -2746 | -2753 | -2754 | -2752 | -2751 | -2751 | -2750 | -2749 | -2747 | -2745 | -2744 |
6 | -2740 | -2739 | -2744 | -2742 | -2754 | -2752 | -2751 | -2749 | -2750 | -2748 | -2747 | -2745 | -2743 | -2752 |
7 | -2739 | -2736 | -2742 | -2745 | -2752 | -2750 | -2750 | -2749 | -2747 | -2745 | -2744 | -2742 | -2740 | -2747 |
pos <- which(aic == min(aic), arr.ind = TRUE)
As we can see, the lowest AIC is -2758 which is the order 3 of AR and order 6 of MA. Here are estimated coefficients :
model_2 <- arima(ts_2, order=c(3,0,6))
coef(model_2)
## ar1 ar2 ar3 ma1 ma2 ma3
## -0.800240369 0.661133478 0.852184904 0.161062693 -1.055581245 -0.434088506
## ma4 ma5 ma6 intercept
## 0.420262780 -0.032210441 0.149973107 0.004150235
sarima(log(med.home.ts), 3,1,6)
The residuals vs. time plot, and Q-Q plot look good with random and approximately normal residuals. ACF plot is a kind of white noise since all lags are inside the two restricted lines. The Ljung-Box test for lag 1 of residuals has the p-value of 0.9976, indicating the residuals are independent.
The ARIMA(3,1,6) model is: \[\begin{align*} W_t &= 0.0042 + -0.8 \times W_{t-1} + 0.66 \times W_{t-2} + 0.85 \times W_{t-3} + e_t + \\ & 0.16 \times e_{t-1} -1.05 \times e_{t-2} -0.43 \times e_{t-3} + 0.42 \times e_{t-4} - 0.03 \times e_{t-5} + 0.15 \times e_{t-6} \end{align*}\]
where \(W_t = log{Y_t} - log{Y_{t-1}}\)
autoplot(forecast(Arima(log(med.home.ts), order=c(3,1,6),include.mean = FALSE, include.drift = FALSE, include.constant =TRUE),h=100),main=NULL)
exp(sarima.for(log(med.home.ts), 12, 3, 1, 6)$pred)
Using predict() function, we have the predicted values of the next 6 months: October 2019, November 2019, December 2019, January 2020, February 2020, and March 2020 are:
round(exp(sarima.for(log(med.home.ts), 6, 3, 1, 6)$pred),0)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2019 315812 310315 313659
## 2020 313573 313541 314237
The 95% confident interval for prediction at time \(t + l\) are \(\hat{Y_t}(l) \pm Z_{0.25} \times se_t(l)\) or
\(\hat{Y_t}(l) \pm 1.96 \times se_t(l)\), with standard errors for 6 predictions are:
round(exp(sarima.for(log(med.home.ts), 6, 3, 1, 6)$se),3)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2019 1.032 1.034 1.037
## 2020 1.039 1.041 1.043
LB <- exp(sarima.for(log(med.home.ts), 6, 3, 1, 6)$pred) -
1.96 * exp(sarima.for(log(med.home.ts), 6, 3, 1, 6)$se)
UB <- exp(sarima.for(log(med.home.ts), 6, 3, 1, 6)$pred) +
1.96 * exp(sarima.for(log(med.home.ts), 6, 3, 1, 6)$se)
Therefore, the 95% confident intervals for predictions of the next 6 months are [315809.8, 315813.8], [310313.3, 310317.4], [313657.3, 313661.3], [313570.9, 313574.9], [313538.6, 313542.7], and [314234.6, 314238.7]
The median home price data has 681 time points, hence let the median home price in October 2019 be \(Y_{682}\). The approximate probability that the median home price in October 2019 will be more than $300,000 is:
\[\begin{align*} P(Y_{682} > 300000) &= 1 - P(log{Y_{682}} \le log{300000}) \\ &= 1 - P(\frac{log{Y_{682}} - log{\hat{Y}_{682}}}{\sigma_{682}} \le \frac{log{300000} - log{\hat{Y}_{682}}}{\sigma_{682}}) \\ &= 1 - P(Z \le \frac{log{300000} - 12.6629}{0.0313}) \\ &= 1 - P(Z \le -1.6392) \\ &\approx 1 - 0.05050 \\ &=0.9495 \text{ or } 94.95\% \end{align*}\]