1. Algorithm Design
# helper 1
h_get_BIC_value = function(input_diff_data,
input_max_ar,
input_max_ma,
input_BIC_values_position) {
# Get all BIC values
BIC_values = input_diff_data %>%
armasubsets(input_max_ar, input_max_ma) %>%
summary() %>%
extract2(input_BIC_values_position)
# Find out the smallest BIC value
BIC_value = which(BIC_values == min(BIC_values))
return(BIC_value)
}
# helper 2
h_get_AR_or_MA_shaded_box_logic_values = function(input_diff_data,
input_max_ar,
input_max_ma,
input_AR_MA_position,
input_BIC_value,
input_AR_or_MA_start,
input_AR_or_MA_end) {
AR_or_MA_shaded_box_logic_values = input_diff_data %>%
armasubsets(input_max_ar, input_max_ma) %>%
summary() %>%
extract2(input_AR_MA_position) %>%
extract(input_BIC_value, input_AR_or_MA_start : input_AR_or_MA_end) %>%
unname()
return(AR_or_MA_shaded_box_logic_values)
}
# helper 3
h_get_model_AR_or_MA_order = function(input_model_AR_or_MA_position) {
if (input_model_AR_or_MA_position %>% length() %>% equals(0)) {
model_AR_or_MA_order = 0
}
else {
model_AR_or_MA_order = input_model_AR_or_MA_position %>% max()
}
return(model_AR_or_MA_order)
}
# Build ARIMA model by given data
build_arima_model = function(input_data){
### Pick 6 as the maximum AR and MA value
max_ar = 6 # user adjustable
max_ma = 6 # user adjustable
### diff_1 is normal difference of input data, diff_2 is seasonal difference of diff_1
diff_1 = input_data %>% diff()
diff_2 = diff_1 %>% diff(lag = 12)
### Let model_1 be the ARIMA model based on diff_1 and model_2 be the ARIMA model based on diff_2.
### ARMA subset summary does not give in descending order of BIC, so determination of the row with smallest BIC value is necessary. Fortunately, the 6th section of the summary is BIC value.
BIC_values_position = 6
### AR and MA positions
ar_start = 2
ar_end = ar_start + max_ar - 1
ma_start = ar_end + 1
ma_end = ma_start + max_ma - 1
### The 1st section of the summary is AR and MA part. TRUE indicates shaded box from the plot.
AR_MA_position = 1
### Model 1
model_1_BIC_value = h_get_BIC_value(diff_1,
BIC_values_position,
max_ar, max_ma)
# Get shaded box logic value for AR part of model_1
model_1_ar = h_get_AR_or_MA_shaded_box_logic_values(diff_1,
max_ar, max_ma,
AR_MA_position,
model_1_BIC_value,
ar_start, ar_end)
# Get shaded box logic value for MA part of model_1
model_1_ma = h_get_AR_or_MA_shaded_box_logic_values(diff_1,
max_ar, max_ma,
AR_MA_position,
model_1_BIC_value,
ma_start, ma_end)
# Now, shaded boxes for model_1 is ready. Need to determine order and fixed position of AR and MA.
model_1_ar_position = model_1_ar %>% equals(TRUE) %>% which()
model_1_ar_order = h_get_model_AR_or_MA_order(model_1_ar_position)
model_1_ma_position = model_1_ma %>% equals(TRUE) %>% which()
model_1_ma_order = h_get_model_AR_or_MA_order(model_1_ma_position)
model_1_fixed = 0 %>% rep(model_1_ar_order + model_1_ma_order)
model_1_fixed[c(model_1_ar_position, model_1_ar_order + model_1_ma_position)] = NA
# Set arima model
model_1 = arima(input_data,
order = c(model_1_ar_order, 1, model_1_ma_order),
fixed = model_1_fixed,
transform.pars = FALSE)
### Model 2
model_2_BIC_value = h_get_BIC_value(diff_2,
BIC_values_position,
max_ar, max_ma)
model_2_ar = h_get_AR_or_MA_shaded_box_logic_values(diff_2,
max_ar, max_ma,
AR_MA_position,
model_2_BIC_value,
ar_start, ar_end)
model_2_ma = h_get_AR_or_MA_shaded_box_logic_values(diff_2,
max_ar, max_ma,
AR_MA_position,
model_2_BIC_value,
ma_start, ma_end)
model_2_ar_position = model_2_ar %>% equals(TRUE) %>% which()
model_2_ar_order = h_get_model_AR_or_MA_order(model_2_ar_position)
model_2_ma_position = model_2_ma %>% equals(TRUE) %>% which()
model_2_ma_order = h_get_model_AR_or_MA_order(model_2_ma_position)
model_2_fixed = 0 %>% rep(model_2_ar_order + model_2_ma_order)
model_2_fixed[c(model_2_ar_position, model_2_ar_order + model_2_ma_position)] = NA
model_2 = arima(input_data,
order = c(model_2_ar_order, 2, model_2_ma_order),
fixed = model_2_fixed,
transform.pars = FALSE)
### Compare BIC of both model_1 and model_2, and let the one with smaller BIC value be the optimize simulation model
if (model_1 %>% BIC() %>% is_weakly_less_than(model_2 %>% BIC())) {
arima_model = model_1
}
else {
arima_model = model_2
}
return(arima_model)
}
# Detect outlier points and plot them out on the graph
detect_outliers = function(input_data){
arima_model = build_arima_model(input_data)
fitted_data = fitted(arima_model)
upper = fitted_data + 2.5 * sqrt(arima_model %>% use_series(sigma2))
lower = fitted_data - 2.5 * sqrt(arima_model %>% use_series(sigma2))
outliers = input_data %>% is_less_than(lower) %>% or(input_data %>% is_greater_than(upper))
# Plot graph
input_data %>%
plot(main = "Outlier Detection",
ylab = "Oil Price",
type = "n",
ylim = range(lower, upper))
polygon(c(input_data %>% time(), input_data %>% time() %>% rev()),
c(upper, lower %>% rev()),
col = rgb(0, 0, 0.6, 0.2),
border = FALSE)
input_data %>%
lines()
points(x = input_data %>% time() %>% extract(outliers),
y = input_data %>% extract(outliers),
pch = 19)
# Print position of outliers
paste0("Outliers at: ",
outliers %>%
equals(TRUE) %>%
which() %>%
paste(collapse = ", ")) %>%
cat()
}